diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-04 20:30:07 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-04 20:30:07 +0200 |
commit | 39116e2604cc4059828ae0bca694c38a52fef810 (patch) | |
tree | e8eee7f209cbe5a9b2c4cc60223ccf494d6e8221 /src | |
parent | 79abc6b0891223fa1c9c0f57769cd58e562f22f1 (diff) | |
download | crawler-39116e2604cc4059828ae0bca694c38a52fef810.tar.gz crawler-39116e2604cc4059828ae0bca694c38a52fef810.tar.bz2 |
rearranged google test1 and added a GoogleURLNormalizer
Diffstat (limited to 'src')
-rw-r--r-- | src/GNUmakefile | 10 | ||||
-rw-r--r-- | src/GoogleURLNormalizer.cpp | 72 | ||||
-rw-r--r-- | src/GoogleURLNormalizer.hpp | 26 |
3 files changed, 105 insertions, 3 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile index 4abdd22..eaf57c8 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -10,11 +10,14 @@ INCLUDE_LDFLAGS = \ INCLUDE_DIRS = \ -I$(TOPDIR)/libfetch \ - -I$(TOPDIR)/streamhtmlparser + -I$(TOPDIR)/streamhtmlparser \ + -I$(TOPDIR)/googleurl INCLUDE_LIBS = \ $(TOPDIR)/libfetch/libfetch.a \ - $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a + $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a \ + $(TOPDIR)/googleurl/libgoogleurl.a \ + -licui18n -licuuc # openssl ifeq ($(WITH_SSL),1) @@ -36,7 +39,8 @@ LOCAL_STATIC_LIB_OBJS = \ HostURLFilter.o \ ChainURLFilter.o \ MemoryURLSeen.o \ - SimpleURLNormalizer.o + SimpleURLNormalizer.o \ + GoogleURLNormalizer.o CPP_OBJS = \ $(LOCAL_STATIC_LIB_OBJS) diff --git a/src/GoogleURLNormalizer.cpp b/src/GoogleURLNormalizer.cpp new file mode 100644 index 0000000..c49a831 --- /dev/null +++ b/src/GoogleURLNormalizer.cpp @@ -0,0 +1,72 @@ +#include "GoogleURLNormalizer.hpp" + +#include <string> + +#include "url_util.h" +#include "url_canon_stdstring.h" +#include "url_parse.h" + +using namespace std; +using namespace url_util; +using namespace url_canon; +using namespace url_parse; + +GoogleURLNormalizer::GoogleURLNormalizer( ) +{ + Initialize( ); +} + +GoogleURLNormalizer::~GoogleURLNormalizer( ) +{ + Shutdown( ); +} + +string GoogleURLNormalizer::componentString( const string &s, const Component &comp ) const +{ + if( comp.len <= 0 ) { + return string( ); + } else { + return string( s, comp.begin, comp.len ); + } +} + +URL GoogleURLNormalizer::parseUrl( const string s ) +{ + if( s.empty( ) ) { + return URL::Null; + } + + string canonical; + canonical.reserve( s.size( ) + 32 ); + StdStringCanonOutput output( &canonical ); + Parsed parsed; + bool success = Canonicalize( + s.data( ), static_cast<int>( s.length( ) ), + NULL, &output, &parsed ); + if( !success ) { + return URL::Null; + } + output.Complete( ); + + unsigned short port; + if( parsed.port.len >= 0 ) { + port = (unsigned short)atoi( + 
componentString( canonical, parsed.port ).c_str( ) ); + } else { + port = URL::defaultPort( + componentString( canonical, parsed.scheme ) ); + } + + return URL( componentString( canonical, parsed.scheme ), + componentString( canonical, parsed.host ), + port, + componentString( canonical, parsed.path ), + "", "" ); +} + +URL GoogleURLNormalizer::normalize( const URL url, const string s ) +{ + (void)url; + (void)s; + return URL::Null; +} diff --git a/src/GoogleURLNormalizer.hpp b/src/GoogleURLNormalizer.hpp new file mode 100644 index 0000000..d630d5f --- /dev/null +++ b/src/GoogleURLNormalizer.hpp @@ -0,0 +1,26 @@ +#ifndef __GOOGLEURLNORMALIZER_H +#define __GOOGLEURLNORMALIZER_H + +#include "URLNormalizer.hpp" + +//TODO: will fix later, bad include here! +#include "url_parse.h" + +class GoogleURLNormalizer : public URLNormalizer { + public: + GoogleURLNormalizer( ); + + virtual ~GoogleURLNormalizer( ); + + virtual URL parseUrl( const std::string s ); + + virtual URL normalize( const URL url, const std::string s ); + + private: + //TODO: hide implementation details here (PIMPL) or don't + //allocate Normalizers, use a factory method (as this is + //anyway better for loadable module support!) + std::string componentString( const std::string &s, const url_parse::Component &comp ) const; +}; + +#endif |