diff options
-rw-r--r-- | src/GNUmakefile | 10 | ||||
-rw-r--r-- | src/GoogleURLNormalizer.cpp | 72 | ||||
-rw-r--r-- | src/GoogleURLNormalizer.hpp | 26 | ||||
-rw-r--r-- | tests/googleurl/GNUmakefile | 3 | ||||
-rw-r--r-- | tests/googleurl/test1.cpp | 38 |
5 files changed, 120 insertions, 29 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile index 4abdd22..eaf57c8 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -10,11 +10,14 @@ INCLUDE_LDFLAGS = \ INCLUDE_DIRS = \ -I$(TOPDIR)/libfetch \ - -I$(TOPDIR)/streamhtmlparser + -I$(TOPDIR)/streamhtmlparser \ + -I$(TOPDIR)/googleurl INCLUDE_LIBS = \ $(TOPDIR)/libfetch/libfetch.a \ - $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a + $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a \ + $(TOPDIR)/googleurl/libgoogleurl.a \ + -licui18n -licuuc # openssl ifeq ($(WITH_SSL),1) @@ -36,7 +39,8 @@ LOCAL_STATIC_LIB_OBJS = \ HostURLFilter.o \ ChainURLFilter.o \ MemoryURLSeen.o \ - SimpleURLNormalizer.o + SimpleURLNormalizer.o \ + GoogleURLNormalizer.o CPP_OBJS = \ $(LOCAL_STATIC_LIB_OBJS) diff --git a/src/GoogleURLNormalizer.cpp b/src/GoogleURLNormalizer.cpp new file mode 100644 index 0000000..c49a831 --- /dev/null +++ b/src/GoogleURLNormalizer.cpp @@ -0,0 +1,72 @@ +#include "GoogleURLNormalizer.hpp" + +#include <string> + +#include "url_util.h" +#include "url_canon_stdstring.h" +#include "url_parse.h" + +using namespace std; +using namespace url_util; +using namespace url_canon; +using namespace url_parse; + +GoogleURLNormalizer::GoogleURLNormalizer( ) +{ + Initialize( ); +} + +GoogleURLNormalizer::~GoogleURLNormalizer( ) +{ + Shutdown( ); +} + +string GoogleURLNormalizer::componentString( const string &s, const Component &comp ) const +{ + if( comp.len <= 0 ) { + return string( ); + } else { + return string( s, comp.begin, comp.len ); + } +} + +URL GoogleURLNormalizer::parseUrl( const string s ) +{ + if( s.empty( ) ) { + return URL::Null; + } + + string canonical; + canonical.reserve( s.size( ) + 32 ); + StdStringCanonOutput output( &canonical ); + Parsed parsed; + bool success = Canonicalize( + s.data( ), static_cast<int>( s.length( ) ), + NULL, &output, &parsed ); + if( !success ) { + return URL::Null; + } + output.Complete( ); + + unsigned short port; + if( parsed.port.len >= 0 ) { + port = (unsigned short)atoi( + componentString( canonical, parsed.port ).c_str( ) ); + } else { + port = URL::defaultPort( + componentString( canonical, parsed.scheme ) ); + } + + return URL( componentString( canonical, parsed.scheme ), + componentString( canonical, parsed.host ), + port, + componentString( canonical, parsed.path ), + "", "" ); +} + +URL GoogleURLNormalizer::normalize( const URL url, const string s ) +{ + (void)url; + (void)s; + return URL::Null; +} diff --git a/src/GoogleURLNormalizer.hpp b/src/GoogleURLNormalizer.hpp new file mode 100644 index 0000000..d630d5f --- /dev/null +++ b/src/GoogleURLNormalizer.hpp @@ -0,0 +1,26 @@ +#ifndef __GOOGLEURLNORMALIZER_H +#define __GOOGLEURLNORMALIZER_H + +#include "URLNormalizer.hpp" + +//TODO: will fix later, bad include here! +#include "url_parse.h" + +class GoogleURLNormalizer : public URLNormalizer { + public: + GoogleURLNormalizer( ); + + virtual ~GoogleURLNormalizer( ); + + virtual URL parseUrl( const std::string s ); + + virtual URL normalize( const URL url, const std::string s ); + + private: + //TODO: hide implementation details here (PIMPL) or don't + //allocate Normalizers, use a factory method (as this is + //anyway better for loadable module support!) + std::string componentString( const std::string &s, const url_parse::Component &comp ) const; +}; + +#endif diff --git a/tests/googleurl/GNUmakefile b/tests/googleurl/GNUmakefile index 387a9f2..cd7ba6a 100644 --- a/tests/googleurl/GNUmakefile +++ b/tests/googleurl/GNUmakefile @@ -2,12 +2,15 @@ TOPDIR = ../.. SUBDIRS = +#TODO: hide include dependency on googleurl here! INCLUDE_DIRS = \ + -I$(TOPDIR)/src \ -I$(TOPDIR)/googleurl INCLUDE_LDFLAGS = INCLUDE_LIBS = \ + $(TOPDIR)/src/libcrawlingwolf.a \ $(TOPDIR)/googleurl/libgoogleurl.a \ -licui18n -licuuc diff --git a/tests/googleurl/test1.cpp b/tests/googleurl/test1.cpp index a5b069d..278be5e 100644 --- a/tests/googleurl/test1.cpp +++ b/tests/googleurl/test1.cpp @@ -1,43 +1,30 @@ -#include "url_util.h" -#include "url_canon_stdstring.h" -#include "url_parse.h" +#include "URL.hpp" +#include "GoogleURLNormalizer.hpp" #include <iostream> #include <string> -using namespace url_util; -using namespace url_canon; -using namespace url_parse; using namespace std; int main( int argc, char *argv[] ) { - Initialize( ); - if( argc != 2 ) { cerr << "usage: test1 <url>\n" << endl; return 1; } - string urlstring = argv[1]; - string canonical; - canonical.reserve( urlstring.size( ) + 32 ); - StdStringCanonOutput output( &canonical ); - Parsed parsed; - bool success = Canonicalize( - urlstring.data( ), static_cast<int>( urlstring.length( ) ), - NULL, &output, &parsed ); - if( !success ) { + char *urlstring = argv[1]; + + URLNormalizer *normalizer = new GoogleURLNormalizer( ); + URL url = normalizer->parseUrl( urlstring ); + delete normalizer; + + if( url == URL::Null ) { cerr << "Illegal URL!" << endl; return 1; } - output.Complete( ); - - cout << "URL: " << canonical << endl; - - Shutdown( ); - -/* cout << "protocol: " << url.protocol( ) << endl + + cout << "protocol: " << url.protocol( ) << endl << "host: " << url.host( ) << endl << "port: " << url.port( ) << endl << "path: " << url.path( ) << endl @@ -45,7 +32,6 @@ int main( int argc, char *argv[] ) << "fragment: " << url.fragment( ) << endl; cout << "URL: " << url << endl; -*/ - + return 0; } |