diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-06 17:16:08 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-06 17:16:08 +0200 |
commit | 01bcb80ac096de72694135dff37e2ff70c2ab572 (patch) | |
tree | 453cd7da8b1cdb67bee7a1eb1d450d94db6239ea /src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp | |
parent | e59855fd87bea3641846d6b589059230b08043f1 (diff) | |
download | crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.gz crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.bz2 |
first steps to make URL loader loadable
Diffstat (limited to 'src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp')
-rw-r--r-- | src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp | 117 |
1 files changed, 117 insertions, 0 deletions
diff --git a/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp new file mode 100644 index 0000000..e5810d6 --- /dev/null +++ b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp @@ -0,0 +1,117 @@ +#include "GoogleURLNormalizer.hpp" + +#include <string> + +#include "url_util.h" +#include "url_canon_stdstring.h" +#include "url_parse.h" + +using namespace std; +using namespace url_util; +using namespace url_canon; +using namespace url_parse; + +GoogleURLNormalizer::GoogleURLNormalizer( ) +{ + Initialize( ); +} + +GoogleURLNormalizer::~GoogleURLNormalizer( ) +{ + Shutdown( ); +} + +static string componentString( const string &s, const Component &comp ) +{ + if( comp.len <= 0 ) { + return string( ); + } else { + return string( s, comp.begin, comp.len ); + } +} + +URL GoogleURLNormalizer::parseUrl( const string s ) +{ + string canonical; + canonical.reserve( s.size( ) + 32 ); + StdStringCanonOutput output( &canonical ); + Parsed parsed; + bool success = Canonicalize( + s.data( ), static_cast<int>( s.length( ) ), + NULL, &output, &parsed ); + if( !success ) { + return URL::Null; + } + output.Complete( ); + + unsigned short port; + if( parsed.port.len >= 0 ) { + port = (unsigned short)atoi( + componentString( canonical, parsed.port ).c_str( ) ); + } else { + port = URL::defaultPort( + componentString( canonical, parsed.scheme ) ); + } + + return URL( componentString( canonical, parsed.scheme ), + componentString( canonical, parsed.host ), + port, + componentString( canonical, parsed.path ), + "", "" ); +} + +URL GoogleURLNormalizer::normalize( const URL url, const string s ) +{ + string urlstr = url.str( ); + string urlCanonical; + urlCanonical.reserve( urlstr.size( ) + 32 ); + StdStringCanonOutput urlOutput( &urlCanonical ); + Parsed urlParsed; + bool success = Canonicalize( + urlstr.data( ), static_cast<int>( urlstr.length( ) ), + NULL, &urlOutput, &urlParsed ); + if( !success ) { + return URL::Null; + } + urlOutput.Complete( ); + + string canonical; + canonical.reserve( urlstr.size( ) + s.size( ) + 32 ); + StdStringCanonOutput output( &canonical ); + Parsed parsed; + success = ResolveRelative( + urlstr.data( ), static_cast<int>( urlstr.length( ) ), urlParsed, + s.data( ), static_cast<int>( s.length( ) ), + NULL, &output, &parsed ); + if( !success ) { + return URL::Null; + } + output.Complete( ); + + unsigned short port; + if( parsed.port.len >= 0 ) { + port = (unsigned short)atoi( + componentString( canonical, parsed.port ).c_str( ) ); + } else { + port = URL::defaultPort( + componentString( canonical, parsed.scheme ) ); + } + + return URL( componentString( canonical, parsed.scheme ), + componentString( canonical, parsed.host ), + port, + componentString( canonical, parsed.path ), + "", "" ); +} + +static URLNormalizer *create( ) +{ + return new GoogleURLNormalizer( ); +} + +static void destroy( URLNormalizer *obj ) +{ + delete obj; +} + +ModuleRegistry<URLNormalizer> registry( "google", &create, &destroy ); |