summaryrefslogtreecommitdiff
path: root/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp
diff options
context:
space:
mode:
author Andreas Baumann <abaumann@yahoo.com> 2012-08-06 17:16:08 +0200
committer Andreas Baumann <abaumann@yahoo.com> 2012-08-06 17:16:08 +0200
commit 01bcb80ac096de72694135dff37e2ff70c2ab572 (patch)
tree 453cd7da8b1cdb67bee7a1eb1d450d94db6239ea /src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp
parent e59855fd87bea3641846d6b589059230b08043f1 (diff)
download crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.gz
crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.bz2
first steps to make URL loader loadable
Diffstat (limited to 'src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp')
-rw-r--r-- src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp 117
1 files changed, 117 insertions, 0 deletions
diff --git a/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp
new file mode 100644
index 0000000..e5810d6
--- /dev/null
+++ b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp
@@ -0,0 +1,117 @@
+#include "GoogleURLNormalizer.hpp"
+
+#include <string>
+
+#include "url_util.h"
+#include "url_canon_stdstring.h"
+#include "url_parse.h"
+
+using namespace std;
+using namespace url_util;
+using namespace url_canon;
+using namespace url_parse;
+
+// Default constructor: its only action is to call Initialize( ).
+// NOTE(review): Initialize( ) is presumably url_util::Initialize( ) from
+// googleurl, made visible by the using-directives of this file - confirm
+// against GoogleURLNormalizer.hpp that no member of the same name exists.
+GoogleURLNormalizer::GoogleURLNormalizer( )
+{
+ Initialize( );
+}
+
+// Destructor: its only action is to call Shutdown( ), the counterpart of the
+// Initialize( ) call made in the constructor.
+// NOTE(review): if Shutdown( ) releases process-wide googleurl state, then
+// destroying one normalizer may affect other live instances - TODO confirm.
+GoogleURLNormalizer::~GoogleURLNormalizer( )
+{
+ Shutdown( );
+}
+
+// Returns the text of one URL component as a string of its own.
+//
+// s    - the string the component offsets refer to
+// comp - position (begin) and length (len) of the component inside s
+//
+// A component with a non-positive length yields an empty string (the callers
+// in this file treat a negative length as "component not present").
+static string componentString( const string &s, const Component &comp )
+{
+ if( comp.len > 0 ) {
+  return s.substr( comp.begin, comp.len );
+ }
+
+ return string( );
+}
+
+// Canonicalizes the URL string s and converts the result into a URL object.
+//
+// Only scheme, host, port and path of the canonical form are handed to the
+// URL constructor; its last two arguments are always empty strings. If the
+// canonical form carries no port, URL::defaultPort( ) of the scheme is used.
+//
+// Returns URL::Null if Canonicalize( ) reports failure.
+URL GoogleURLNormalizer::parseUrl( const string s )
+{
+ string normalized;
+ normalized.reserve( s.size( ) + 32 );
+ StdStringCanonOutput sink( &normalized );
+ Parsed parts;
+
+ if( !Canonicalize( s.data( ), static_cast<int>( s.length( ) ),
+   NULL, &sink, &parts ) ) {
+  return URL::Null;
+ }
+ // must be called before the output string is read
+ sink.Complete( );
+
+ const string scheme = componentString( normalized, parts.scheme );
+
+ unsigned short port;
+ if( parts.port.len < 0 ) {
+  // no port in the canonical URL: fall back to the scheme's default
+  port = URL::defaultPort( scheme );
+ } else {
+  const string portText = componentString( normalized, parts.port );
+  port = (unsigned short)atoi( portText.c_str( ) );
+ }
+
+ return URL( scheme,
+  componentString( normalized, parts.host ),
+  port,
+  componentString( normalized, parts.path ),
+  "", "" );
+}
+
+// Resolves the (possibly relative) reference s against the base URL url and
+// returns the canonical result as a URL object.
+//
+// url - base URL; its string form (url.str( )) is canonicalized first
+// s   - the reference to resolve, e.g. a link found in a document
+//
+// Only scheme, host, port and path of the result are handed to the URL
+// constructor; its last two arguments are always empty strings. If the
+// result carries no port, URL::defaultPort( ) of its scheme is used.
+//
+// Returns URL::Null if the base cannot be canonicalized or if the reference
+// cannot be resolved against it.
+URL GoogleURLNormalizer::normalize( const URL url, const string s )
+{
+ // step 1: canonicalize the base URL
+ string urlstr = url.str( );
+ string urlCanonical;
+ urlCanonical.reserve( urlstr.size( ) + 32 );
+ StdStringCanonOutput urlOutput( &urlCanonical );
+ Parsed urlParsed;
+ bool success = Canonicalize(
+  urlstr.data( ), static_cast<int>( urlstr.length( ) ),
+  NULL, &urlOutput, &urlParsed );
+ if( !success ) {
+  return URL::Null;
+ }
+ urlOutput.Complete( );
+
+ // step 2: resolve the reference against the canonical base.
+ // urlParsed holds offsets into urlCanonical (the canonicalizer's output),
+ // not into urlstr. The base spec handed to ResolveRelative must therefore
+ // be urlCanonical; passing urlstr mismatches the offsets whenever
+ // canonicalization changed the length of the text.
+ string canonical;
+ canonical.reserve( urlCanonical.size( ) + s.size( ) + 32 );
+ StdStringCanonOutput output( &canonical );
+ Parsed parsed;
+ success = ResolveRelative(
+  urlCanonical.data( ), static_cast<int>( urlCanonical.length( ) ),
+  urlParsed,
+  s.data( ), static_cast<int>( s.length( ) ),
+  NULL, &output, &parsed );
+ if( !success ) {
+  return URL::Null;
+ }
+ output.Complete( );
+
+ // step 3: pick the components out of the resolved canonical URL
+ unsigned short port;
+ if( parsed.port.len >= 0 ) {
+  port = static_cast<unsigned short>( atoi(
+   componentString( canonical, parsed.port ).c_str( ) ) );
+ } else {
+  port = URL::defaultPort(
+   componentString( canonical, parsed.scheme ) );
+ }
+
+ return URL( componentString( canonical, parsed.scheme ),
+  componentString( canonical, parsed.host ),
+  port,
+  componentString( canonical, parsed.path ),
+  "", "" );
+}
+
+// Factory function handed to the module registry below: allocates a new
+// GoogleURLNormalizer with new and returns it as a URLNormalizer pointer.
+// The matching release function is destroy( ).
+static URLNormalizer *create( )
+{
+ return new GoogleURLNormalizer( );
+}
+
+// Release function handed to the module registry below: deletes obj through
+// its URLNormalizer base pointer.
+// NOTE(review): this relies on URLNormalizer having a virtual destructor -
+// verify in the URLNormalizer header.
+static void destroy( URLNormalizer *obj )
+{
+ delete obj;
+}
+
+// Global registry object announcing this module under the name "google"
+// together with its create/destroy functions.
+// NOTE(review): presumably this is the symbol the module loader looks up
+// after loading the shared object - confirm against the loader code.
+ModuleRegistry<URLNormalizer> registry( "google", &create, &destroy );