#include "GoogleURLNormalizer.hpp"

//#include
#include <cstdlib>
#include <string>

#include "url_util.h"
#include "url_canon_stdstring.h"
#include "url_parse.h"

#include "Unused.hpp"

#ifdef WITH_LUA
#include "tolua.h"
#include "GoogleURLNormalizerLua.hpp"
#include "LuaVM.hpp"
#endif

using namespace std;
using namespace url_util;
using namespace url_canon;
using namespace url_parse;

/// Constructs the normalizer and runs the url_util Initialize( ) routine.
GoogleURLNormalizer::GoogleURLNormalizer( )
{
	Initialize( );
}

/// Runs the url_util Shutdown( ) routine matching the constructor's Initialize( ).
GoogleURLNormalizer::~GoogleURLNormalizer( )
{
	Shutdown( );
}

/// Extracts the text of one parsed component from a URL string.
///
/// @param s     string the component offsets refer to
/// @param comp  component (begin offset and length) inside `s`
/// @return the substring [comp.begin, comp.begin + comp.len) of `s`, or an
///         empty string when the component length is zero or negative
static string componentString( const string &s, const Component &comp )
{
	if( comp.len <= 0 ) {
		return string( );
	}

	return string( s, comp.begin, comp.len );
}

/// Builds a URL object from a canonical URL string and its parsed components.
///
/// The port is taken from the port component when one is present; otherwise
/// URL::defaultPort( ) is asked for the port belonging to the scheme.
///
/// @param canonical  canonical URL text the offsets in `parsed` refer to
/// @param parsed     component offsets into `canonical`
/// @return URL assembled from scheme, host, port, path and query
static URL urlFromCanonical( const string &canonical, const Parsed &parsed )
{
	const string scheme = componentString( canonical, parsed.scheme );

	unsigned short port;
	if( parsed.port.len >= 0 ) {
		port = static_cast<unsigned short>(
			atoi( componentString( canonical, parsed.port ).c_str( ) ) );
	} else {
		port = URL::defaultPort( scheme );
	}

	// The last constructor argument is always passed as an empty string
	// (presumably the fragment - confirm against the URL class).
	return URL( scheme,
		componentString( canonical, parsed.host ),
		port,
		componentString( canonical, parsed.path ),
		componentString( canonical, parsed.query ),
		"" );
}

/// Canonicalizes a URL string and splits it into its components.
///
/// @param s  URL text to parse
/// @return the parsed URL, or URL::Null when canonicalization fails
URL GoogleURLNormalizer::parseUrl( const string s )
{
	string canonical;
	canonical.reserve( s.size( ) + 32 );
	StdStringCanonOutput output( &canonical );
	Parsed parsed;

	const bool success = Canonicalize( s.data( ), static_cast<int>( s.length( ) ),
		NULL, &output, &parsed );
	if( !success ) {
		return URL::Null;
	}

	// Complete( ) must run before `canonical` is read.
	output.Complete( );

	return urlFromCanonical( canonical, parsed );
}

/// Resolves a (possibly relative) URL string against a base URL.
///
/// @param url  base URL
/// @param s    URL text to resolve relative to `url`
/// @return the resolved, canonical URL, or URL::Null when either the base
///         cannot be canonicalized or the relative resolution fails
URL GoogleURLNormalizer::normalize( const URL url, const string s )
{
	const string urlstr = url.str( );

	// Step 1: canonicalize the base URL.
	string urlCanonical;
	urlCanonical.reserve( urlstr.size( ) + 32 );
	StdStringCanonOutput urlOutput( &urlCanonical );
	Parsed urlParsed;

	bool success = Canonicalize( urlstr.data( ), static_cast<int>( urlstr.length( ) ),
		NULL, &urlOutput, &urlParsed );
	if( !success ) {
		return URL::Null;
	}

	urlOutput.Complete( );

	// Step 2: resolve `s` against the canonical base.
	string canonical;
	canonical.reserve( urlCanonical.size( ) + s.size( ) + 32 );
	StdStringCanonOutput output( &canonical );
	Parsed parsed;

	// urlParsed holds offsets into the canonical output, not into urlstr, so
	// the base spec handed to ResolveRelative has to be urlCanonical. Passing
	// urlstr would read the base components at wrong offsets whenever
	// canonicalization changed the text (for example a dropped default port).
	success = ResolveRelative( urlCanonical.data( ),
		static_cast<int>( urlCanonical.length( ) ), urlParsed,
		s.data( ), static_cast<int>( s.length( ) ),
		NULL, &output, &parsed );
	if( !success ) {
		return URL::Null;
	}

	output.Complete( );

	return urlFromCanonical( canonical, parsed );
}

/// Module init hook: when built WITH_LUA, opens the tolua bindings of this
/// normalizer on the Lua state of the LuaVM passed in as user data.
/// Without WITH_LUA the function does nothing.
static void initModule( CRAWLER_UNUSED( void *user_data ) )
{
#ifdef WITH_LUA
	LuaVM *luaVm = static_cast<LuaVM *>( user_data );
	tolua_GoogleURLNormalizer_open( luaVm->handle( ) );
#endif
}

/// Module teardown hook: nothing to release.
static void destroyModule( void * /* user_data */ )
{
}

REGISTER_MODULE( "google_urlnormalizer", &initModule, &destroyModule, URLNormalizer, GoogleURLNormalizer )