#include #include #include "SimpleURLNormalizer.hpp" #include "Unused.hpp" #ifdef WITH_LUA #include "tolua.h" #include "SimpleURLNormalizerLua.hpp" #include "LuaVM.hpp" #endif using namespace std; SimpleURLNormalizer::SimpleURLNormalizer( ) { } SimpleURLNormalizer::~SimpleURLNormalizer( ) { } URL SimpleURLNormalizer::parseUrl( const string s ) { if( s.empty( ) ) { return URL::Null; } // protocol string protocol; string::const_iterator protocolStart = s.begin( ); string::const_iterator protocolEnd = find( protocolStart, s.end( ), ':' ); if( protocolEnd == s.end( ) ) { // no protocol separator ':', not really legal return URL::Null; } protocol = &*protocolEnd; if( protocol.length( ) < 3 || protocol.substr( 0, 3 ) != "://" ) { // no protocol, not really legal return URL::Null; } protocol = string( protocolStart, protocolEnd ); protocolEnd += 3; // host string host; string::const_iterator hostStart = protocolEnd; string::const_iterator pathStart = find( hostStart, s.end( ), '/' ); string::const_iterator hostEnd = find( protocolEnd, pathStart, ':' ); host = string( hostStart, hostEnd ); // port unsigned short port = URL::defaultPort( protocol ); if( hostEnd != s.end( ) && *hostEnd == ':' ) { hostEnd++; string::const_iterator portEnd = pathStart; string portStr = string( hostEnd, portEnd ); port = (unsigned short)atoi( portStr.c_str( ) ); } // path string path; if( pathStart != s.end( ) ) { path = string( pathStart, s.end( ) ); } else { // add trailing slash if path is empty path = "/"; } // TODO: fragment string fragment; // TODO: query string query; return URL( protocol, host, port, path, query, fragment ); } /* * protocol: * down vote favorite 2 I would like to ask if there's any Java package or library that have the standard URL normalization? 5 Components of URL Representation http://www[dot]example[dot]com:8040/folder/exist?name=sky#head scheme: http authority: www.example.com:8040 path: /folder/exist query: ?name=sky fragment: #head The 3 types of standard URL normalization Syntax-Based Normalization Case normalization – convert all letter at scheme and authority components to lower case Percent-encoded normalization – decode any percent-encoded octet that corresponds to unreserved character, such as %2D for hyphen and %5 for underscore Path segment normalization – remove dot-segments from the path component, such as ‘.’ and ‘..’ domains: https://github.com/john-kurkowski/tldextract * http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url * http://stackoverflow.com/questions/2616011/easy-way-to-parse-a-url-in-c-cross-platform * */ URL SimpleURLNormalizer::normalize( const URL url, const string s ) { // See if the URL is parseable, if so it is an absolute URL URL absUrl = parseUrl( s ); if( absUrl != URL::Null ) { return absUrl; } // the new path starts with a slash, so it's absolute, ignore // the old path if( s[0] == '/' ) { return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" ); } // relative links have path, query and fragment only, try to // append them cleverly string oldPath = url.path( ); // find out the directory of the base path size_t found = oldPath.rfind( "/" ); if( !found ) { // no directory at all, just a file, so the new path // is returned return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" ); } oldPath.erase( found ); string path = oldPath; path.append( "/" ); // append the one from s path.append( s ); // normalize sequences of "/", "." and ".." normalizePath( path ); return URL( url.protocol( ), url.host( ), url.port( ), path, "", "" ); } void SimpleURLNormalizer::normalizePath( string &path ) { size_t found; found = path.find( "./" ); while( found != string::npos ) { path.replace( found, 2, "" ); found = path.find( "./" ); } } static void initModule( CRAWLER_UNUSED( void *user_data ) ) { #ifdef WITH_LUA LuaVM *luaVm = (LuaVM *)user_data; tolua_SimpleURLNormalizer_open( luaVm->handle( ) ); #endif } static void destroyModule( void * /* user_data */ ) { } REGISTER_MODULE( "simple_urlnormalizer", &initModule, &destroyModule, URLNormalizer, SimpleURLNormalizer )