diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-06 17:16:08 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-06 17:16:08 +0200 |
commit | 01bcb80ac096de72694135dff37e2ff70c2ab572 (patch) | |
tree | 453cd7da8b1cdb67bee7a1eb1d450d94db6239ea /src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp | |
parent | e59855fd87bea3641846d6b589059230b08043f1 (diff) | |
download | crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.gz crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.bz2 |
first steps to make URL loader loadable
Diffstat (limited to 'src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp')
-rw-r--r-- | src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp new file mode 100644 index 0000000..328a82b --- /dev/null +++ b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp @@ -0,0 +1,159 @@ +#include <string> +#include <algorithm> + +#include "SimpleURLNormalizer.hpp" + +using namespace std; + +SimpleURLNormalizer::SimpleURLNormalizer( ) +{ +} + +URL SimpleURLNormalizer::parseUrl( const string s ) +{ + if( s.empty( ) ) { + return URL::Null; + } + + // protocol + string protocol; + string::const_iterator protocolStart = s.begin( ); + string::const_iterator protocolEnd = find( protocolStart, s.end( ), ':' ); + if( protocolStart == s.end( ) ) { + // no protocol separator ':', not really legal + return URL::Null; + } + protocol = &*protocolEnd; + if( protocol.length( ) < 3 || protocol.substr( 0, 3 ) != "://" ) { + // no protocol, not really legal + return URL::Null; + } + protocol = string( protocolStart, protocolEnd ); + protocolEnd += 3; + + // host + string host; + string::const_iterator hostStart = protocolEnd; + string::const_iterator pathStart = find( hostStart, s.end( ), '/' ); + string::const_iterator hostEnd = find( protocolEnd, pathStart, ':' ); + host = string( hostStart, hostEnd ); + + // port + unsigned short port = URL::defaultPort( protocol ); + if( hostEnd != s.end( ) && *hostEnd == ':' ) { + hostEnd++; + string::const_iterator portEnd = pathStart; + string portStr = string( hostEnd, portEnd ); + port = (unsigned short)atoi( portStr.c_str( ) ); + } + + // path + string path; + if( pathStart != s.end( ) ) { + path = string( pathStart, s.end( ) ); + } else { + // add trailing slash if path is empty + path = "/"; + } + + // TODO: fragment + string fragment; + + // TODO: query + string query; + + return URL( protocol, host, port, path, query, fragment ); +} + +/* + * protocol: +* +down vote +favorite +2 +I would like to ask if there's any Java package or library that have the standard URL normalization? + +5 Components of URL Representation + +http://www[dot]example[dot]com:8040/folder/exist?name=sky#head +scheme: http +authority: www.example.com:8040 +path: /folder/exist +query: ?name=sky +fragment: #head + +The 3 types of standard URL normalization + +Syntax-Based Normalization +Case normalization – convert all letter at scheme and authority components to lower case +Percent-encoded normalization – decode any percent-encoded octet that corresponds to unreserved character, such as %2D for hyphen and %5 for underscore +Path segment normalization – remove dot-segments from the path component, such as ‘.’ and ‘..’ + +domains: +https://github.com/john-kurkowski/tldextract +* http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url +* http://stackoverflow.com/questions/2616011/easy-way-to-parse-a-url-in-c-cross-platform +* + */ + +URL SimpleURLNormalizer::normalize( const URL url, const string s ) +{ + // See if the URL is parseable, if so it is an absolute URL + URL absUrl = parseUrl( s ); + if( absUrl != URL::Null ) { + return absUrl; + } + + // the new path starts with a slash, so it's absolute, ignore + // the old path + if( s[0] == '/' ) { + return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" ); + } + + // relative links have path, query and fragment only, try to + // append them cleverly + string oldPath = url.path( ); + + // find out the directory of the base path + size_t found = oldPath.rfind( "/" ); + if( !found ) { + // no directory at all, just a file, so the new path + // is returned + return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" ); + } + oldPath.erase( found ); + + string path = oldPath; + path.append( "/" ); + + // append the one from s + path.append( s ); + + // normalize sequences of "/", "." and ".." + normalizePath( path ); + + return URL( url.protocol( ), url.host( ), url.port( ), path, "", "" ); +} + +void SimpleURLNormalizer::normalizePath( string &path ) +{ + size_t found; + + found = path.find( "./" ); + while( found != string::npos ) { + path.replace( found, 2, "" ); + found = path.find( "./" ); + } +} + +static URLNormalizer *create( ) +{ + return new SimpleURLNormalizer( ); +} + +static void destroy( URLNormalizer *obj ) +{ + delete obj; +} + +ModuleRegistry<URLNormalizer> registry( "simple", &create, &destroy ); |