summaryrefslogtreecommitdiff
path: root/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-08-06 17:16:08 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-08-06 17:16:08 +0200
commit01bcb80ac096de72694135dff37e2ff70c2ab572 (patch)
tree453cd7da8b1cdb67bee7a1eb1d450d94db6239ea /src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
parente59855fd87bea3641846d6b589059230b08043f1 (diff)
downloadcrawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.gz
crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.bz2
first steps to make URL loader loadable
Diffstat (limited to 'src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp')
-rw-r--r--src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp159
1 files changed, 159 insertions, 0 deletions
diff --git a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
new file mode 100644
index 0000000..328a82b
--- /dev/null
+++ b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
@@ -0,0 +1,159 @@
+#include <algorithm>
+#include <cstdlib>
+#include <string>
+
+#include "SimpleURLNormalizer.hpp"
+
+using namespace std;
+
+/** Default constructor; its body is empty. */
+SimpleURLNormalizer::SimpleURLNormalizer( ) { }
+
+/**
+ * Parse an absolute URL of the form "protocol://host[:port][/path]".
+ *
+ * @param s the string to parse
+ * @return the parsed URL, or URL::Null if s is empty, if the first ':'
+ *         in s does not start the sequence "://", or if the port is not
+ *         a decimal number in the range 0..65535
+ *
+ * When no port (or an empty port, as in "host:/") is given, the port is
+ * taken from URL::defaultPort( protocol ). An empty path becomes "/".
+ * Query and fragment are not split off yet: they stay part of the path
+ * and the URL is built with an empty query and fragment.
+ *
+ * NOTE(review): user info ("user:pass@host") and IPv6 literals
+ * ("[::1]") are not understood; the ':' they contain is taken for the
+ * port separator.
+ */
+URL SimpleURLNormalizer::parseUrl( const string s )
+{
+	if( s.empty( ) ) {
+		return URL::Null;
+	}
+
+	// protocol: everything in front of the first ':', which in turn
+	// must be the start of "://". compare() clamps the length to what
+	// is left of s, so a too short tail simply compares unequal.
+	const string::size_type protocolLen = s.find( ':' );
+	if( protocolLen == string::npos ||
+		s.compare( protocolLen, 3, "://" ) != 0 ) {
+		// no protocol, not really legal
+		return URL::Null;
+	}
+	const string protocol = s.substr( 0, protocolLen );
+
+	// host: reaches up to the port separator ':' or, if there is none,
+	// up to the first '/' (or the end of s)
+	const string::size_type hostStart = protocolLen + 3;
+	string::size_type pathStart = s.find( '/', hostStart );
+	if( pathStart == string::npos ) {
+		pathStart = s.size( );
+	}
+	string::size_type hostEnd = s.find( ':', hostStart );
+	const bool hasPort = ( hostEnd != string::npos && hostEnd < pathStart );
+	if( !hasPort ) {
+		hostEnd = pathStart;
+	}
+	const string host = s.substr( hostStart, hostEnd - hostStart );
+
+	// port
+	unsigned short port = URL::defaultPort( protocol );
+	if( hasPort ) {
+		const string portStr = s.substr( hostEnd + 1, pathStart - hostEnd - 1 );
+		// an empty port keeps the default of the protocol
+		if( !portStr.empty( ) ) {
+			if( portStr.find_first_not_of( "0123456789" ) != string::npos ) {
+				return URL::Null;
+			}
+			// strtoul saturates on overflow, which the range check
+			// below rejects as well
+			const unsigned long portValue = strtoul( portStr.c_str( ), NULL, 10 );
+			if( portValue > 65535UL ) {
+				return URL::Null;
+			}
+			port = static_cast<unsigned short>( portValue );
+		}
+	}
+
+	// path
+	string path;
+	if( pathStart < s.size( ) ) {
+		path = s.substr( pathStart );
+	} else {
+		// add trailing slash if path is empty
+		path = "/";
+	}
+
+	// TODO: fragment
+	string fragment;
+
+	// TODO: query
+	string query;
+
+	return URL( protocol, host, port, path, query, fragment );
+}
+
+/*
+ * protocol:
+*
+(Adapted from a Stack Overflow question.)
+Is there any Java package or library that implements the standard URL normalization?
+
+5 Components of URL Representation
+
+http://www[dot]example[dot]com:8040/folder/exist?name=sky#head
+scheme: http
+authority: www.example.com:8040
+path: /folder/exist
+query: ?name=sky
+fragment: #head
+
+The 3 types of standard URL normalization
+
+Syntax-Based Normalization
+Case normalization – convert all letters in the scheme and authority components to lower case
+Percent-encoded normalization – decode any percent-encoded octet that corresponds to an unreserved character, such as %2D for hyphen and %5F for underscore
+Path segment normalization – remove dot-segments from the path component, such as ‘.’ and ‘..’
+
+domains:
+https://github.com/john-kurkowski/tldextract
+* http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url
+* http://stackoverflow.com/questions/2616011/easy-way-to-parse-a-url-in-c-cross-platform
+*
+ */
+
+/**
+ * Resolve the link s found in the document at the base URL url.
+ *
+ * @param url the base URL the link is relative to
+ * @param s   the link, either absolute or relative
+ * @return - the parsed s, if parseUrl( ) accepts it as an absolute URL
+ *         - protocol, host and port of url with s as path, if s starts
+ *           with a '/'
+ *         - otherwise protocol, host and port of url with the path built
+ *           from the directory of url.path( ), a '/' and s, passed
+ *           through normalizePath( )
+ *
+ * In the last two cases query and fragment of the result are empty; a
+ * query or fragment contained in s ends up in the path.
+ */
+URL SimpleURLNormalizer::normalize( const URL url, const string s )
+{
+	// See if the URL is parseable, if so it is an absolute URL
+	URL absUrl = parseUrl( s );
+	if( absUrl != URL::Null ) {
+		return absUrl;
+	}
+
+	// the new path starts with a slash, so it's absolute, ignore
+	// the old path
+	if( !s.empty( ) && s[0] == '/' ) {
+		return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" );
+	}
+
+	// relative links have path, query and fragment only, try to
+	// append them cleverly
+
+	// cut the base path down to its directory, i.e. drop the last '/'
+	// and everything behind it. Without any '/' in the base path the
+	// directory is empty; rfind( ) reports that as npos (not as 0),
+	// and npos must not be handed to erase( ) as it would throw.
+	string path = url.path( );
+	const size_t lastSlash = path.rfind( '/' );
+	path.erase( lastSlash == string::npos ? 0 : lastSlash );
+
+	// the result always starts with a '/', also for a base path like
+	// "/index.html" whose directory part is empty
+	path.append( "/" );
+
+	// append the one from s
+	path.append( s );
+
+	// normalize sequences of "/", "." and ".."
+	normalizePath( path );
+
+	return URL( url.protocol( ), url.host( ), url.port( ), path, "", "" );
+}
+
+/**
+ * Remove the dot segments "." and ".." from a path, in place.
+ *
+ * A "." segment is dropped, a ".." segment is dropped together with the
+ * segment in front of it. A ".." that would climb above the start of
+ * the path is dropped silently. Only whole segments are considered, so
+ * names like ".hidden" or "a." stay untouched. Everything from the
+ * first '?' or '#' on is copied verbatim.
+ *
+ * Examples: "/a/./b" -> "/a/b", "/a/b/../c" -> "/a/c", "/a/b/.." -> "/a/"
+ *
+ * @param path the path to normalize, modified in place
+ */
+void SimpleURLNormalizer::normalizePath( string &path )
+{
+	// only the part in front of a query or fragment is a path
+	const size_t pathEnd = min( path.find_first_of( "?#" ), path.size( ) );
+
+	// a leading '/' is kept and can never be removed by ".."
+	const size_t rootLen = ( pathEnd > 0 && path[0] == '/' ) ? 1 : 0;
+
+	// invariant while looping: result is empty or ends in a '/'
+	string result( path, 0, rootLen );
+	size_t segStart = rootLen;
+	for( ;; ) {
+		size_t segEnd = path.find( '/', segStart );
+		const bool last = ( segEnd == string::npos || segEnd >= pathEnd );
+		if( last ) {
+			segEnd = pathEnd;
+		}
+		const string segment( path, segStart, segEnd - segStart );
+
+		if( segment == ".." ) {
+			// drop the previous segment: cut behind the '/' in front
+			// of it (skipping the '/' result currently ends in)
+			if( result.size( ) > rootLen ) {
+				const size_t prev = result.rfind( '/', result.size( ) - 2 );
+				result.erase( prev == string::npos ? 0 : prev + 1 );
+			}
+		} else if( segment != "." ) {
+			result.append( segment );
+			if( !last ) {
+				result.append( 1, '/' );
+			}
+		}
+
+		if( last ) {
+			break;
+		}
+		segStart = segEnd + 1;
+	}
+
+	// put query and fragment back unchanged
+	result.append( path, pathEnd, string::npos );
+	path.swap( result );
+}
+
+// Creation function handed to the module registry: returns a newly
+// allocated SimpleURLNormalizer as a URLNormalizer pointer.
+static URLNormalizer *create( )
+{
+	URLNormalizer *normalizer = new SimpleURLNormalizer( );
+	return normalizer;
+}
+
+// Destruction function handed to the module registry: deletes the
+// object passed in.
+static void destroy( URLNormalizer *normalizer )
+{
+	delete normalizer;
+}
+
+// Registry object for this module: constructed with the name "simple"
+// and the create/destroy functions defined above.
+ModuleRegistry<URLNormalizer> registry( "simple", &create, &destroy );