first steps to make URL loader loadable

author: Andreas Baumann <abaumann@yahoo.com> 2012-08-06 17:16:08 +0200
committer: Andreas Baumann <abaumann@yahoo.com> 2012-08-06 17:16:08 +0200
commit: 01bcb80ac096de72694135dff37e2ff70c2ab572 (patch)
tree: 453cd7da8b1cdb67bee7a1eb1d450d94db6239ea /src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
parent: e59855fd87bea3641846d6b589059230b08043f1 (diff)
download: crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.gz
crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.bz2
1 files changed, 159 insertions, 0 deletions
diff --git a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
new file mode 100644
index 0000000..328a82b
--- /dev/null
+++ b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
@@ -0,0 +1,159 @@
+#include <string>
+#include <algorithm>
+
+#include "SimpleURLNormalizer.hpp"
+
+using namespace std;
+
+// Default constructor; the constructor body is empty.
+SimpleURLNormalizer::SimpleURLNormalizer( ) {
+}
+
+// Parses an absolute URL of the form protocol://host[:port][/path].
+// Returns URL::Null when s is empty or no "://" follows the protocol;
+// query and fragment are not split off yet and remain part of the path.
+URL SimpleURLNormalizer::parseUrl( const string s )
+{
+	if( s.empty( ) ) {
+		return URL::Null;
+	}
+
+	// protocol: everything in front of the first ':'
+	string::const_iterator protocolStart = s.begin( );
+	string::const_iterator protocolEnd = find( protocolStart, s.end( ), ':' );
+	if( protocolEnd == s.end( ) ) {
+		// no protocol separator ':', not really legal
+		return URL::Null;
+	}
+	// check for "://" on the iterators, never read past the end of s
+	if( s.end( ) - protocolEnd < 3 || !equal( protocolEnd, protocolEnd + 3, "://" ) ) {
+		// no protocol, not really legal
+		return URL::Null;
+	}
+	string protocol( protocolStart, protocolEnd );
+	protocolEnd += 3;
+
+	// host: ends at the port separator ':' or where the path starts
+	string::const_iterator hostStart = protocolEnd;
+	string::const_iterator pathStart = find( hostStart, s.end( ), '/' );
+	string::const_iterator hostEnd = find( hostStart, pathStart, ':' );
+	string host( hostStart, hostEnd );
+
+	// port: the value from URL::defaultPort unless given explicitly
+	unsigned short port = URL::defaultPort( protocol );
+	if( hostEnd != s.end( ) && *hostEnd == ':' ) {
+		hostEnd++;
+		string portStr( hostEnd, pathStart );
+		port = (unsigned short)atoi( portStr.c_str( ) );
+	}
+
+	// path
+	string path;
+	if( pathStart != s.end( ) ) {
+		path = string( pathStart, s.end( ) );
+	} else {
+		// add trailing slash if path is empty
+		path = "/";
+	}
+
+	// TODO: fragment
+	string fragment;
+
+	// TODO: query
+	string query;
+
+	return URL( protocol, host, port, path, query, fragment );
+}
+
+/*
+ * protocol:
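+ *
+ * The notes below appear to be pasted from a Stack Overflow question (the
+ * vote/favorite widget text of the page has been dropped). It asked:
+ * "I would like to ask if there's any Java package or library that has the
+ * standard URL normalization?"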
+
+5 Components of URL Representation
+
+http://www[dot]example[dot]com:8040/folder/exist?name=sky#head 
+scheme: http 
+authority: www.example.com:8040
+path: /folder/exist
+query: ?name=sky
+fragment: #head
+
+The 3 types of standard URL normalization 
+
+Syntax-Based Normalization 
+Case normalization – convert all letters in the scheme and authority components to lower case
+Percent-encoded normalization – decode any percent-encoded octet that corresponds to an unreserved character, such as %2D for hyphen and %5F for underscore
+Path segment normalization – remove dot-segments from the path component, such as ‘.’ and ‘..’
+
+domains:
+https://github.com/john-kurkowski/tldextract
+* http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url
+* http://stackoverflow.com/questions/2616011/easy-way-to-parse-a-url-in-c-cross-platform
+* 
+ */
+
+// Resolves link s against the base URL url. Absolute links are returned as
+// parsed, links starting with '/' replace the base path, everything else is
+// appended to the directory of the base path. Query/fragment stay empty.
+URL SimpleURLNormalizer::normalize( const URL url, const string s )
+{
+	// See if the URL is parseable, if so it is an absolute URL
+	URL absUrl = parseUrl( s );
+	if( absUrl != URL::Null ) {
+		return absUrl;
+	}
+
+	// the new path starts with a slash, so it's absolute, ignore
+	// the old path
+	if( s[0] == '/' ) {
+		return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" );
+	}
+
+	// relative links have path, query and fragment only, try to
+	// append them cleverly
+	string oldPath = url.path( );
+	// find out the directory of the base path
+	size_t found = oldPath.rfind( '/' );
+	if( found == string::npos ) {
+		// rfind reports "no slash" as npos, not 0: no directory at all,
+		// just a file, so the new path is returned
+		return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" );
+	}
+	// keep the directory including its trailing slash
+	string path( oldPath, 0, found + 1 );
+
+	// append the one from s
+	path.append( s );
+
+	// normalize sequences of "/", "." and ".."
+	normalizePath( path );
+
+	return URL( url.protocol( ), url.host( ), url.port( ), path, "", "" );
+}
+
+// Removes "./" segments in place; "../" and names ending in '.' are kept.
+void SimpleURLNormalizer::normalizePath( string &path )
+{
+	size_t found = path.find( "./" );
+	while( found != string::npos ) {
+		bool dotSegment = ( found == 0 || path[found - 1] == '/' );
+		if( dotSegment ) path.erase( found, 2 );
+		found = path.find( "./", dotSegment ? found : found + 2 );
+	}
+}
+
+// Creation callback: returns a newly allocated SimpleURLNormalizer.
+static URLNormalizer *create( ) {
+	return new SimpleURLNormalizer( );
+}
+
+// Destruction callback: deletes the object passed in.
+static void destroy( URLNormalizer *obj ) {
+	delete obj; // NOTE(review): needs a virtual ~URLNormalizer — confirm
+}
+
+ModuleRegistry<URLNormalizer> registry( "simple", &create, &destroy ); // presumably registers this module under the name "simple" — confirm against ModuleRegistry
author	Andreas Baumann <abaumann@yahoo.com>	2012-08-06 17:16:08 +0200
committer	Andreas Baumann <abaumann@yahoo.com>	2012-08-06 17:16:08 +0200
commit	01bcb80ac096de72694135dff37e2ff70c2ab572 (patch)
tree	453cd7da8b1cdb67bee7a1eb1d450d94db6239ea /src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
parent	e59855fd87bea3641846d6b589059230b08043f1 (diff)
download	crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.gz crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.bz2