temporarily removed domain, domain filter is a host filter now

author: Andreas Baumann <abaumann@yahoo.com> 2012-07-29 13:08:00 +0200
committer: Andreas Baumann <abaumann@yahoo.com> 2012-07-29 13:08:00 +0200
commit: a29f0c14ed938c399531b696e93208044e2c6e07 (patch)
tree: 5f3bfcf4792ea843e57f30ac33e375aa934764b9 /src
parent: ae148a31b891c760eb08a6f9a2c279f6d7dd6ec2 (diff)
download: crawler-a29f0c14ed938c399531b696e93208044e2c6e07.tar.gz
crawler-a29f0c14ed938c399531b696e93208044e2c6e07.tar.bz2
8 files changed, 84 insertions, 70 deletions
diff --git a/src/DomainURLFilter.cpp b/src/DomainURLFilter.cpp
deleted file mode 100644
index 7eb6560..0000000
--- a/src/DomainURLFilter.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include "DomainURLFilter.hpp"
-#include "Logger.hpp"
-
-DomainURLFilter::DomainURLFilter( const std::set<std::string> domains )
-	: m_domains( domains )
-{
-}
-		
-bool DomainURLFilter::filter( const URL url )
-{
-	string domain = url.domain( );
-	bool res = ( m_domains.find( domain ) != m_domains.end( ) );
-	
-	LOG( logDEBUG )	<< "Checking for domain '" << domain << "' in '" << url << "'";
-	
-	LOG( logINFO ) 	<< ( res ? "Including " : "Excluding " )
-			<< "'" << url << "' "
-			<< "for domain '" << domain << "'";
-	
-	return res;
-}
diff --git a/src/DomainURLFilter.hpp b/src/DomainURLFilter.hpp
deleted file mode 100644
index 637ea67..0000000
--- a/src/DomainURLFilter.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef __DOMAIN_URLLFILTER_H
-#define __DOMAIN_URLFILTER_H
-
-#include "URLFilter.hpp"
-
-#include <set>
-
-class DomainURLFilter : public URLFilter
-{
-	public:
-		DomainURLFilter( const std::set<std::string> domains );
-				
-		virtual bool filter( const URL url );
-		
-	protected:
-		std::set<std::string> m_domains;
-};
-
-#endif
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 3d3d7b8..4abdd22 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -33,7 +33,7 @@ LOCAL_STATIC_LIB_OBJS = \
 	Deduper.o \
 	HTMLLinkExtractProcessor.o \
 	ProtocolURLFilter.o \
-	DomainURLFilter.o \
+	HostURLFilter.o \
 	ChainURLFilter.o \
 	MemoryURLSeen.o \
 	SimpleURLNormalizer.o
diff --git a/src/HostURLFilter.cpp b/src/HostURLFilter.cpp
new file mode 100644
index 0000000..181f001
--- /dev/null
+++ b/src/HostURLFilter.cpp
@@ -0,0 +1,21 @@
+#include "HostURLFilter.hpp"
+#include "Logger.hpp"
+
+HostURLFilter::HostURLFilter( const std::set<std::string> hosts )
+	: m_hosts( hosts )	// keep a copy of the accepted host names
+{
+}
+		
+bool HostURLFilter::filter( const URL url )
+{
+	// Accept the URL only when its host is one of the configured hosts.
+	const std::string hostName = url.host( );
+	LOG( logDEBUG )	<< "Checking for host '" << hostName << "' in '" << url << "'";
+
+	const bool accepted = ( m_hosts.count( hostName ) > 0 );
+	LOG( logINFO ) 	<< ( accepted ? "Including " : "Excluding " )
+			<< "'" << url << "' "
+			<< "for host '" << hostName << "'";
+
+	return accepted;
+}
diff --git a/src/HostURLFilter.hpp b/src/HostURLFilter.hpp
new file mode 100644
index 0000000..aa91e09
--- /dev/null
+++ b/src/HostURLFilter.hpp
@@ -0,0 +1,19 @@
+#ifndef __HOST_URLFILTER_H
+#define __HOST_URLFILTER_H
+
+#include "URLFilter.hpp"
+#include <set>
+
+// URL filter that accepts only URLs whose host is in a configured set.
+class HostURLFilter : public URLFilter
+{
+	public:
+		// Stores a copy of 'hosts' as the set of accepted host names.
+		HostURLFilter( const std::set<std::string> hosts );
+		// Returns true if the host of 'url' is one of the accepted hosts.
+		virtual bool filter( const URL url );
+	protected:
+		std::set<std::string> m_hosts;
+};
+
+#endif
diff --git a/src/SimpleURLNormalizer.cpp b/src/SimpleURLNormalizer.cpp
index a74d9e4..f07e39a 100644
--- a/src/SimpleURLNormalizer.cpp
+++ b/src/SimpleURLNormalizer.cpp
@@ -60,8 +60,9 @@ URL SimpleURLNormalizer::parseUrl( const string s )
 	string fragment;
 	
 	// TODO: query
+	string query;
 	
-	return URL( protocol, host, port, path, fragment );
+	return URL( protocol, host, port, path, query, fragment );
 }
 
 /*
@@ -88,21 +89,23 @@ Case normalization – convert all letter at scheme and authority components to
 Percent-encoded normalization – decode any percent-encoded octet that corresponds to unreserved character, such as %2D for hyphen and %5 for underscore
 Path segment normalization – remove dot-segments from the path component, such as ‘.’ and ‘..’
 
-Scheme-Based Normalization 
-Add trailing ‘/’ after the authority component of URL
-Remove default port number, such as 80 for http scheme 
-Truncate the fragment of URL
-
-Protocol-Based Normalization 
-Only appropriate when the results of accessing the resources are equivalent
-For example, example.com/data is directed to example.com/data/ by origin server
+domains:
+https://github.com/john-kurkowski/tldextract
+* http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url
+* http://stackoverflow.com/questions/2616011/easy-way-to-parse-a-url-in-c-cross-platform
+* 
  */
 
 URL SimpleURLNormalizer::normalize( const URL url, const string s )
 {
-	(void)url;
-	(void)s;
-	
-	return URL::Null;
+	// See if s is parseable on its own; if so it is an absolute URL. Use
+	// operator== here: URL::operator!= appears to AND its field tests (review).
+	URL absUrl = parseUrl( s );
+	if( !( absUrl == URL::Null ) ) {
+		return absUrl;
+	}
+	// Otherwise s is used verbatim as the path of a link relative to url;
+	// TODO: split s into path, query and fragment and append them cleverly
+	return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" );
 }
 
diff --git a/src/URL.hpp b/src/URL.hpp
index 58e1d0a..32b1501 100644
--- a/src/URL.hpp
+++ b/src/URL.hpp
@@ -13,21 +13,22 @@ class URL {
 		string m_host;
 		unsigned short m_port;
 		string m_path;
+		string m_query;
 		string m_fragment;
 		
 	public:
 		URL( )
-			: m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_fragment( "" )
+			: m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_query( "" ), m_fragment( "" )
 		{
 		}
 		
 		URL( const URL& url )
-			: m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_fragment( url.m_fragment )
+			: m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_query( url.m_query ), m_fragment( url.m_fragment )
 		{			
 		}
 		
-		URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _fragment )
-			: m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_fragment( _fragment )
+		URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _query, const std::string _fragment )
+			: m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_query( _query ), m_fragment( _fragment )
 		{
 		}
 		
@@ -37,6 +38,7 @@ class URL {
 				this->m_port = u.m_port;
 				this->m_host = u.m_host;
 				this->m_path = u.m_path;
+				this->m_query = u.m_query;
 				this->m_fragment = u.m_fragment;
 			}
 			return *this;
@@ -51,13 +53,7 @@ class URL {
 		{
 			return m_host;
 		}
-		
-		const string domain( ) const
-		{
-			// TODO: implement using heuristics and top level domain lists
-			return m_host;
-		}
-		
+				
 		unsigned short port( ) const
 		{
 			return m_port;
@@ -68,6 +64,11 @@ class URL {
 			return m_path;
 		}
 		
+		const string query( ) const
+		{
+			return m_query;
+		}
+		
 		std::string fragment( ) const
 		{
 			return m_fragment;
@@ -88,6 +89,7 @@ class URL {
 				m_host != other.m_host &&
 				m_port != other.m_port &&
 				m_path != other.m_path &&
+				m_query != other.m_query &&
 				m_fragment != other.m_fragment );
 		}
 
@@ -97,6 +99,7 @@ class URL {
 				m_host == other.m_host &&
 				m_port == other.m_port &&
 				m_path == other.m_path &&
+				m_query == other.m_query &&
 				m_fragment == other.m_fragment );
 		}
 
@@ -106,6 +109,7 @@ class URL {
 				m_host < other.m_host &&
 				m_port < other.m_port &&
 				m_path < other.m_path &&
+				m_query < other.m_query &&
 				m_fragment < other.m_fragment );
 		}
 
@@ -128,10 +132,17 @@ inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&
 	}
 	
 	s << u.protocol( ) << "://" << u.host( );
+
 	if( u.port( ) != URL::defaultPort( u.protocol( ) ) ) {
 		s << ":" << u.port( );
 	}
+	
 	s << u.path( );
+	
+	if( !u.query( ).empty( ) ) {
+		s << "?" << u.query( );
+	}
+	
 	if( !u.fragment( ).empty( ) ) {
 		 s << "#" << u.fragment( );
 	}
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index 0028624..2e82ccc 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -4,7 +4,7 @@
 #include "HTMLLinkExtractProcessor.hpp"
 #include "ChainURLFilter.hpp"
 #include "ProtocolURLFilter.hpp"
-#include "DomainURLFilter.hpp"
+#include "HostURLFilter.hpp"
 #include "MemoryURLSeen.hpp"
 #include "SimpleURLNormalizer.hpp"
 
@@ -26,11 +26,11 @@ int main( void )
 	protocols.insert( "https" );
 	ProtocolURLFilter protocolFilter( protocols );
 	
-	set<string> domains;
-	domains.insert( "www.andreasbaumann.cc" );
-	DomainURLFilter domainFilter( domains );
+	set<string> hosts;
+	hosts.insert( "www.andreasbaumann.cc" );
+	HostURLFilter hostFilter( hosts );
 	
-	ChainURLFilter filters( &protocolFilter, &domainFilter );
+	ChainURLFilter filters( &protocolFilter, &hostFilter );
 
 	URLNormalizer *normalizer = new SimpleURLNormalizer( );
author	Andreas Baumann <abaumann@yahoo.com>	2012-07-29 13:08:00 +0200
committer	Andreas Baumann <abaumann@yahoo.com>	2012-07-29 13:08:00 +0200
commit	a29f0c14ed938c399531b696e93208044e2c6e07 (patch)
tree	5f3bfcf4792ea843e57f30ac33e375aa934764b9 /src
parent	ae148a31b891c760eb08a6f9a2c279f6d7dd6ec2 (diff)
download	crawler-a29f0c14ed938c399531b696e93208044e2c6e07.tar.gz crawler-a29f0c14ed938c399531b696e93208044e2c6e07.tar.bz2