summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-07-29 13:08:00 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-07-29 13:08:00 +0200
commita29f0c14ed938c399531b696e93208044e2c6e07 (patch)
tree5f3bfcf4792ea843e57f30ac33e375aa934764b9
parentae148a31b891c760eb08a6f9a2c279f6d7dd6ec2 (diff)
downloadcrawler-a29f0c14ed938c399531b696e93208044e2c6e07.tar.gz
crawler-a29f0c14ed938c399531b696e93208044e2c6e07.tar.bz2
temporarily removed domain, domain filter is a host filter now
-rw-r--r--src/DomainURLFilter.cpp21
-rw-r--r--src/DomainURLFilter.hpp19
-rw-r--r--src/GNUmakefile2
-rw-r--r--src/HostURLFilter.cpp21
-rw-r--r--src/HostURLFilter.hpp19
-rw-r--r--src/SimpleURLNormalizer.cpp29
-rw-r--r--src/URL.hpp33
-rw-r--r--src/crawlingwolf.cpp10
-rw-r--r--tests/url/test1.cpp3
-rw-r--r--tests/url/test2.MUST3
-rw-r--r--tests/url/test2.cpp8
-rw-r--r--tests/url/test3.MUST3
-rw-r--r--tests/url/test4.MUST3
-rw-r--r--tests/url/test5.MUST3
-rw-r--r--tests/url/test6.MUST3
15 files changed, 101 insertions, 79 deletions
diff --git a/src/DomainURLFilter.cpp b/src/DomainURLFilter.cpp
deleted file mode 100644
index 7eb6560..0000000
--- a/src/DomainURLFilter.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include "DomainURLFilter.hpp"
-#include "Logger.hpp"
-
-DomainURLFilter::DomainURLFilter( const std::set<std::string> domains )
- : m_domains( domains )
-{
-}
-
-bool DomainURLFilter::filter( const URL url )
-{
- string domain = url.domain( );
- bool res = ( m_domains.find( domain ) != m_domains.end( ) );
-
- LOG( logDEBUG ) << "Checking for domain '" << domain << "' in '" << url << "'";
-
- LOG( logINFO ) << ( res ? "Including " : "Excluding " )
- << "'" << url << "' "
- << "for domain '" << domain << "'";
-
- return res;
-}
diff --git a/src/DomainURLFilter.hpp b/src/DomainURLFilter.hpp
deleted file mode 100644
index 637ea67..0000000
--- a/src/DomainURLFilter.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef __DOMAIN_URLLFILTER_H
-#define __DOMAIN_URLFILTER_H
-
-#include "URLFilter.hpp"
-
-#include <set>
-
-class DomainURLFilter : public URLFilter
-{
- public:
- DomainURLFilter( const std::set<std::string> domains );
-
- virtual bool filter( const URL url );
-
- protected:
- std::set<std::string> m_domains;
-};
-
-#endif
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 3d3d7b8..4abdd22 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -33,7 +33,7 @@ LOCAL_STATIC_LIB_OBJS = \
Deduper.o \
HTMLLinkExtractProcessor.o \
ProtocolURLFilter.o \
- DomainURLFilter.o \
+ HostURLFilter.o \
ChainURLFilter.o \
MemoryURLSeen.o \
SimpleURLNormalizer.o
diff --git a/src/HostURLFilter.cpp b/src/HostURLFilter.cpp
new file mode 100644
index 0000000..181f001
--- /dev/null
+++ b/src/HostURLFilter.cpp
@@ -0,0 +1,21 @@
+#include "HostURLFilter.hpp"
+#include "Logger.hpp"
+
+// Keeps its own copy of the host names that filter( ) will accept.
+HostURLFilter::HostURLFilter( const std::set<std::string> hosts )
+ : m_hosts( hosts )
+{ }
+
+// Returns true when the host of 'url' is in m_hosts; logs the decision.
+bool HostURLFilter::filter( const URL url )
+{
+ const string hostName = url.host( );
+ const bool accepted = m_hosts.count( hostName ) > 0;
+
+ LOG( logDEBUG ) << "Checking for host '" << hostName << "' in '" << url << "'";
+ LOG( logINFO ) << ( accepted ? "Including " : "Excluding " )
+ << "'" << url << "' "
+ << "for host '" << hostName << "'";
+
+ return accepted;
+}
diff --git a/src/HostURLFilter.hpp b/src/HostURLFilter.hpp
new file mode 100644
index 0000000..aa91e09
--- /dev/null
+++ b/src/HostURLFilter.hpp
@@ -0,0 +1,19 @@
+#ifndef __HOST_URLFILTER_H
+#define __HOST_URLFILTER_H
+
+#include "URLFilter.hpp"
+
+#include <set>
+// URL filter that checks a URL's host against a fixed set of host names.
+class HostURLFilter : public URLFilter
+{
+ public:
+ HostURLFilter( const std::set<std::string> hosts ); // hosts to accept
+ // Returns true if url.host( ) is contained in the configured set.
+ virtual bool filter( const URL url );
+
+ protected:
+ std::set<std::string> m_hosts; // accepted host names
+};
+
+#endif // __HOST_URLFILTER_H
diff --git a/src/SimpleURLNormalizer.cpp b/src/SimpleURLNormalizer.cpp
index a74d9e4..f07e39a 100644
--- a/src/SimpleURLNormalizer.cpp
+++ b/src/SimpleURLNormalizer.cpp
@@ -60,8 +60,9 @@ URL SimpleURLNormalizer::parseUrl( const string s )
string fragment;
// TODO: query
+ string query;
- return URL( protocol, host, port, path, fragment );
+ return URL( protocol, host, port, path, query, fragment );
}
/*
@@ -88,21 +89,23 @@ Case normalization – convert all letter at scheme and authority components to
Percent-encoded normalization – decode any percent-encoded octet that corresponds to unreserved character, such as %2D for hyphen and %5 for underscore
Path segment normalization – remove dot-segments from the path component, such as ‘.’ and ‘..’
-Scheme-Based Normalization
-Add trailing ‘/’ after the authority component of URL
-Remove default port number, such as 80 for http scheme
-Truncate the fragment of URL
-
-Protocol-Based Normalization
-Only appropriate when the results of accessing the resources are equivalent
-For example, example.com/data is directed to example.com/data/ by origin server
+domains:
+https://github.com/john-kurkowski/tldextract
+* http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url
+* http://stackoverflow.com/questions/2616011/easy-way-to-parse-a-url-in-c-cross-platform
+*
*/
URL SimpleURLNormalizer::normalize( const URL url, const string s )
{
- (void)url;
- (void)s;
-
- return URL::Null;
+ // See if the URL is parseable; if so, it is an absolute URL
+ URL absUrl = parseUrl( s );
+ if( absUrl != URL::Null ) {
+ return absUrl;
+ }
+
+ // relative links have path, query and fragment only, try to
+ // append them cleverly
+ return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" );
}
diff --git a/src/URL.hpp b/src/URL.hpp
index 58e1d0a..32b1501 100644
--- a/src/URL.hpp
+++ b/src/URL.hpp
@@ -13,21 +13,22 @@ class URL {
string m_host;
unsigned short m_port;
string m_path;
+ string m_query;
string m_fragment;
public:
URL( )
- : m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_fragment( "" )
+ : m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_query( "" ), m_fragment( "" )
{
}
URL( const URL& url )
- : m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_fragment( url.m_fragment )
+ : m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_query( url.m_query ), m_fragment( url.m_fragment )
{
}
- URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _fragment )
- : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_fragment( _fragment )
+ URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _query, const std::string _fragment )
+ : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_query( _query ), m_fragment( _fragment )
{
}
@@ -37,6 +38,7 @@ class URL {
this->m_port = u.m_port;
this->m_host = u.m_host;
this->m_path = u.m_path;
+ this->m_query = u.m_query;
this->m_fragment = u.m_fragment;
}
return *this;
@@ -51,13 +53,7 @@ class URL {
{
return m_host;
}
-
- const string domain( ) const
- {
- // TODO: implement using heuristics and top level domain lists
- return m_host;
- }
-
+
unsigned short port( ) const
{
return m_port;
@@ -68,6 +64,11 @@ class URL {
return m_path;
}
+ const string query( ) const
+ {
+ return m_query;
+ }
+
std::string fragment( ) const
{
return m_fragment;
@@ -88,6 +89,7 @@ class URL {
m_host != other.m_host &&
m_port != other.m_port &&
m_path != other.m_path &&
+ m_query != other.m_query &&
m_fragment != other.m_fragment );
}
@@ -97,6 +99,7 @@ class URL {
m_host == other.m_host &&
m_port == other.m_port &&
m_path == other.m_path &&
+ m_query == other.m_query &&
m_fragment == other.m_fragment );
}
@@ -106,6 +109,7 @@ class URL {
m_host < other.m_host &&
m_port < other.m_port &&
m_path < other.m_path &&
+ m_query < other.m_query &&
m_fragment < other.m_fragment );
}
@@ -128,10 +132,17 @@ inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&
}
s << u.protocol( ) << "://" << u.host( );
+
if( u.port( ) != URL::defaultPort( u.protocol( ) ) ) {
s << ":" << u.port( );
}
+
s << u.path( );
+
+ if( !u.query( ).empty( ) ) {
+ s << "?" << u.query( );
+ }
+
if( !u.fragment( ).empty( ) ) {
s << "#" << u.fragment( );
}
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index 0028624..2e82ccc 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -4,7 +4,7 @@
#include "HTMLLinkExtractProcessor.hpp"
#include "ChainURLFilter.hpp"
#include "ProtocolURLFilter.hpp"
-#include "DomainURLFilter.hpp"
+#include "HostURLFilter.hpp"
#include "MemoryURLSeen.hpp"
#include "SimpleURLNormalizer.hpp"
@@ -26,11 +26,11 @@ int main( void )
protocols.insert( "https" );
ProtocolURLFilter protocolFilter( protocols );
- set<string> domains;
- domains.insert( "www.andreasbaumann.cc" );
- DomainURLFilter domainFilter( domains );
+ set<string> hosts;
+ hosts.insert( "www.andreasbaumann.cc" );
+ HostURLFilter hostFilter( hosts );
- ChainURLFilter filters( &protocolFilter, &domainFilter );
+ ChainURLFilter filters( &protocolFilter, &hostFilter );
URLNormalizer *normalizer = new SimpleURLNormalizer( );
diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp
index ac6a086..23c7d74 100644
--- a/tests/url/test1.cpp
+++ b/tests/url/test1.cpp
@@ -25,9 +25,10 @@ int main( int argc, char *argv[] )
}
cout << "protocol: " << url.protocol( ) << endl
+ << "host: " << url.host( ) << endl
<< "port: " << url.port( ) << endl
- << "domain: " << url.domain( ) << endl
<< "path: " << url.path( ) << endl
+ << "query: " << url.query( ) << endl
<< "fragment: " << url.fragment( ) << endl;
cout << "URL: " << url << endl;
diff --git a/tests/url/test2.MUST b/tests/url/test2.MUST
index 3172868..92158a6 100644
--- a/tests/url/test2.MUST
+++ b/tests/url/test2.MUST
@@ -1,6 +1,7 @@
protocol: http
+host: www.andreasbaumann.cc
port: 80
-domain: www.andreasbaumann.cc
path: /
+query:
fragment:
URL: http://www.andreasbaumann.cc/
diff --git a/tests/url/test2.cpp b/tests/url/test2.cpp
index 7bf135a..4b6aa0d 100644
--- a/tests/url/test2.cpp
+++ b/tests/url/test2.cpp
@@ -23,11 +23,13 @@ int main( int argc, char *argv[] )
URL url = normalizer->normalize( baseUrl, partialUrlString );
cout << "protocol: " << url.protocol( ) << endl
+ << "host: " << url.host( ) << endl
<< "port: " << url.port( ) << endl
- << "domain: " << url.domain( ) << endl
<< "path: " << url.path( ) << endl
- << "fragment: " << url.fragment( ) << endl
- << endl;
+ << "query: " << url.query( ) << endl
+ << "fragment: " << url.fragment( ) << endl;
+
+ cout << "URL: " << url << endl;
delete normalizer;
diff --git a/tests/url/test3.MUST b/tests/url/test3.MUST
index 3172868..92158a6 100644
--- a/tests/url/test3.MUST
+++ b/tests/url/test3.MUST
@@ -1,6 +1,7 @@
protocol: http
+host: www.andreasbaumann.cc
port: 80
-domain: www.andreasbaumann.cc
path: /
+query:
fragment:
URL: http://www.andreasbaumann.cc/
diff --git a/tests/url/test4.MUST b/tests/url/test4.MUST
index f7d1220..0649e10 100644
--- a/tests/url/test4.MUST
+++ b/tests/url/test4.MUST
@@ -1,6 +1,7 @@
protocol: http
+host: www.andreasbaumann.cc
port: 80
-domain: www.andreasbaumann.cc
path: /index.html
+query:
fragment:
URL: http://www.andreasbaumann.cc/index.html
diff --git a/tests/url/test5.MUST b/tests/url/test5.MUST
index f7d1220..0649e10 100644
--- a/tests/url/test5.MUST
+++ b/tests/url/test5.MUST
@@ -1,6 +1,7 @@
protocol: http
+host: www.andreasbaumann.cc
port: 80
-domain: www.andreasbaumann.cc
path: /index.html
+query:
fragment:
URL: http://www.andreasbaumann.cc/index.html
diff --git a/tests/url/test6.MUST b/tests/url/test6.MUST
index 5ed0d82..de9b556 100644
--- a/tests/url/test6.MUST
+++ b/tests/url/test6.MUST
@@ -1,6 +1,7 @@
protocol: http
+host: www.andreasbaumann.cc
port: 8080
-domain: www.andreasbaumann.cc
path: /index.html
+query:
fragment:
URL: http://www.andreasbaumann.cc:8080/index.html