diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-07-29 13:08:00 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-07-29 13:08:00 +0200 |
commit | a29f0c14ed938c399531b696e93208044e2c6e07 (patch) | |
tree | 5f3bfcf4792ea843e57f30ac33e375aa934764b9 /src | |
parent | ae148a31b891c760eb08a6f9a2c279f6d7dd6ec2 (diff) | |
download | crawler-a29f0c14ed938c399531b696e93208044e2c6e07.tar.gz crawler-a29f0c14ed938c399531b696e93208044e2c6e07.tar.bz2 |
temporarily removed domain, domain filter is a host filter now
Diffstat (limited to 'src')
-rw-r--r-- | src/DomainURLFilter.cpp | 21 | ||||
-rw-r--r-- | src/DomainURLFilter.hpp | 19 | ||||
-rw-r--r-- | src/GNUmakefile | 2 | ||||
-rw-r--r-- | src/HostURLFilter.cpp | 21 | ||||
-rw-r--r-- | src/HostURLFilter.hpp | 19 | ||||
-rw-r--r-- | src/SimpleURLNormalizer.cpp | 29 | ||||
-rw-r--r-- | src/URL.hpp | 33 | ||||
-rw-r--r-- | src/crawlingwolf.cpp | 10 |
8 files changed, 84 insertions, 70 deletions
diff --git a/src/DomainURLFilter.cpp b/src/DomainURLFilter.cpp deleted file mode 100644 index 7eb6560..0000000 --- a/src/DomainURLFilter.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include "DomainURLFilter.hpp" -#include "Logger.hpp" - -DomainURLFilter::DomainURLFilter( const std::set<std::string> domains ) - : m_domains( domains ) -{ -} - -bool DomainURLFilter::filter( const URL url ) -{ - string domain = url.domain( ); - bool res = ( m_domains.find( domain ) != m_domains.end( ) ); - - LOG( logDEBUG ) << "Checking for domain '" << domain << "' in '" << url << "'"; - - LOG( logINFO ) << ( res ? "Including " : "Excluding " ) - << "'" << url << "' " - << "for domain '" << domain << "'"; - - return res; -} diff --git a/src/DomainURLFilter.hpp b/src/DomainURLFilter.hpp deleted file mode 100644 index 637ea67..0000000 --- a/src/DomainURLFilter.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __DOMAIN_URLLFILTER_H -#define __DOMAIN_URLFILTER_H - -#include "URLFilter.hpp" - -#include <set> - -class DomainURLFilter : public URLFilter -{ - public: - DomainURLFilter( const std::set<std::string> domains ); - - virtual bool filter( const URL url ); - - protected: - std::set<std::string> m_domains; -}; - -#endif diff --git a/src/GNUmakefile b/src/GNUmakefile index 3d3d7b8..4abdd22 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -33,7 +33,7 @@ LOCAL_STATIC_LIB_OBJS = \ Deduper.o \ HTMLLinkExtractProcessor.o \ ProtocolURLFilter.o \ - DomainURLFilter.o \ + HostURLFilter.o \ ChainURLFilter.o \ MemoryURLSeen.o \ SimpleURLNormalizer.o diff --git a/src/HostURLFilter.cpp b/src/HostURLFilter.cpp new file mode 100644 index 0000000..181f001 --- /dev/null +++ b/src/HostURLFilter.cpp @@ -0,0 +1,21 @@ +#include "HostURLFilter.hpp" +#include "Logger.hpp" + +HostURLFilter::HostURLFilter( const std::set<std::string> hosts ) + : m_hosts( hosts ) +{ +} + +bool HostURLFilter::filter( const URL url ) +{ + string host = url.host( ); + bool res = ( m_hosts.find( host ) != m_hosts.end( ) ); + + LOG( logDEBUG ) << "Checking for host '" << host << "' in '" << url << "'"; + + LOG( logINFO ) << ( res ? "Including " : "Excluding " ) + << "'" << url << "' " + << "for host '" << host << "'"; + + return res; +} diff --git a/src/HostURLFilter.hpp b/src/HostURLFilter.hpp new file mode 100644 index 0000000..aa91e09 --- /dev/null +++ b/src/HostURLFilter.hpp @@ -0,0 +1,19 @@ +#ifndef __HOST_URLLFILTER_H +#define __HOST_URLFILTER_H + +#include "URLFilter.hpp" + +#include <set> + +class HostURLFilter : public URLFilter +{ + public: + HostURLFilter( const std::set<std::string> hosts ); + + virtual bool filter( const URL url ); + + protected: + std::set<std::string> m_hosts; +}; + +#endif diff --git a/src/SimpleURLNormalizer.cpp b/src/SimpleURLNormalizer.cpp index a74d9e4..f07e39a 100644 --- a/src/SimpleURLNormalizer.cpp +++ b/src/SimpleURLNormalizer.cpp @@ -60,8 +60,9 @@ URL SimpleURLNormalizer::parseUrl( const string s ) string fragment; // TODO: query + string query; - return URL( protocol, host, port, path, fragment ); + return URL( protocol, host, port, path, query, fragment ); } /* @@ -88,21 +89,23 @@ Case normalization – convert all letter at scheme and authority components to Percent-encoded normalization – decode any percent-encoded octet that corresponds to unreserved character, such as %2D for hyphen and %5 for underscore Path segment normalization – remove dot-segments from the path component, such as ‘.’ and ‘..’ -Scheme-Based Normalization -Add trailing ‘/’ after the authority component of URL -Remove default port number, such as 80 for http scheme -Truncate the fragment of URL - -Protocol-Based Normalization -Only appropriate when the results of accessing the resources are equivalent -For example, example.com/data is directed to example.com/data/ by origin server +domains: +https://github.com/john-kurkowski/tldextract +* http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url +* http://stackoverflow.com/questions/2616011/easy-way-to-parse-a-url-in-c-cross-platform +* */ URL SimpleURLNormalizer::normalize( const URL url, const string s ) { - (void)url; - (void)s; - - return URL::Null; + // See if the URL is parseably, if so it is an absolute URL + URL absUrl = parseUrl( s ); + if( absUrl != URL::Null ) { + return absUrl; + } + + // relative links have path, query and fragment only, try to + // append them cleverly + return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" ); } diff --git a/src/URL.hpp b/src/URL.hpp index 58e1d0a..32b1501 100644 --- a/src/URL.hpp +++ b/src/URL.hpp @@ -13,21 +13,22 @@ class URL { string m_host; unsigned short m_port; string m_path; + string m_query; string m_fragment; public: URL( ) - : m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_fragment( "" ) + : m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_query( "" ), m_fragment( "" ) { } URL( const URL& url ) - : m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_fragment( url.m_fragment ) + : m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_query( url.m_query ), m_fragment( url.m_fragment ) { } - URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _fragment ) - : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_fragment( _fragment ) + URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _query, const std::string _fragment ) + : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_query( _query ), m_fragment( _fragment ) { } @@ -37,6 +38,7 @@ class URL { this->m_port = u.m_port; this->m_host = u.m_host; this->m_path = u.m_path; + this->m_query = u.m_query; this->m_fragment = u.m_fragment; } return *this; @@ -51,13 +53,7 @@ class URL { { return m_host; } - - const string domain( ) const - { - // TODO: implement using heuristics and top level domain lists - return m_host; - } - + unsigned short port( ) const { return m_port; @@ -68,6 +64,11 @@ class URL { return m_path; } + const string query( ) const + { + return m_query; + } + std::string fragment( ) const { return m_fragment; @@ -88,6 +89,7 @@ class URL { m_host != other.m_host && m_port != other.m_port && m_path != other.m_path && + m_query != other.m_query && m_fragment != other.m_fragment ); } @@ -97,6 +99,7 @@ class URL { m_host == other.m_host && m_port == other.m_port && m_path == other.m_path && + m_query == other.m_query && m_fragment == other.m_fragment ); } @@ -106,6 +109,7 @@ class URL { m_host < other.m_host && m_port < other.m_port && m_path < other.m_path && + m_query < other.m_query && m_fragment < other.m_fragment ); } @@ -128,10 +132,17 @@ inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>& } s << u.protocol( ) << "://" << u.host( ); + if( u.port( ) != URL::defaultPort( u.protocol( ) ) ) { s << ":" << u.port( ); } + s << u.path( ); + + if( !u.query( ).empty( ) ) { + s << "?" << u.query( ); + } + if( !u.fragment( ).empty( ) ) { s << "#" << u.fragment( ); } diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index 0028624..2e82ccc 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -4,7 +4,7 @@ #include "HTMLLinkExtractProcessor.hpp" #include "ChainURLFilter.hpp" #include "ProtocolURLFilter.hpp" -#include "DomainURLFilter.hpp" +#include "HostURLFilter.hpp" #include "MemoryURLSeen.hpp" #include "SimpleURLNormalizer.hpp" @@ -26,11 +26,11 @@ int main( void ) protocols.insert( "https" ); ProtocolURLFilter protocolFilter( protocols ); - set<string> domains; - domains.insert( "www.andreasbaumann.cc" ); - DomainURLFilter domainFilter( domains ); + set<string> hosts; + hosts.insert( "www.andreasbaumann.cc" ); + HostURLFilter hostFilter( hosts ); - ChainURLFilter filters( &protocolFilter, &domainFilter ); + ChainURLFilter filters( &protocolFilter, &hostFilter ); URLNormalizer *normalizer = new SimpleURLNormalizer( ); |