From a29f0c14ed938c399531b696e93208044e2c6e07 Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Sun, 29 Jul 2012 13:08:00 +0200 Subject: temporarily removed domain, domain filter is a host filter now --- src/DomainURLFilter.cpp | 21 --------------------- src/DomainURLFilter.hpp | 19 ------------------- src/GNUmakefile | 2 +- src/HostURLFilter.cpp | 21 +++++++++++++++++++++ src/HostURLFilter.hpp | 19 +++++++++++++++++++ src/SimpleURLNormalizer.cpp | 29 ++++++++++++++++------------- src/URL.hpp | 33 ++++++++++++++++++++++----------- src/crawlingwolf.cpp | 10 +++++----- tests/url/test1.cpp | 3 ++- tests/url/test2.MUST | 3 ++- tests/url/test2.cpp | 8 +++++--- tests/url/test3.MUST | 3 ++- tests/url/test4.MUST | 3 ++- tests/url/test5.MUST | 3 ++- tests/url/test6.MUST | 3 ++- 15 files changed, 101 insertions(+), 79 deletions(-) delete mode 100644 src/DomainURLFilter.cpp delete mode 100644 src/DomainURLFilter.hpp create mode 100644 src/HostURLFilter.cpp create mode 100644 src/HostURLFilter.hpp diff --git a/src/DomainURLFilter.cpp b/src/DomainURLFilter.cpp deleted file mode 100644 index 7eb6560..0000000 --- a/src/DomainURLFilter.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include "DomainURLFilter.hpp" -#include "Logger.hpp" - -DomainURLFilter::DomainURLFilter( const std::set domains ) - : m_domains( domains ) -{ -} - -bool DomainURLFilter::filter( const URL url ) -{ - string domain = url.domain( ); - bool res = ( m_domains.find( domain ) != m_domains.end( ) ); - - LOG( logDEBUG ) << "Checking for domain '" << domain << "' in '" << url << "'"; - - LOG( logINFO ) << ( res ? 
"Including " : "Excluding " ) - << "'" << url << "' " - << "for domain '" << domain << "'"; - - return res; -} diff --git a/src/DomainURLFilter.hpp b/src/DomainURLFilter.hpp deleted file mode 100644 index 637ea67..0000000 --- a/src/DomainURLFilter.hpp +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __DOMAIN_URLLFILTER_H -#define __DOMAIN_URLFILTER_H - -#include "URLFilter.hpp" - -#include - -class DomainURLFilter : public URLFilter -{ - public: - DomainURLFilter( const std::set domains ); - - virtual bool filter( const URL url ); - - protected: - std::set m_domains; -}; - -#endif diff --git a/src/GNUmakefile b/src/GNUmakefile index 3d3d7b8..4abdd22 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -33,7 +33,7 @@ LOCAL_STATIC_LIB_OBJS = \ Deduper.o \ HTMLLinkExtractProcessor.o \ ProtocolURLFilter.o \ - DomainURLFilter.o \ + HostURLFilter.o \ ChainURLFilter.o \ MemoryURLSeen.o \ SimpleURLNormalizer.o diff --git a/src/HostURLFilter.cpp b/src/HostURLFilter.cpp new file mode 100644 index 0000000..181f001 --- /dev/null +++ b/src/HostURLFilter.cpp @@ -0,0 +1,21 @@ +#include "HostURLFilter.hpp" +#include "Logger.hpp" + +HostURLFilter::HostURLFilter( const std::set hosts ) + : m_hosts( hosts ) +{ +} + +bool HostURLFilter::filter( const URL url ) +{ + string host = url.host( ); + bool res = ( m_hosts.find( host ) != m_hosts.end( ) ); + + LOG( logDEBUG ) << "Checking for host '" << host << "' in '" << url << "'"; + + LOG( logINFO ) << ( res ? 
"Including " : "Excluding " ) + << "'" << url << "' " + << "for host '" << host << "'"; + + return res; +} diff --git a/src/HostURLFilter.hpp b/src/HostURLFilter.hpp new file mode 100644 index 0000000..aa91e09 --- /dev/null +++ b/src/HostURLFilter.hpp @@ -0,0 +1,19 @@ +#ifndef __HOST_URLLFILTER_H +#define __HOST_URLFILTER_H + +#include "URLFilter.hpp" + +#include + +class HostURLFilter : public URLFilter +{ + public: + HostURLFilter( const std::set hosts ); + + virtual bool filter( const URL url ); + + protected: + std::set m_hosts; +}; + +#endif diff --git a/src/SimpleURLNormalizer.cpp b/src/SimpleURLNormalizer.cpp index a74d9e4..f07e39a 100644 --- a/src/SimpleURLNormalizer.cpp +++ b/src/SimpleURLNormalizer.cpp @@ -60,8 +60,9 @@ URL SimpleURLNormalizer::parseUrl( const string s ) string fragment; // TODO: query + string query; - return URL( protocol, host, port, path, fragment ); + return URL( protocol, host, port, path, query, fragment ); } /* @@ -88,21 +89,23 @@ Case normalization – convert all letter at scheme and authority components to Percent-encoded normalization – decode any percent-encoded octet that corresponds to unreserved character, such as %2D for hyphen and %5 for underscore Path segment normalization – remove dot-segments from the path component, such as ‘.’ and ‘..’ -Scheme-Based Normalization -Add trailing ‘/’ after the authority component of URL -Remove default port number, such as 80 for http scheme -Truncate the fragment of URL - -Protocol-Based Normalization -Only appropriate when the results of accessing the resources are equivalent -For example, example.com/data is directed to example.com/data/ by origin server +domains: +https://github.com/john-kurkowski/tldextract +* http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url +* http://stackoverflow.com/questions/2616011/easy-way-to-parse-a-url-in-c-cross-platform +* */ URL SimpleURLNormalizer::normalize( const URL url, const string s ) { - (void)url; - (void)s; - - return 
URL::Null; + // See if the URL is parseable, if so it is an absolute URL + URL absUrl = parseUrl( s ); + if( absUrl != URL::Null ) { + return absUrl; + } + + // relative links have path, query and fragment only, try to + // append them cleverly + return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" ); } diff --git a/src/URL.hpp b/src/URL.hpp index 58e1d0a..32b1501 100644 --- a/src/URL.hpp +++ b/src/URL.hpp @@ -13,21 +13,22 @@ class URL { string m_host; unsigned short m_port; string m_path; + string m_query; string m_fragment; public: URL( ) - : m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_fragment( "" ) + : m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_query( "" ), m_fragment( "" ) { } URL( const URL& url ) - : m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_fragment( url.m_fragment ) + : m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_query( url.m_query ), m_fragment( url.m_fragment ) { } - URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _fragment ) - : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_fragment( _fragment ) + URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _query, const std::string _fragment ) + : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_query( _query ), m_fragment( _fragment ) { } @@ -37,6 +38,7 @@ class URL { this->m_port = u.m_port; this->m_host = u.m_host; this->m_path = u.m_path; + this->m_query = u.m_query; this->m_fragment = u.m_fragment; } return *this; @@ -51,13 +53,7 @@ class URL { { return m_host; } - - const string domain( ) const - { - // TODO: implement using heuristics and top level domain lists - return m_host; - } - + unsigned short port( ) const { return m_port; @@ 
-68,6 +64,11 @@ class URL { return m_path; } + const string query( ) const + { + return m_query; + } + std::string fragment( ) const { return m_fragment; @@ -88,6 +89,7 @@ class URL { m_host != other.m_host && m_port != other.m_port && m_path != other.m_path && + m_query != other.m_query && m_fragment != other.m_fragment ); } @@ -97,6 +99,7 @@ class URL { m_host == other.m_host && m_port == other.m_port && m_path == other.m_path && + m_query == other.m_query && m_fragment == other.m_fragment ); } @@ -106,6 +109,7 @@ class URL { m_host < other.m_host && m_port < other.m_port && m_path < other.m_path && + m_query < other.m_query && m_fragment < other.m_fragment ); } @@ -128,10 +132,17 @@ inline basic_ostream& operator<<( basic_ostream& } s << u.protocol( ) << "://" << u.host( ); + if( u.port( ) != URL::defaultPort( u.protocol( ) ) ) { s << ":" << u.port( ); } + s << u.path( ); + + if( !u.query( ).empty( ) ) { + s << "?" << u.query( ); + } + if( !u.fragment( ).empty( ) ) { s << "#" << u.fragment( ); } diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index 0028624..2e82ccc 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -4,7 +4,7 @@ #include "HTMLLinkExtractProcessor.hpp" #include "ChainURLFilter.hpp" #include "ProtocolURLFilter.hpp" -#include "DomainURLFilter.hpp" +#include "HostURLFilter.hpp" #include "MemoryURLSeen.hpp" #include "SimpleURLNormalizer.hpp" @@ -26,11 +26,11 @@ int main( void ) protocols.insert( "https" ); ProtocolURLFilter protocolFilter( protocols ); - set domains; - domains.insert( "www.andreasbaumann.cc" ); - DomainURLFilter domainFilter( domains ); + set hosts; + hosts.insert( "www.andreasbaumann.cc" ); + HostURLFilter hostFilter( hosts ); - ChainURLFilter filters( &protocolFilter, &domainFilter ); + ChainURLFilter filters( &protocolFilter, &hostFilter ); URLNormalizer *normalizer = new SimpleURLNormalizer( ); diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp index ac6a086..23c7d74 100644 --- a/tests/url/test1.cpp +++ 
b/tests/url/test1.cpp @@ -25,9 +25,10 @@ int main( int argc, char *argv[] ) } cout << "protocol: " << url.protocol( ) << endl + << "host: " << url.host( ) << endl << "port: " << url.port( ) << endl - << "domain: " << url.domain( ) << endl << "path: " << url.path( ) << endl + << "query: " << url.query( ) << endl << "fragment: " << url.fragment( ) << endl; cout << "URL: " << url << endl; diff --git a/tests/url/test2.MUST b/tests/url/test2.MUST index 3172868..92158a6 100644 --- a/tests/url/test2.MUST +++ b/tests/url/test2.MUST @@ -1,6 +1,7 @@ protocol: http +host: www.andreasbaumann.cc port: 80 -domain: www.andreasbaumann.cc path: / +query: fragment: URL: http://www.andreasbaumann.cc/ diff --git a/tests/url/test2.cpp b/tests/url/test2.cpp index 7bf135a..4b6aa0d 100644 --- a/tests/url/test2.cpp +++ b/tests/url/test2.cpp @@ -23,11 +23,13 @@ int main( int argc, char *argv[] ) URL url = normalizer->normalize( baseUrl, partialUrlString ); cout << "protocol: " << url.protocol( ) << endl + << "host: " << url.host( ) << endl << "port: " << url.port( ) << endl - << "domain: " << url.domain( ) << endl << "path: " << url.path( ) << endl - << "fragment: " << url.fragment( ) << endl - << endl; + << "query: " << url.query( ) << endl + << "fragment: " << url.fragment( ) << endl; + + cout << "URL: " << url << endl; delete normalizer; diff --git a/tests/url/test3.MUST b/tests/url/test3.MUST index 3172868..92158a6 100644 --- a/tests/url/test3.MUST +++ b/tests/url/test3.MUST @@ -1,6 +1,7 @@ protocol: http +host: www.andreasbaumann.cc port: 80 -domain: www.andreasbaumann.cc path: / +query: fragment: URL: http://www.andreasbaumann.cc/ diff --git a/tests/url/test4.MUST b/tests/url/test4.MUST index f7d1220..0649e10 100644 --- a/tests/url/test4.MUST +++ b/tests/url/test4.MUST @@ -1,6 +1,7 @@ protocol: http +host: www.andreasbaumann.cc port: 80 -domain: www.andreasbaumann.cc path: /index.html +query: fragment: URL: http://www.andreasbaumann.cc/index.html diff --git a/tests/url/test5.MUST 
b/tests/url/test5.MUST index f7d1220..0649e10 100644 --- a/tests/url/test5.MUST +++ b/tests/url/test5.MUST @@ -1,6 +1,7 @@ protocol: http +host: www.andreasbaumann.cc port: 80 -domain: www.andreasbaumann.cc path: /index.html +query: fragment: URL: http://www.andreasbaumann.cc/index.html diff --git a/tests/url/test6.MUST b/tests/url/test6.MUST index 5ed0d82..de9b556 100644 --- a/tests/url/test6.MUST +++ b/tests/url/test6.MUST @@ -1,6 +1,7 @@ protocol: http +host: www.andreasbaumann.cc port: 8080 -domain: www.andreasbaumann.cc path: /index.html +query: fragment: URL: http://www.andreasbaumann.cc:8080/index.html -- cgit v1.2.3-54-g00ecf