diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-07-15 19:51:01 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-07-15 19:51:01 +0200 |
commit | 69f32e239eafcfdc2392a8644c6de6ca6dbe83c1 (patch) | |
tree | 1f1776b2317df13f3a02069ca8b2ebbfb2ab2f80 | |
parent | bcb4f8f59bc1c522d2c677d8cffb4adbff45aa26 (diff) | |
download | crawler-69f32e239eafcfdc2392a8644c6de6ca6dbe83c1.tar.gz crawler-69f32e239eafcfdc2392a8644c6de6ca6dbe83c1.tar.bz2 |
started to add URL filters
-rw-r--r-- | makefiles/gmake/help.mk | 8 | ||||
-rw-r--r-- | src/ChainURLFilter.cpp | 38 | ||||
-rw-r--r-- | src/ChainURLFilter.hpp | 22 | ||||
-rw-r--r-- | src/DomainURLFilter.cpp | 11 | ||||
-rw-r--r-- | src/DomainURLFilter.hpp | 19 | ||||
-rw-r--r-- | src/GNUmakefile | 5 | ||||
-rw-r--r-- | src/HTMLLinkExtractProcessor.cpp | 30 | ||||
-rw-r--r-- | src/HTMLLinkExtractProcessor.hpp | 4 | ||||
-rw-r--r-- | src/ProtocolURLFilter.cpp | 11 | ||||
-rw-r--r-- | src/ProtocolURLFilter.hpp | 19 | ||||
-rw-r--r-- | src/RewindInputStream.hpp | 5 | ||||
-rw-r--r-- | src/URL.hpp | 20 | ||||
-rw-r--r-- | src/URLFilter.hpp | 5 | ||||
-rw-r--r-- | src/crawlingwolf.cpp | 23 |
14 files changed, 208 insertions, 12 deletions
diff --git a/makefiles/gmake/help.mk b/makefiles/gmake/help.mk index 385d99f..2904c39 100644 --- a/makefiles/gmake/help.mk +++ b/makefiles/gmake/help.mk @@ -36,10 +36,16 @@ WITH_LOCAL_STREAMHTMLPARSER=1 use Google stream HTML 4 parser WITH_LIBXML2=1 build the libxml2 parser +scripting support: + +WITH_LUA=1 use Lua for configuration and scripting + Some more obscure options: ENABLE_NLS=0 Don't build gettext NLS support (default is on) Example: make WITH_SSL=1 WITH_SQLITE3=1 WITH_PGSQL=1 \ - WITH_LOCAL_LIBFETCH=1 WITH_LIBXML2=1 + WITH_LOCAL_LIBFETCH=1 WITH_LIBXML2=1 \ + WITH_LUA=1 + diff --git a/src/ChainURLFilter.cpp b/src/ChainURLFilter.cpp new file mode 100644 index 0000000..e367c14 --- /dev/null +++ b/src/ChainURLFilter.cpp @@ -0,0 +1,38 @@ +#include "ChainURLFilter.hpp" + +ChainURLFilter::ChainURLFilter( ) + : m_filters( ) +{ +} + +ChainURLFilter::ChainURLFilter( URLFilter *f1 ) + : m_filters( ) +{ + m_filters.push_back( f1 ); +} + +ChainURLFilter::ChainURLFilter( URLFilter *f1, URLFilter *f2 ) + : m_filters( ) +{ + m_filters.push_back( f1 ); + m_filters.push_back( f2 ); +} + +ChainURLFilter::ChainURLFilter( URLFilter *f1, URLFilter *f2, URLFilter *f3 ) + : m_filters( ) +{ + m_filters.push_back( f1 ); + m_filters.push_back( f2 ); + m_filters.push_back( f3 ); +} + +bool ChainURLFilter::filter( const URL &url ) +{ + list<URLFilter *>::const_iterator it; + + for( it = m_filters.begin( ); it != m_filters.end( ); it++ ) { + if( !( (*it)->filter( url ) ) ) return false; + } + + return true; +} diff --git a/src/ChainURLFilter.hpp b/src/ChainURLFilter.hpp new file mode 100644 index 0000000..d6b2580 --- /dev/null +++ b/src/ChainURLFilter.hpp @@ -0,0 +1,22 @@ +#ifndef __CHAIN_URLFILTER_H +#define __CHAIN_URLFILTER_H + +#include "URLFilter.hpp" + +#include <list> + +class ChainURLFilter : public URLFilter +{ + public: + ChainURLFilter( ); + ChainURLFilter( URLFilter *f1 ); + ChainURLFilter( URLFilter *f1, URLFilter *f2 ); + ChainURLFilter( URLFilter *f1, URLFilter *f2, URLFilter *f3 ); + + virtual bool filter( const URL &url ); + + protected: + std::list<URLFilter *> m_filters; +}; + +#endif diff --git a/src/DomainURLFilter.cpp b/src/DomainURLFilter.cpp new file mode 100644 index 0000000..5f42de2 --- /dev/null +++ b/src/DomainURLFilter.cpp @@ -0,0 +1,11 @@ +#include "DomainURLFilter.hpp" + +DomainURLFilter::DomainURLFilter( const std::set<std::string> domains ) + : m_domains( domains ) +{ +} + +bool DomainURLFilter::filter( const URL &url ) +{ + return( m_domains.find( url.domain( ) ) != m_domains.end( ) ); +} diff --git a/src/DomainURLFilter.hpp b/src/DomainURLFilter.hpp new file mode 100644 index 0000000..76dbc73 --- /dev/null +++ b/src/DomainURLFilter.hpp @@ -0,0 +1,19 @@ +#ifndef __DOMAIN_URLLFILTER_H +#define __DOMAIN_URLFILTER_H + +#include "URLFilter.hpp" + +#include <set> + +class DomainURLFilter : public URLFilter +{ + public: + DomainURLFilter( const std::set<std::string> domains ); + + virtual bool filter( const URL &url ); + + protected: + std::set<std::string> m_domains; +}; + +#endif diff --git a/src/GNUmakefile b/src/GNUmakefile index 8108a0c..7ef583b 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -33,7 +33,10 @@ CPP_OBJS = \ LibFetchRewindInputStream.o \ Frontier.o \ Deduper.o \ - HTMLLinkExtractProcessor.o + HTMLLinkExtractProcessor.o \ + ProtocolURLFilter.o \ + DomainURLFilter.o \ + ChainURLFilter.o CPP_BINS = \ crawlingwolf$(EXE) diff --git a/src/HTMLLinkExtractProcessor.cpp b/src/HTMLLinkExtractProcessor.cpp index e956017..17a7b20 100644 --- a/src/HTMLLinkExtractProcessor.cpp +++ b/src/HTMLLinkExtractProcessor.cpp @@ -7,8 +7,8 @@ using namespace std; using namespace streamhtmlparser; -HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier ) - : m_frontier( frontier ), m_parser( ) +HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter ) + : m_frontier( frontier ), m_filter( filter ), m_parser( ) { } @@ -27,17 +27,35 @@ void HTMLLinkExtractProcessor::process( RewindInputStream *s ) m_parser.Parse( buf, 1 ); if( m_parser.state( ) == HtmlParser::STATE_VALUE && m_parser.tag( ) != NULL && m_parser.attribute( ) != NULL && m_parser.value( ) != NULL ) { - if( strcmp( m_parser.tag( ), "a" ) == 0 && strcmp( m_parser.attribute( ), "href" ) == 0 ) { + if( strcmp( m_parser.tag( ), "base" ) == 0 && + strcmp( m_parser.attribute( ), "href" ) == 0 ) { + s->setBaseUrl( string( m_parser.value( ) ) ); + } + if( ( ( strcmp( m_parser.tag( ), "a" ) == 0 || + strcmp( m_parser.tag( ), "area" ) == 0 || + strcmp( m_parser.tag( ), "link" ) == 0 ) && + strcmp( m_parser.attribute( ), "href" ) == 0 ) || + ( ( strcmp( m_parser.tag( ), "img" ) == 0 || + strcmp( m_parser.tag( ), "frame" ) == 0 || + strcmp( m_parser.tag( ), "iframe" ) == 0 || + strcmp( m_parser.tag( ), "embed" ) == 0 ) && + strcmp( m_parser.attribute( ), "src" ) == 0 ) + ) { link = m_parser.value( ); in_link = true; } } else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) { - if( link.substr( 0, 7 ) == "http://" ) { - m_frontier->addUrl( link ); + if( link.substr( 0, 7 ) == "http://" || + link.substr( 0, 8 ) == "https://" ) { + if( m_filter->filter( link ) ) { + m_frontier->addUrl( link ); + } } else { string absLink( s->getBaseUrl( ).str( ) ); absLink.append( link ); - m_frontier->addUrl( absLink ); + if( m_filter->filter( link ) ) { + m_frontier->addUrl( absLink ); + } } link.clear( ); in_link = false; diff --git a/src/HTMLLinkExtractProcessor.hpp b/src/HTMLLinkExtractProcessor.hpp index 2777521..9d2c579 100644 --- a/src/HTMLLinkExtractProcessor.hpp +++ b/src/HTMLLinkExtractProcessor.hpp @@ -3,17 +3,19 @@ #include "Processor.hpp" #include "Frontier.hpp" +#include "URLFilter.hpp" #include "htmlparser_cpp.h" class HTMLLinkExtractProcessor : public Processor { public: - HTMLLinkExtractProcessor( Frontier *frontier ); + HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter ); virtual ~HTMLLinkExtractProcessor( ); virtual void process( RewindInputStream *s ); protected: Frontier *m_frontier; + URLFilter *m_filter; streamhtmlparser::HtmlParser m_parser; }; diff --git a/src/ProtocolURLFilter.cpp b/src/ProtocolURLFilter.cpp new file mode 100644 index 0000000..57e042d --- /dev/null +++ b/src/ProtocolURLFilter.cpp @@ -0,0 +1,11 @@ +#include "ProtocolURLFilter.hpp" + +ProtocolURLFilter::ProtocolURLFilter( const std::set<std::string> protocols ) + : m_protocols( protocols ) +{ +} + +bool ProtocolURLFilter::filter( const URL &url ) +{ + return( m_protocols.find( url.protocol( ) ) != m_protocols.end( ) ); +} diff --git a/src/ProtocolURLFilter.hpp b/src/ProtocolURLFilter.hpp new file mode 100644 index 0000000..cd05ff9 --- /dev/null +++ b/src/ProtocolURLFilter.hpp @@ -0,0 +1,19 @@ +#ifndef __PROTOCOL_URLFILTER_H +#define __PROTOCOL_URLFILTER_H + +#include "URLFilter.hpp" + +#include <set> + +class ProtocolURLFilter : public URLFilter +{ + public: + ProtocolURLFilter( const std::set<std::string> protocols ); + + bool filter( const URL &url ); + + protected: + std::set<std::string> m_protocols; +}; + +#endif diff --git a/src/RewindInputStream.hpp b/src/RewindInputStream.hpp index 9daafe4..92f2961 100644 --- a/src/RewindInputStream.hpp +++ b/src/RewindInputStream.hpp @@ -17,6 +17,11 @@ class RewindInputStream : public std::istream { return m_baseUrl; } + void setBaseUrl( const URL &url ) + { + m_baseUrl = url; + } + private: URL m_baseUrl; }; diff --git a/src/URL.hpp b/src/URL.hpp index 4031988..9813956 100644 --- a/src/URL.hpp +++ b/src/URL.hpp @@ -32,6 +32,26 @@ class URL { return m_url; } + const string protocol( ) const + { + return "http"; + } + + const string domain( ) const + { + return "www.andreasbaumann.cc"; + } + + unsigned short port( ) const + { + return 80; + } + + const string path( ) const + { + return "/"; + } + static URL Null; bool operator!=( const URL &other ) const { diff --git a/src/URLFilter.hpp b/src/URLFilter.hpp index 83cddea..c48307e 100644 --- a/src/URLFilter.hpp +++ b/src/URLFilter.hpp @@ -3,8 +3,11 @@ #include "URL.hpp" -class URLFilter { +class URLFilter +{ public: + virtual ~URLFilter( ) { }; + virtual bool filter( const URL &url ) = 0; }; diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index e96a855..2f4e067 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -2,18 +2,37 @@ #include "MemoryFrontier.hpp" #include "MD5Deduper.hpp" #include "HTMLLinkExtractProcessor.hpp" +#include "ChainURLFilter.hpp" +#include "ProtocolURLFilter.hpp" +#include "DomainURLFilter.hpp" + +#include <set> + +using namespace std; int main( void ) { Frontier *frontier = new MemoryFrontier( ); Fetcher *fetcher = new LibFetchFetcher( ); Deduper *deduper = new MD5Deduper( ); - Processor *processor = new HTMLLinkExtractProcessor( frontier ); + + set<string> protocols; + protocols.insert( "http" ); + protocols.insert( "https" ); + ProtocolURLFilter protocolFilter( protocols ); + + set<string> domains; + domains.insert( "www.andreasbaumann.cc" ); + DomainURLFilter domainFilter( domains ); + + ChainURLFilter filters( &protocolFilter, &domainFilter ); + + Processor *processor = new HTMLLinkExtractProcessor( frontier, &filters ); LOG( logNOTICE ) << "Crawler started.."; frontier->addUrl( URL( "http://www.andreasbaumann.cc" ) ); - + URL url; while( ( url = frontier->getNextUrl( ) ) != URL::Null ) { LOG( logINFO ) << "Got URL " << url; |