diff options
Diffstat (limited to 'src/crawlingwolf.cpp')
-rw-r--r-- | src/crawlingwolf.cpp | 23 |
1 files changed, 21 insertions, 2 deletions
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index e96a855..2f4e067 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -2,18 +2,37 @@ #include "MemoryFrontier.hpp" #include "MD5Deduper.hpp" #include "HTMLLinkExtractProcessor.hpp" +#include "ChainURLFilter.hpp" +#include "ProtocolURLFilter.hpp" +#include "DomainURLFilter.hpp" + +#include <set> + +using namespace std; int main( void ) { Frontier *frontier = new MemoryFrontier( ); Fetcher *fetcher = new LibFetchFetcher( ); Deduper *deduper = new MD5Deduper( ); - Processor *processor = new HTMLLinkExtractProcessor( frontier ); + + set<string> protocols; + protocols.insert( "http" ); + protocols.insert( "https" ); + ProtocolURLFilter protocolFilter( protocols ); + + set<string> domains; + domains.insert( "www.andreasbaumann.cc" ); + DomainURLFilter domainFilter( domains ); + + ChainURLFilter filters( &protocolFilter, &domainFilter ); + + Processor *processor = new HTMLLinkExtractProcessor( frontier, &filters ); LOG( logNOTICE ) << "Crawler started.."; frontier->addUrl( URL( "http://www.andreasbaumann.cc" ) ); - + URL url; while( ( url = frontier->getNextUrl( ) ) != URL::Null ) { LOG( logINFO ) << "Got URL " << url; |