Diffstat (limited to 'src/crawlingwolf.cpp')
 src/crawlingwolf.cpp | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index e924b16..1c0576f 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -4,10 +4,12 @@
#include "HTMLLinkExtractProcessor.hpp"
#include "MemoryURLSeen.hpp"
#include "URLNormalizer.hpp"
+#include "URLFilter.hpp"
#include "ModuleLoader.hpp"
#include <set>
#include <vector>
+#include <list>
using namespace std;
@@ -15,32 +17,42 @@ int main( void )
{
FILELog::reportingLevel( ) = logINFO;
- vector<string> modules;
- modules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
- modules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
- ModuleLoader<URLNormalizer> urlNormalizers( modules );
+ vector<string> normalizerModules;
+ normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
+ normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
+ ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules );
+
+ vector<string> filterModules;
+ filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" );
+ filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" );
+ ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules );
+
+ vector<string> filterChainModules;
+ filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" );
+ ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules );
Frontier *frontier = new MemoryFrontier( );
Fetcher *fetcher = new LibFetchFetcher( );
Deduper *deduper = new MD5Deduper( );
URLSeen *urlSeen = new MemoryURLSeen( );
-/*
- set<string> protocols;
+ set<string> protocols;
protocols.insert( "http" );
protocols.insert( "https" );
- ProtocolURLFilter protocolFilter( protocols );
+ URLFilter *protocolFilter = urlFilters.create( "protocol", protocols );
set<string> hosts;
hosts.insert( "www.andreasbaumann.cc" );
- HostURLFilter hostFilter( hosts );
+ URLFilter *hostFilter = urlFilters.create( "host", hosts );
- ChainURLFilter filters( &protocolFilter, &hostFilter );
-*/
+ list<URLFilter *> filters;
+ filters.push_back( hostFilter );
+ filters.push_back( protocolFilter );
+ URLFilter *chainFilter = urlChainFilter.create( "chain", filters );
+
URLNormalizer *normalizer = urlNormalizers.create( "google" );
- //Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen );
- Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, std::list( ), urlSeen );
+ Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, chainFilter, urlSeen );
LOG( logNOTICE ) << "Crawler started..";
@@ -64,6 +76,9 @@ int main( void )
delete processor;
urlNormalizers.destroy( normalizer );
+ urlChainFilter.destroy( chainFilter );
+ urlFilters.destroy( protocolFilter );
+ urlFilters.destroy( hostFilter );
delete urlSeen;
delete deduper;
delete fetcher;
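
For context on the change above: the chain filter loaded from mod_urlfilter_chain.so presumably wraps the list of sub-filters and applies them in sequence. Below is a minimal sketch of that idea; the URLFilter interface shown here (a single boolean predicate on a URL string) is an assumption for illustration only, not the actual declaration from URLFilter.hpp.

// A minimal sketch, assuming a URLFilter interface with one
// boolean predicate; the real interface in URLFilter.hpp may differ.
#include <list>
#include <string>

class URLFilter {
	public:
		virtual ~URLFilter( ) { }
		// hypothetical: returns true if the URL passes the filter
		virtual bool matches( const std::string &url ) const = 0;
};

class ChainURLFilter : public URLFilter {
	public:
		ChainURLFilter( const std::list<URLFilter *> &filters )
			: m_filters( filters ) { }

		// the chain accepts a URL only if every sub-filter accepts it
		virtual bool matches( const std::string &url ) const {
			std::list<URLFilter *>::const_iterator it;
			for( it = m_filters.begin( ); it != m_filters.end( ); it++ ) {
				if( !( *it )->matches( url ) ) {
					return false;
				}
			}
			return true;
		}

	private:
		std::list<URLFilter *> m_filters;
};

Under that assumption, the chain is a logical AND, so the order of filters.push_back( hostFilter ) and filters.push_back( protocolFilter ) in the patch only determines which check runs first; putting the cheaper protocol check first would short-circuit rejected URLs slightly earlier.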