Diffstat (limited to 'src/crawlingwolf.cpp')
-rw-r--r-- | src/crawlingwolf.cpp | 39
1 file changed, 27 insertions(+), 12 deletions(-)
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index e924b16..1c0576f 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -4,10 +4,12 @@
 #include "HTMLLinkExtractProcessor.hpp"
 #include "MemoryURLSeen.hpp"
 #include "URLNormalizer.hpp"
+#include "URLFilter.hpp"
 #include "ModuleLoader.hpp"
 
 #include <set>
 #include <vector>
+#include <list>
 
 using namespace std;
 
@@ -15,32 +17,42 @@ int main( void )
 {
 	FILELog::reportingLevel( ) = logINFO;
 
-	vector<string> modules;
-	modules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
-	modules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
-	ModuleLoader<URLNormalizer> urlNormalizers( modules );
+	vector<string> normalizerModules;
+	normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
+	normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
+	ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules );
+
+	vector<string> filterModules;
+	filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" );
+	filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" );
+	ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules );
+
+	vector<string> filterChainModules;
+	filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" );
+	ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules );
 
 	Frontier *frontier = new MemoryFrontier( );
 	Fetcher *fetcher = new LibFetchFetcher( );
 	Deduper *deduper = new MD5Deduper( );
 	URLSeen *urlSeen = new MemoryURLSeen( );
 
-/*
-	set<string> protocols;
+	set<string> protocols;
 	protocols.insert( "http" );
 	protocols.insert( "https" );
-	ProtocolURLFilter protocolFilter( protocols );
+	URLFilter *protocolFilter = urlFilters.create( "protocol", protocols );
 
 	set<string> hosts;
 	hosts.insert( "www.andreasbaumann.cc" );
-	HostURLFilter hostFilter( hosts );
+	URLFilter *hostFilter = urlFilters.create( "host", hosts );
 
-	ChainURLFilter filters( &protocolFilter, &hostFilter );
-*/
+	list<URLFilter *> filters;
+	filters.push_back( hostFilter );
+	filters.push_back( protocolFilter );
+	URLFilter *chainFilter = urlChainFilter.create( "chain", filters );
+
 	URLNormalizer *normalizer = urlNormalizers.create( "google" );
 
-	//Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen );
-	Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, std::list( ), urlSeen );
+	Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, chainFilter, urlSeen );
 
 	LOG( logNOTICE ) << "Crawler started..";
 
@@ -64,6 +76,9 @@ int main( void )
 	delete processor;
 	
 	urlNormalizers.destroy( normalizer );
+	urlChainFilter.destroy( chainFilter );
+	urlFilters.destroy( protocolFilter );
+	urlFilters.destroy( hostFilter );
 	delete urlSeen;
 	delete deduper;
 	delete fetcher;
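
The commit replaces the commented-out, hard-coded filter objects with URLFilter instances created through ModuleLoader plugins and chained via a list<URLFilter *>. The following is a rough standalone sketch of what that chained filtering amounts to, without the ModuleLoader/plugin machinery: the class names mirror the ones referenced in the diff, but the single filter() method and the URL parsing here are assumptions made purely for illustration, not the project's actual interface.

#include <iostream>
#include <list>
#include <set>
#include <string>

using namespace std;

// Assumed common interface: a filter decides whether a URL may be crawled.
struct URLFilter {
	virtual ~URLFilter( ) { }
	virtual bool filter( const string &url ) const = 0;
};

// Accepts URLs whose scheme is in the configured set (e.g. "http", "https").
class ProtocolURLFilter : public URLFilter {
	public:
		ProtocolURLFilter( const set<string> &protocols ) : m_protocols( protocols ) { }
		bool filter( const string &url ) const {
			string::size_type pos = url.find( "://" );
			if( pos == string::npos ) return false;
			return m_protocols.count( url.substr( 0, pos ) ) > 0;
		}
	private:
		set<string> m_protocols;
};

// Accepts URLs whose host part is in the configured set.
class HostURLFilter : public URLFilter {
	public:
		HostURLFilter( const set<string> &hosts ) : m_hosts( hosts ) { }
		bool filter( const string &url ) const {
			string::size_type start = url.find( "://" );
			if( start == string::npos ) return false;
			start += 3;
			string::size_type end = url.find( '/', start );
			string host = url.substr( start, end == string::npos ? string::npos : end - start );
			return m_hosts.count( host ) > 0;
		}
	private:
		set<string> m_hosts;
};

// Accepts a URL only if every filter in the chain accepts it; this is why the
// diff hands a list<URLFilter *> to the "chain" module.
class ChainURLFilter : public URLFilter {
	public:
		ChainURLFilter( const list<URLFilter *> &filters ) : m_filters( filters ) { }
		bool filter( const string &url ) const {
			for( list<URLFilter *>::const_iterator it = m_filters.begin( ); it != m_filters.end( ); it++ )
				if( !(*it)->filter( url ) ) return false;
			return true;
		}
	private:
		list<URLFilter *> m_filters;
};

int main( void )
{
	set<string> protocols;
	protocols.insert( "http" );
	protocols.insert( "https" );
	ProtocolURLFilter protocolFilter( protocols );

	set<string> hosts;
	hosts.insert( "www.andreasbaumann.cc" );
	HostURLFilter hostFilter( hosts );

	list<URLFilter *> filters;
	filters.push_back( &hostFilter );
	filters.push_back( &protocolFilter );
	ChainURLFilter chainFilter( filters );

	cout << chainFilter.filter( "http://www.andreasbaumann.cc/index.html" ) << endl; // 1: host and protocol accepted
	cout << chainFilter.filter( "ftp://www.andreasbaumann.cc/file.txt" ) << endl;    // 0: protocol rejected
	cout << chainFilter.filter( "https://example.com/" ) << endl;                    // 0: host rejected
	return 0;
}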