diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2014-04-30 16:46:00 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2014-04-30 16:46:00 +0200 |
commit | 12c50867c04b2c2a11f5026466bbea02d5406b70 (patch) | |
tree | 4008a8d5e3660d823197f97b3c0b244fa37d3ea1 /src/crawl/crawl.cpp | |
parent | eb3771cafb98451116a4f0ec0e7a371800770de1 (diff) | |
download | crawler-12c50867c04b2c2a11f5026466bbea02d5406b70.tar.gz crawler-12c50867c04b2c2a11f5026466bbea02d5406b70.tar.bz2 |
started a robots.txt parser
Diffstat (limited to 'src/crawl/crawl.cpp')
-rwxr-xr-x | src/crawl/crawl.cpp | 18 |
1 files changed, 16 insertions, 2 deletions
```diff
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index 823ed02..ecc8f16 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -134,8 +134,10 @@ int main( void )
 	vector<string> processorModules;
 #ifndef _WIN32
 	processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
+	processorModules.push_back( "./modules/processor/robotstxt/mod_processor_robotstxt.so" );
 #else
 	processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" );
+	processorModules.push_back( ".\\modules\\processor\\robotstxt\\mod_processor_robotstxt.dll" );
 #endif
 	ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );
@@ -163,7 +165,8 @@ int main( void )
 	URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );
 	set<string> hosts;
-	hosts.insert( "www.andreasbaumann.cc" );
+//	hosts.insert( "www.andreasbaumann.cc" );
+	hosts.insert( "relevancy.bger.ch" );
 	URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );
 	list<URLFilter *> filters;
@@ -177,9 +180,13 @@ int main( void )
 	Processor *htmlParser = processors.create( "htmllinkextract_processor",
 		normalizer, frontier, chainFilter, urlSeen );
+	Processor *robotsTxtParser = processors.create( "robotstxt_processor",
+		normalizer, frontier, chainFilter, urlSeen );
+
 	LOG( logNOTICE ) << "Crawler started..";
-	frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
+//	frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
+	frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
 	URL url;
 	while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
@@ -206,6 +213,12 @@ int main( void )
 			} else if( mimeType == "application/x-gzip" ) {
 				s->rewind( );
 				LOG( logINFO ) << "Storing archive " << url;
+			} else if( mimeType == "text/plain" ) {
+				if( url.path( ) == "/robots.txt" ) {
+					LOG( logINFO ) << "Checking " << url.path( );
+					s->rewind( );
+					robotsTxtParser->process( s );
+				}
 			}
 		}
 #else
@@ -215,6 +228,7 @@ int main( void )
 		delete s;
 	}
+	processors.destroy( robotsTxtParser );
 	processors.destroy( htmlParser );
 	urlNormalizers.destroy( normalizer );
 	urlChainFilter.destroy( chainFilter );
```