diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2014-07-24 13:12:37 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2014-07-24 13:12:37 +0200 |
commit | aa7fa3ef71b4e9193088b67c9b34448c00a8f949 (patch) | |
tree | 47c88c4189f025a809a61a906bd636e5a05372ed /src/crawl | |
parent | a57788acee59705418b96525410b84fbee2f405a (diff) | |
download | crawler-aa7fa3ef71b4e9193088b67c9b34448c00a8f949.tar.gz crawler-aa7fa3ef71b4e9193088b67c9b34448c00a8f949.tar.bz2 |
sitemap processing (work in progress)
Diffstat (limited to 'src/crawl')
-rwxr-xr-x | src/crawl/crawl.cpp | 16 |
1 file changed, 13 insertions, 3 deletions
```diff
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index ecc8f16..4899d0f 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -55,7 +55,8 @@ BOOL WINAPI termHandler( DWORD ctrlType )
 int main( void )
 {
 	try {
-		Logger::instance( ).openConsoleLog( logINFO );
+//		Logger::instance( ).openConsoleLog( logINFO );
+		Logger::instance( ).openConsoleLog( logDEBUG );
 
 #ifndef _WIN32
 		struct sigaction sa;
@@ -135,9 +136,11 @@ int main( void )
 #ifndef _WIN32
 		processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
 		processorModules.push_back( "./modules/processor/robotstxt/mod_processor_robotstxt.so" );
+		processorModules.push_back( "./modules/processor/sitemap/mod_processor_sitemap.so" );
 #else
 		processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" );
 		processorModules.push_back( ".\\modules\\processor\\robotstxt\\mod_processor_robotstxt.dll" );
+		processorModules.push_back( ".\\modules\\processor\\sitemap\\mod_processor_sitemap.dll" );
 #endif
 		ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );
 
@@ -179,9 +182,12 @@ int main( void )
 
 		Processor *htmlParser = processors.create( "htmllinkextract_processor",
 			normalizer, frontier, chainFilter, urlSeen );
-
+
 		Processor *robotsTxtParser = processors.create( "robotstxt_processor",
 			normalizer, frontier, chainFilter, urlSeen );
+
+		Processor *sitemapParser = processors.create( "sitemap_processor",
+			normalizer, frontier, chainFilter, urlSeen );
 
 		LOG( logNOTICE ) << "Crawler started..";
 
@@ -206,7 +212,8 @@ int main( void )
 #ifndef _WIN32
 			MIMEType mimeType = typeDetect->detect( s );
 
-			if( mimeType != MIMEType::Null ) {
+			if( mimeType != MIMEType::Null ) {
+				LOG( logDEBUG ) << "MIME type of '" << url << "' is '" << mimeType << "'";
 				if( mimeType == "text/html" ) {
 					s->rewind( );
 					htmlParser->process( s );
@@ -219,6 +226,9 @@ int main( void )
 						s->rewind( );
 						robotsTxtParser->process( s );
 					}
+				} else if( mimeType == "text/xml" ) {
+					s->rewind( );
+					sitemapParser->process( s );
 				}
 			}
 #else
```