summary | refs | log | tree | commit | diff
path: root/src/crawl/crawl.cpp
diff options
context:
space:
mode:
author Andreas Baumann <abaumann@yahoo.com> 2014-07-24 13:12:37 +0200
committer Andreas Baumann <abaumann@yahoo.com> 2014-07-24 13:12:37 +0200
commit aa7fa3ef71b4e9193088b67c9b34448c00a8f949 (patch)
tree 47c88c4189f025a809a61a906bd636e5a05372ed /src/crawl/crawl.cpp
parent a57788acee59705418b96525410b84fbee2f405a (diff)
download crawler-aa7fa3ef71b4e9193088b67c9b34448c00a8f949.tar.gz
crawler-aa7fa3ef71b4e9193088b67c9b34448c00a8f949.tar.bz2
sitemap processing (work in progress)
Diffstat (limited to 'src/crawl/crawl.cpp')
-rwxr-xr-x src/crawl/crawl.cpp 16
1 files changed, 13 insertions, 3 deletions
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index ecc8f16..4899d0f 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -55,7 +55,8 @@ BOOL WINAPI termHandler( DWORD ctrlType )
int main( void )
{
try {
- Logger::instance( ).openConsoleLog( logINFO );
+// Logger::instance( ).openConsoleLog( logINFO );
+ Logger::instance( ).openConsoleLog( logDEBUG );
#ifndef _WIN32
struct sigaction sa;
@@ -135,9 +136,11 @@ int main( void )
#ifndef _WIN32
processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
processorModules.push_back( "./modules/processor/robotstxt/mod_processor_robotstxt.so" );
+ processorModules.push_back( "./modules/processor/sitemap/mod_processor_sitemap.so" );
#else
processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" );
processorModules.push_back( ".\\modules\\processor\\robotstxt\\mod_processor_robotstxt.dll" );
+ processorModules.push_back( ".\\modules\\processor\\sitemap\\mod_processor_sitemap.dll" );
#endif
ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );
@@ -179,9 +182,12 @@ int main( void )
Processor *htmlParser = processors.create( "htmllinkextract_processor",
normalizer, frontier, chainFilter, urlSeen );
-
+
Processor *robotsTxtParser = processors.create( "robotstxt_processor",
normalizer, frontier, chainFilter, urlSeen );
+
+ Processor *sitemapParser = processors.create( "sitemap_processor",
+ normalizer, frontier, chainFilter, urlSeen );
LOG( logNOTICE ) << "Crawler started..";
@@ -206,7 +212,8 @@ int main( void )
#ifndef _WIN32
MIMEType mimeType = typeDetect->detect( s );
- if( mimeType != MIMEType::Null ) {
+ if( mimeType != MIMEType::Null ) {
+ LOG( logDEBUG ) << "MIME type of '" << url << "' is '" << mimeType << "'";
if( mimeType == "text/html" ) {
s->rewind( );
htmlParser->process( s );
@@ -219,6 +226,9 @@ int main( void )
s->rewind( );
robotsTxtParser->process( s );
}
+ } else if( mimeType == "text/xml" ) {
+ s->rewind( );
+ sitemapParser->process( s );
}
}
#else