diff options
author | Andreas Baumann <mail@andreasbaumann.cc> | 2014-09-28 14:30:16 +0200 |
---|---|---|
committer | Andreas Baumann <mail@andreasbaumann.cc> | 2014-09-28 14:30:16 +0200 |
commit | c95c920088e42bd9ffef3d43134fabc28cae3c92 (patch) | |
tree | 217d7437955857231d94739c2ffb4a5fd984acaa /src/crawl/crawl.cpp | |
parent | e53f13ccaa3a0b595351fc536bf1a6d6be805175 (diff) | |
download | crawler-c95c920088e42bd9ffef3d43134fabc28cae3c92.tar.gz crawler-c95c920088e42bd9ffef3d43134fabc28cae3c92.tar.bz2 |
some testing and stabilizing
Diffstat (limited to 'src/crawl/crawl.cpp')
-rwxr-xr-x | src/crawl/crawl.cpp | 19 |
1 files changed, 14 insertions, 5 deletions
```diff
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index c5b893d..f4d8992 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -169,8 +169,11 @@ int main( void )
 	URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );
 
 	set<string> hosts;
-//	hosts.insert( "www.andreasbaumann.cc" );
-	hosts.insert( "relevancy.bger.ch" );
+	hosts.insert( "www.andreasbaumann.cc" );
+//	hosts.insert( "relevancy.bger.ch" );
+//	hosts.insert( "wolframe.net" );
+//	hosts.insert( "andreasbaumann.dyndns.org" );
+
 	URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );
 
 	list<URLFilter *> filters;
@@ -192,8 +195,10 @@ int main( void )
 
 	LOG( logNOTICE ) << "Crawler started..";
 
-//	frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
-	frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
+	frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
+//	frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
+//	frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) );
+//	frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/nzz/" ) );
 
 	URL url;
 	while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
@@ -230,10 +235,13 @@ int main( void )
 			} else if( mimeType == "application/xml" ) {
 				s->rewind( );
 				sitemapParser->process( s );
+			} else if( mimeType == "application/pdf" ) {
+				s->rewind( );
+
 			}
 		}
 
-		sleep( 2 );
+		//~ sleep( 2 );
 #else
 		htmlParser->process( s );
 #endif
@@ -241,6 +249,7 @@ int main( void )
 		delete s;
 	}
 
+	processors.destroy( sitemapParser );
 	processors.destroy( robotsTxtParser );
 	processors.destroy( htmlParser );
 	urlNormalizers.destroy( normalizer );
```