summary | refs | log | tree | commit | diff
path: root/src/crawl/crawl.cpp
diff options
context:
space:
mode:
author Andreas Baumann <mail@andreasbaumann.cc> 2014-09-28 14:30:16 +0200
committer Andreas Baumann <mail@andreasbaumann.cc> 2014-09-28 14:30:16 +0200
commit c95c920088e42bd9ffef3d43134fabc28cae3c92 (patch)
tree 217d7437955857231d94739c2ffb4a5fd984acaa /src/crawl/crawl.cpp
parent e53f13ccaa3a0b595351fc536bf1a6d6be805175 (diff)
download crawler-c95c920088e42bd9ffef3d43134fabc28cae3c92.tar.gz
crawler-c95c920088e42bd9ffef3d43134fabc28cae3c92.tar.bz2
some testing and stabilizing
Diffstat (limited to 'src/crawl/crawl.cpp')
-rwxr-xr-x src/crawl/crawl.cpp 19
1 files changed, 14 insertions, 5 deletions
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index c5b893d..f4d8992 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -169,8 +169,11 @@ int main( void )
URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );
set<string> hosts;
-// hosts.insert( "www.andreasbaumann.cc" );
- hosts.insert( "relevancy.bger.ch" );
+ hosts.insert( "www.andreasbaumann.cc" );
+// hosts.insert( "relevancy.bger.ch" );
+// hosts.insert( "wolframe.net" );
+// hosts.insert( "andreasbaumann.dyndns.org" );
+
URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );
list<URLFilter *> filters;
@@ -192,8 +195,10 @@ int main( void )
LOG( logNOTICE ) << "Crawler started..";
-// frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
- frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
+ frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
+// frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
+// frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) );
+// frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/nzz/" ) );
URL url;
while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
@@ -230,10 +235,13 @@ int main( void )
} else if( mimeType == "application/xml" ) {
s->rewind( );
sitemapParser->process( s );
+ } else if( mimeType == "application/pdf" ) {
+ s->rewind( );
+
}
}
- sleep( 2 );
+ //~ sleep( 2 );
#else
htmlParser->process( s );
#endif
@@ -241,6 +249,7 @@ int main( void )
delete s;
}
+ processors.destroy( sitemapParser );
processors.destroy( robotsTxtParser );
processors.destroy( htmlParser );
urlNormalizers.destroy( normalizer );