diff options
author | Andreas Baumann <mail@andreasbaumann.cc> | 2014-09-28 14:30:16 +0200 |
---|---|---|
committer | Andreas Baumann <mail@andreasbaumann.cc> | 2014-09-28 14:30:16 +0200 |
commit | c95c920088e42bd9ffef3d43134fabc28cae3c92 (patch) | |
tree | 217d7437955857231d94739c2ffb4a5fd984acaa | |
parent | e53f13ccaa3a0b595351fc536bf1a6d6be805175 (diff) | |
download | crawler-c95c920088e42bd9ffef3d43134fabc28cae3c92.tar.gz crawler-c95c920088e42bd9ffef3d43134fabc28cae3c92.tar.bz2 |
some testing and stabilizing
-rwxr-xr-x | src/GNUmakefile | 17 | ||||
-rwxr-xr-x | src/crawl/crawl.cpp | 19 |
2 files changed, 30 insertions, 6 deletions
```diff
diff --git a/src/GNUmakefile b/src/GNUmakefile
index b339ffc..2a8968a 100755
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -16,5 +16,20 @@ local_uninstall:
 
 local_test:
 
+MEMCHECK=valgrind -v --leak-check=full --show-reachable=yes --num-callers=50 --suppressions=${HOME}/.valgrind-suppressions
+PERFCHECK=${HOME}/scripts/qcachegrind.sh
+
+LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/libutil:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser
+
 run:
-	@LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/libutil:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser $(TOPDIR)/src/crawl/crawl
+	@LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(TOPDIR)/src/crawl/crawl
+
+runmemcheck:
+	@LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(MEMCHECK) $(TOPDIR)/src/crawl/crawl
+
+runmemcheckgui:
+	@LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) valkyrie $(TOPDIR)/src/crawl/crawl
+
+runperfcheck:
+	LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(PERFCHECK) $(TOPDIR)/src/crawl/crawl
+	
\ No newline at end of file
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index c5b893d..f4d8992 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -169,8 +169,11 @@ int main( void )
 	URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );
 
 	set<string> hosts;
-//	hosts.insert( "www.andreasbaumann.cc" );
-	hosts.insert( "relevancy.bger.ch" );
+	hosts.insert( "www.andreasbaumann.cc" );
+//	hosts.insert( "relevancy.bger.ch" );
+//	hosts.insert( "wolframe.net" );
+//	hosts.insert( "andreasbaumann.dyndns.org" );
+
 	URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );
 
 	list<URLFilter *> filters;
@@ -192,8 +195,10 @@ int main( void )
 
 	LOG( logNOTICE ) << "Crawler started..";
 
-//	frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
-	frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
+	frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
+//	frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
+//	frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) );
+//	frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/nzz/" ) );
 
 	URL url;
 	while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
@@ -230,10 +235,13 @@ int main( void )
 			} else if( mimeType == "application/xml" ) {
 				s->rewind( );
 				sitemapParser->process( s );
+			} else if( mimeType == "application/pdf" ) {
+				s->rewind( );
+
 			}
 		}
 
-		sleep( 2 );
+		//~ sleep( 2 );
 #else
 		htmlParser->process( s );
 #endif
@@ -241,6 +249,7 @@ int main( void )
 		delete s;
 	}
 
+	processors.destroy( sitemapParser );
 	processors.destroy( robotsTxtParser );
 	processors.destroy( htmlParser );
 	urlNormalizers.destroy( normalizer );
```