summary refs log tree commit diff
diff options
context:
space:
mode:
author Andreas Baumann <mail@andreasbaumann.cc> 2014-09-28 14:30:16 +0200
committer Andreas Baumann <mail@andreasbaumann.cc> 2014-09-28 14:30:16 +0200
commit c95c920088e42bd9ffef3d43134fabc28cae3c92 (patch)
tree 217d7437955857231d94739c2ffb4a5fd984acaa
parent e53f13ccaa3a0b595351fc536bf1a6d6be805175 (diff)
download crawler-c95c920088e42bd9ffef3d43134fabc28cae3c92.tar.gz
crawler-c95c920088e42bd9ffef3d43134fabc28cae3c92.tar.bz2
some testing and stabilizing
-rwxr-xr-x src/GNUmakefile 17
-rwxr-xr-x src/crawl/crawl.cpp 19
2 files changed, 30 insertions, 6 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile
index b339ffc..2a8968a 100755
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -16,5 +16,20 @@ local_uninstall:
local_test:
+MEMCHECK=valgrind -v --leak-check=full --show-reachable=yes --num-callers=50 --suppressions=${HOME}/.valgrind-suppressions
+PERFCHECK=${HOME}/scripts/qcachegrind.sh
+
+LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/libutil:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser
+
run:
- @LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/libutil:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser $(TOPDIR)/src/crawl/crawl
+ @LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(TOPDIR)/src/crawl/crawl
+
+runmemcheck:
+ @LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(MEMCHECK) $(TOPDIR)/src/crawl/crawl
+
+runmemcheckgui:
+ @LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) valkyrie $(TOPDIR)/src/crawl/crawl
+
+runperfcheck:
+ LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(PERFCHECK) $(TOPDIR)/src/crawl/crawl
+ \ No newline at end of file
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index c5b893d..f4d8992 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -169,8 +169,11 @@ int main( void )
URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );
set<string> hosts;
-// hosts.insert( "www.andreasbaumann.cc" );
- hosts.insert( "relevancy.bger.ch" );
+ hosts.insert( "www.andreasbaumann.cc" );
+// hosts.insert( "relevancy.bger.ch" );
+// hosts.insert( "wolframe.net" );
+// hosts.insert( "andreasbaumann.dyndns.org" );
+
URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );
list<URLFilter *> filters;
@@ -192,8 +195,10 @@ int main( void )
LOG( logNOTICE ) << "Crawler started..";
-// frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
- frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
+ frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
+// frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
+// frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) );
+// frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/nzz/" ) );
URL url;
while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
@@ -230,10 +235,13 @@ int main( void )
} else if( mimeType == "application/xml" ) {
s->rewind( );
sitemapParser->process( s );
+ } else if( mimeType == "application/pdf" ) {
+ s->rewind( );
+
}
}
- sleep( 2 );
+ //~ sleep( 2 );
#else
htmlParser->process( s );
#endif
@@ -241,6 +249,7 @@ int main( void )
delete s;
}
+ processors.destroy( sitemapParser );
processors.destroy( robotsTxtParser );
processors.destroy( htmlParser );
urlNormalizers.destroy( normalizer );