From b80687f17644766eb890598297c0f37bb898d76d Mon Sep 17 00:00:00 2001
From: Andreas Baumann
Date: Sun, 28 Sep 2014 21:29:03 +0200
Subject: first Lua config of crawler

---
 include/luaglue/LuaVM.hpp |  2 ++
 src/GNUmakefile           | 11 ++++++-----
 src/crawl/crawl.conf      | 13 +++++++++++++
 src/crawl/crawl.cpp       | 23 +++++++++++++++++------
 src/libluaglue/LuaVM.cpp  | 22 +++++++++++++++++++++-
 5 files changed, 59 insertions(+), 12 deletions(-)
 create mode 100644 src/crawl/crawl.conf

diff --git a/include/luaglue/LuaVM.hpp b/include/luaglue/LuaVM.hpp
index 7449b3f..48faaeb 100755
--- a/include/luaglue/LuaVM.hpp
+++ b/include/luaglue/LuaVM.hpp
@@ -9,6 +9,8 @@ class LuaVM
 		LuaVM( );
 		~LuaVM( );
 
+		void loadSource( const char *filename );
+
 	private:
 		void initialize( );
 
diff --git a/src/GNUmakefile b/src/GNUmakefile
index a29d3a7..24522c2 100755
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -18,23 +18,24 @@ local_test:
 
 MEMCHECK=valgrind -v --leak-check=full --show-reachable=yes --num-callers=50 --suppressions=${HOME}/.valgrind-suppressions
 MEMCHECKLEARN=$(MEMCHECK) --gen-suppressions=all
+VALKYRIE=valkyrie
 PERFCHECK=${HOME}/scripts/qcachegrind.sh
 
 LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/libutil:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser:$(TOPDIR)/src/libluaglue
 
 run:
-	@LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(TOPDIR)/src/crawl/crawl
+	LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(TOPDIR)/src/crawl/crawl $(TOPDIR)/src/crawl/crawl.conf
 
 runmemcheck:
-	@LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(MEMCHECK) $(TOPDIR)/src/crawl/crawl
+	@LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(MEMCHECK) $(TOPDIR)/src/crawl/crawl $(TOPDIR)/src/crawl/crawl.conf
 
 runmemchecklearn:
-	@LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(MEMCHECKLEARN) $(TOPDIR)/src/crawl/crawl
+	@LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(MEMCHECKLEARN) $(TOPDIR)/src/crawl/crawl $(TOPDIR)/src/crawl/crawl.conf
 
 runmemcheckgui:
-	@LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) valkyrie $(TOPDIR)/src/crawl/crawl
+	LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(VALKYRIE) $(TOPDIR)/src/crawl/crawl $(TOPDIR)/src/crawl/crawl.conf
 
 runperfcheck:
-	LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(PERFCHECK) $(TOPDIR)/src/crawl/crawl
+	@LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(PERFCHECK) $(TOPDIR)/src/crawl/crawl $(TOPDIR)/src/crawl/crawl.conf
 
 
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf
new file mode 100644
index 0000000..95d0f75
--- /dev/null
+++ b/src/crawl/crawl.conf
@@ -0,0 +1,13 @@
+-- global setting
+
+crawler = {
+	-- stop after N documents
+	stop_after_N_operations = 10
+}
+
+-- seeds: URLS which are fed in the beginning to the URL frontier
+
+seeds = {
+	"http://www.wolframe.net",
+	"http://wolframe.net"
+}
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index 947976f..f10075f 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -55,14 +55,18 @@ BOOL WINAPI termHandler( DWORD ctrlType )
 
 #endif
 
-int main( void )
+static int counter = 0;
+
+int main( int /* argc */, char *argv[] )
 {
 	try {
 		LuaVM luaVm;
-		
+
 		// Logger::instance( ).openConsoleLog( logINFO );
 		Logger::instance( ).openConsoleLog( logDEBUG );
 
+		luaVm.loadSource( argv[1] );
+
 #ifndef _WIN32
 		struct sigaction sa;
 		memset( &sa, 0, sizeof( struct sigaction ) );
@@ -173,10 +177,10 @@ int main( void )
 		URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );
 
 		set<string> hosts;
-		hosts.insert( "www.andreasbaumann.cc" );
+		hosts.insert( "andreasbaumann.dyndns.org" );
+//		hosts.insert( "www.andreasbaumann.cc" );
 //		hosts.insert( "relevancy.bger.ch" );
 //		hosts.insert( "wolframe.net" );
-//		hosts.insert( "andreasbaumann.dyndns.org" );
 
 		URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );
 
@@ -199,10 +203,10 @@ int main( void )
 
 		LOG( logNOTICE ) << "Crawler started..";
 
-		frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
+//		frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
 //		frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
 //		frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) );
-//		frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/nzz/" ) );
+		frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/test/" ) );
 
 		URL url;
 		while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
@@ -246,6 +250,10 @@ int main( void )
 				}
 
 				//~ sleep( 2 );
+				counter++;
+				if( counter > 10 ) {
+					term = true;
+				}
 #else
 				htmlParser->process( s );
 #endif
@@ -274,5 +282,8 @@ int main( void )
 	} catch( exception &e ) {
 		LOG( logFATAL ) << "Crawler stopped: " << e.what( );
 		return 1;
+	} catch( ... ) {
+		LOG( logFATAL ) << "Crawler stopped due to unknown exception!";
+		return 1;
 	}
 }
diff --git a/src/libluaglue/LuaVM.cpp b/src/libluaglue/LuaVM.cpp
index f268823..9363cac 100644
--- a/src/libluaglue/LuaVM.cpp
+++ b/src/libluaglue/LuaVM.cpp
@@ -1,6 +1,11 @@
 #include "LuaVM.hpp"
 
-LuaVM::LuaVM( )
+#include <sstream>
+#include <stdexcept>
+
+using namespace std;
+
+LuaVM::LuaVM( ) : m_lua( 0 )
 {
 	initialize( );
 }
@@ -13,4 +18,19 @@ LuaVM::~LuaVM( )
 void LuaVM::initialize( )
 {
 	m_lua = luaL_newstate( );
+
+	luaL_openlibs( m_lua );
+}
+
+void LuaVM::loadSource( const char *filename )
+{
+	int res;
+
+	res = luaL_loadfile( m_lua, filename );
+	if( res != 0 ) {
+		ostringstream ss;
+		ss << "Can't read Lua source file from file '" << filename << "': " << lua_tostring( m_lua, -1 );
+		lua_pop( m_lua, 1 );
+		throw std::runtime_error( ss.str( ) );
+	}
 }
-- 
cgit v1.2.3-54-g00ecf