summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <mail@andreasbaumann.cc>2014-09-28 21:29:03 +0200
committerAndreas Baumann <mail@andreasbaumann.cc>2014-09-28 21:29:03 +0200
commitb80687f17644766eb890598297c0f37bb898d76d (patch)
tree44e6a15cc058087a19dd44d44c2d1d52194a5876
parentc82a15eb0ffe61c1d2d2630981777f72013e833a (diff)
downloadcrawler-b80687f17644766eb890598297c0f37bb898d76d.tar.gz
crawler-b80687f17644766eb890598297c0f37bb898d76d.tar.bz2
first Lua config of crawler
-rwxr-xr-xinclude/luaglue/LuaVM.hpp2
-rwxr-xr-xsrc/GNUmakefile11
-rw-r--r--src/crawl/crawl.conf13
-rwxr-xr-xsrc/crawl/crawl.cpp23
-rw-r--r--src/libluaglue/LuaVM.cpp22
5 files changed, 59 insertions, 12 deletions
diff --git a/include/luaglue/LuaVM.hpp b/include/luaglue/LuaVM.hpp
index 7449b3f..48faaeb 100755
--- a/include/luaglue/LuaVM.hpp
+++ b/include/luaglue/LuaVM.hpp
@@ -9,6 +9,8 @@ class LuaVM
LuaVM( );
~LuaVM( );
+ void loadSource( const char *filename );
+
private:
void initialize( );
diff --git a/src/GNUmakefile b/src/GNUmakefile
index a29d3a7..24522c2 100755
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -18,23 +18,24 @@ local_test:
MEMCHECK=valgrind -v --leak-check=full --show-reachable=yes --num-callers=50 --suppressions=${HOME}/.valgrind-suppressions
MEMCHECKLEARN=$(MEMCHECK) --gen-suppressions=all
+VALKYRIE=valkyrie
PERFCHECK=${HOME}/scripts/qcachegrind.sh
LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/libutil:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser:$(TOPDIR)/src/libluaglue
run:
- @LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(TOPDIR)/src/crawl/crawl
+ LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(TOPDIR)/src/crawl/crawl $(TOPDIR)/src/crawl/crawl.conf
runmemcheck:
- @LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(MEMCHECK) $(TOPDIR)/src/crawl/crawl
+ @LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(MEMCHECK) $(TOPDIR)/src/crawl/crawl $(TOPDIR)/src/crawl/crawl.conf
runmemchecklearn:
- @LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(MEMCHECKLEARN) $(TOPDIR)/src/crawl/crawl
+ @LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(MEMCHECKLEARN) $(TOPDIR)/src/crawl/crawl $(TOPDIR)/src/crawl/crawl.conf
runmemcheckgui:
- @LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) valkyrie $(TOPDIR)/src/crawl/crawl
+ LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(VALKYRIE) $(TOPDIR)/src/crawl/crawl $(TOPDIR)/src/crawl/crawl.conf
runperfcheck:
- LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(PERFCHECK) $(TOPDIR)/src/crawl/crawl
+ @LD_LIBRARY_PATH=$(LD_LIBRARY_PATH) $(PERFCHECK) $(TOPDIR)/src/crawl/crawl $(TOPDIR)/src/crawl/crawl.conf
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf
new file mode 100644
index 0000000..95d0f75
--- /dev/null
+++ b/src/crawl/crawl.conf
@@ -0,0 +1,13 @@
+-- global settings
+
+crawler = {
+ -- stop after N documents
+ stop_after_N_operations = 10
+}
+
+-- seeds: URLs which are fed to the URL frontier at startup
+
+seeds = {
+ "http://www.wolframe.net",
+ "http://wolframe.net"
+}
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index 947976f..f10075f 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -55,14 +55,18 @@ BOOL WINAPI termHandler( DWORD ctrlType )
#endif
-int main( void )
+static int counter = 0;
+
+int main( int /* argc */, char *argv[] )
{
try {
LuaVM luaVm;
-
+
// Logger::instance( ).openConsoleLog( logINFO );
Logger::instance( ).openConsoleLog( logDEBUG );
+ luaVm.loadSource( argv[1] );
+
#ifndef _WIN32
struct sigaction sa;
memset( &sa, 0, sizeof( struct sigaction ) );
@@ -173,10 +177,10 @@ int main( void )
URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );
set<string> hosts;
- hosts.insert( "www.andreasbaumann.cc" );
+ hosts.insert( "andreasbaumann.dyndns.org" );
+// hosts.insert( "www.andreasbaumann.cc" );
// hosts.insert( "relevancy.bger.ch" );
// hosts.insert( "wolframe.net" );
-// hosts.insert( "andreasbaumann.dyndns.org" );
URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );
@@ -199,10 +203,10 @@ int main( void )
LOG( logNOTICE ) << "Crawler started..";
- frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
+// frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
// frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
// frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) );
-// frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/nzz/" ) );
+ frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/test/" ) );
URL url;
while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
@@ -246,6 +250,10 @@ int main( void )
}
//~ sleep( 2 );
+ counter++;
+ if( counter > 10 ) {
+ term = true;
+ }
#else
htmlParser->process( s );
#endif
@@ -274,5 +282,8 @@ int main( void )
} catch( exception &e ) {
LOG( logFATAL ) << "Crawler stopped: " << e.what( );
return 1;
+ } catch( ... ) {
+ LOG( logFATAL ) << "Crawler stopped due to unknown exception!";
+ return 1;
}
}
diff --git a/src/libluaglue/LuaVM.cpp b/src/libluaglue/LuaVM.cpp
index f268823..9363cac 100644
--- a/src/libluaglue/LuaVM.cpp
+++ b/src/libluaglue/LuaVM.cpp
@@ -1,6 +1,11 @@
#include "LuaVM.hpp"
-LuaVM::LuaVM( )
+#include <stdexcept>
+#include <sstream>
+
+using namespace std;
+
+LuaVM::LuaVM( ) : m_lua( 0 ) // start from a null state; initialize( ) assigns the real Lua state
{
initialize( );
}
@@ -13,4 +18,19 @@ LuaVM::~LuaVM( )
+// Create the Lua state owned by this VM and open the Lua standard
+// libraries in it.
+//
+// Throws std::runtime_error when the Lua state cannot be created.
void LuaVM::initialize( )
{
m_lua = luaL_newstate( );
+	// luaL_newstate( ) returns NULL when it cannot allocate the state;
+	// handing that NULL to luaL_openlibs( ) would crash, so fail loudly here
+	if( m_lua == 0 ) {
+		throw std::runtime_error( "Can't create Lua state" );
+	}
+
+	luaL_openlibs( m_lua );
+}
+
+// Load the Lua chunk stored in 'filename' into this VM.
+//
+// The chunk is only loaded with luaL_loadfile( ); it is not executed here.
+//
+// filename: path of the Lua source file. luaL_loadfile( ) treats a NULL
+//           filename as "read from standard input".
+//
+// Throws std::runtime_error, carrying the message reported by Lua, when
+// the file cannot be loaded.
+void LuaVM::loadSource( const char *filename )
+{
+	const int res = luaL_loadfile( m_lua, filename );
+	if( res != 0 ) {
+		// never stream a NULL 'const char *' into the message: guard both the
+		// file name and the error text taken from the top of the Lua stack
+		const char *name = ( filename != 0 ) ? filename : "<stdin>";
+		const char *msg = lua_tostring( m_lua, -1 );
+
+		ostringstream ss;
+		ss << "Can't read Lua source file from file '" << name << "': " << ( ( msg != 0 ) ? msg : "unknown error" );
+
+		// remove the error object so the Lua stack stays balanced
+		lua_pop( m_lua, 1 );
+		throw std::runtime_error( ss.str( ) );
+	}
}