summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Andreas Baumann <mail@andreasbaumann.cc> 2014-10-16 15:50:30 +0200
committer: Andreas Baumann <mail@andreasbaumann.cc> 2014-10-16 15:50:30 +0200
commit: 4ff9da462748b8811cdbe27c7ef3babaa5c119c3 (patch)
tree: 70309bfce4ea970540352ffad7b191e72a0b488f
parent: 971d5d22e7117acb95c7903dd5b911b96fc97dcf (diff)
download: crawler-4ff9da462748b8811cdbe27c7ef3babaa5c119c3.tar.gz
download: crawler-4ff9da462748b8811cdbe27c7ef3babaa5c119c3.tar.bz2
testing with two urlnormalizer modules (google and simple) in crawl.conf
-rwxr-xr-x  include/module/ModuleLoader.hpp  8
-rw-r--r--  src/crawl/crawl.conf  11
-rw-r--r--  src/modules/urlnormalizer/simpleurl/GNUmakefile  33
-rwxr-xr-x  src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp  21
4 files changed, 65 insertions, 8 deletions
diff --git a/include/module/ModuleLoader.hpp b/include/module/ModuleLoader.hpp
index b0cddd6..0acc445 100755
--- a/include/module/ModuleLoader.hpp
+++ b/include/module/ModuleLoader.hpp
@@ -232,7 +232,7 @@ class ModuleLoader< Interface, TYPELIST_1( T1 ) > : public BaseModuleLoader< Int
std::string clazz = demangle( typeid( *obj ) );
- BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
+ (void)BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
return obj;
}
@@ -261,7 +261,7 @@ class ModuleLoader< Interface, TYPELIST_2( T1, T2 ) > : public BaseModuleLoader<
std::string clazz = demangle( typeid( *obj ) );
- BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
+ (void)BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
return obj;
}
@@ -290,7 +290,7 @@ class ModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) > : public BaseModuleLoa
std::string clazz = demangle( typeid( *obj ) );
- BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
+ (void)BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
return obj;
}
@@ -319,7 +319,7 @@ class ModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) > : public BaseModul
std::string clazz = demangle( typeid( *obj ) );
- BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
+ (void)BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
return obj;
}
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf
index fd9776f..154d90a 100644
--- a/src/crawl/crawl.conf
+++ b/src/crawl/crawl.conf
@@ -2,7 +2,7 @@
crawler = {
-- stop after N documents
- stop_after_N_operations = 10,
+ stop_after_N_operations = 0,
module_path = "modules",
@@ -82,7 +82,12 @@ filters = {
function init( )
io.write( "Init..\n" )
+ -- normalizer = urlnormalizers.create( "google_urlnormalizer" );
normalizer = GoogleURLNormalizer:new( )
+ -- normalizer2 = urlnormalizers.create( "simple_urlnormalizer" );
+ normalizer2 = SimpleURLNormalizer:new( )
+ base = tolua.cast( normalizer, "URLNormalizer" )
+ io.write( "type: " .. tolua.type( base ) .. "\n" )
end
function destroy( )
@@ -92,8 +97,8 @@ end
function crawl( )
io.write( "Crawling..\n" )
- local baseUrl = normalizer:parseUrl( "http://www.base.com" )
+ local baseUrl = base:parseUrl( "http://www.base.com" )
io.write( "base URL is: " .. baseUrl:str( ) .. "\n" )
- local url = normalizer:normalize( baseUrl, "/relativedir/relativefile.html" )
+ local url = base:normalize( baseUrl, "/relativedir/relativefile.html" )
io.write( "URL is: " .. url:str( ) .. "\n" )
end
diff --git a/src/modules/urlnormalizer/simpleurl/GNUmakefile b/src/modules/urlnormalizer/simpleurl/GNUmakefile
index d7e6378..64053a9 100644
--- a/src/modules/urlnormalizer/simpleurl/GNUmakefile
+++ b/src/modules/urlnormalizer/simpleurl/GNUmakefile
@@ -10,14 +10,36 @@ INCLUDE_DIRS = \
-I$(TOPDIR)/include/util \
-I$(TOPDIR)/include/crawler
+ifeq ($(WITH_LUA),1)
+INCLUDE_DIRS += \
+ -I$(TOPDIR)/include/luaglue \
+ $(TOLUA_INCLUDES)
+endif
+
INCLUDE_CXXFLAGS = \
+ifeq ($(WITH_LUA),1)
+INCLUDE_CXXFLAGS += \
+ -DWITH_LUA
+endif
+
INCLUDE_LDFLAGS = \
-L$(TOPDIR)/src/libcrawler
+ifeq ($(WITH_LUA),1)
+INCLUDE_LDFLAGS += \
+ $(TOLUA_LDFLAGS)
+endif
+
INCLUDE_LIBS = \
-lcrawler
+ifeq ($(WITH_LUA),1)
+INCLUDE_LIBS += \
+ -llua \
+ $(TOLUA_LIBS)
+endif
+
DYNAMIC_MODULE = \
mod_urlnormalizer_simple.so
@@ -27,11 +49,22 @@ STATIC_LIB = \
CPP_OBJS = \
SimpleURLNormalizer.o
+ifeq ($(WITH_LUA),1)
+CPP_OBJS += \
+ SimpleURLNormalizerLua.o
+endif
+
-include $(TOPDIR)/makefiles/gmake/sub.mk
+SimpleURLNormalizerLua.cpp: SimpleURLNormalizer.pkg
+ $(TOLUA) -H SimpleURLNormalizerLua.hpp -o SimpleURLNormalizerLua.cpp SimpleURLNormalizer.pkg
+
local_all:
local_clean:
+ifeq ($(WITH_LUA),1)
+ @-rm SimpleURLNormalizerLua.cpp SimpleURLNormalizerLua.hpp
+endif
local_distclean:
diff --git a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
index 8a7efd7..0ab1d05 100755
--- a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
+++ b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
@@ -3,6 +3,12 @@
#include "SimpleURLNormalizer.hpp"
+#ifdef WITH_LUA
+#include "tolua.h"
+#include "SimpleURLNormalizerLua.hpp"
+#include "LuaVM.hpp"
+#endif
+
using namespace std;
SimpleURLNormalizer::SimpleURLNormalizer( )
@@ -150,4 +156,17 @@ void SimpleURLNormalizer::normalizePath( string &path )
}
}
-REGISTER_MODULE( "simple_urlnormalizer", 0, 0, URLNormalizer, SimpleURLNormalizer )
+static void initModule( void *user_data )
+{
+#ifdef WITH_LUA
+ LuaVM *luaVm = (LuaVM *)user_data;
+
+ tolua_SimpleURLNormalizer_open( luaVm->handle( ) );
+#endif
+}
+
+static void destroyModule( void * /* user_data */ )
+{
+}
+
+REGISTER_MODULE( "simple_urlnormalizer", &initModule, &destroyModule, URLNormalizer, SimpleURLNormalizer )