diff options
author | Andreas Baumann <mail@andreasbaumann.cc> | 2014-10-09 08:59:02 +0200 |
---|---|---|
committer | Andreas Baumann <mail@andreasbaumann.cc> | 2014-10-09 08:59:02 +0200 |
commit | 7d8b1ff684b412da292e0fc734748975188a0f10 (patch) | |
tree | 2673e3da51cc80bfc38a426048b30a4d71c31d4c /src | |
parent | 62c5bb90525baf0d82c23892c2666f611750d63c (diff) | |
download | crawler-7d8b1ff684b412da292e0fc734748975188a0f10.tar.gz crawler-7d8b1ff684b412da292e0fc734748975188a0f10.tar.bz2 |
First trials with a Google URL normalizer called from Lua; std::string is currently the problem,
as is the missing wrapper for the URL class.
Also added a local 'tolua'; we will have to hack it.
Diffstat (limited to 'src')
-rw-r--r-- | src/crawl/crawl.conf | 4 | ||||
-rwxr-xr-x | src/crawl/crawl.cpp | 7 | ||||
-rw-r--r-- | src/modules/urlnormalizer/googleurl/GNUmakefile | 26 | ||||
-rwxr-xr-x | src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp | 21 | ||||
-rw-r--r-- | src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.pkg | 14 |
5 files changed, 68 insertions, 4 deletions
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf index ddc1da6..a524eaf 100644 --- a/src/crawl/crawl.conf +++ b/src/crawl/crawl.conf @@ -1,3 +1,7 @@ +local normalizer = GoogleURLNormalizer:new( ) +local baseUrl = normalizer:parseUrl( "http://www.base.com" ) +-- normalizer:normalize( base, "/relativedir/relativefile.html" ) + -- global setting crawler = { diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp index 14a02dd..08d3dbf 100755 --- a/src/crawl/crawl.cpp +++ b/src/crawl/crawl.cpp @@ -66,7 +66,7 @@ int main( int /* argc */, char *argv[] ) Logger::instance( ).openConsoleLog( logDEBUG ); luaVm.loadSource( argv[1] ); - luaVm.executeMain( ); + //luaVm.executeMain( ); #ifndef _WIN32 struct sigaction sa; @@ -254,7 +254,7 @@ int main( int /* argc */, char *argv[] ) //~ sleep( 2 ); counter++; - if( counter > 10 ) { + if( counter > 0 ) { term = true; } #else @@ -281,7 +281,8 @@ int main( int /* argc */, char *argv[] ) LOG( logNOTICE ) << "Crawler stopped.. normal shutdown.."; - luaVm.dumpState( ); + luaVm.executeMain( ); + //luaVm.dumpState( ); return 0; } catch( exception &e ) { diff --git a/src/modules/urlnormalizer/googleurl/GNUmakefile b/src/modules/urlnormalizer/googleurl/GNUmakefile index ddd9e73..da181aa 100644 --- a/src/modules/urlnormalizer/googleurl/GNUmakefile +++ b/src/modules/urlnormalizer/googleurl/GNUmakefile @@ -11,8 +11,18 @@ INCLUDE_DIRS = \ -I$(TOPDIR)/include/crawler \ -I$(TOPDIR)/googleurl +ifeq ($(WITH_LUA),1) +INCLUDE_DIRS += \ + -I$(TOPDIR)/include/luaglue +endif + INCLUDE_CXXFLAGS = \ +ifeq ($(WITH_LUA),1) +INCLUDE_CXXFLAGS += \ + -DWITH_LUA +endif + INCLUDE_LDFLAGS = \ -L$(TOPDIR)/googleurl \ -L$(TOPDIR)/src/libcrawler @@ -22,6 +32,11 @@ INCLUDE_LIBS = \ -lcrawler \ -licui18n -licuuc +ifeq ($(WITH_LUA),1) +INCLUDE_LIBS += \ + -ltolua -llua +endif + DYNAMIC_MODULE = \ mod_urlnormalizer_googleurl.so @@ -31,11 +46,22 @@ STATIC_LIB = \ CPP_OBJS = \ GoogleURLNormalizer.o +ifeq ($(WITH_LUA),1) +CPP_OBJS += \ + GoogleURLNormalizerLua.o 
+endif + -include $(TOPDIR)/makefiles/gmake/sub.mk +GoogleURLNormalizerLua.cpp: GoogleURLNormalizer.pkg + tolua -H GoogleURLNormalizerLua.hpp -o GoogleURLNormalizerLua.cpp GoogleURLNormalizer.pkg + local_all: local_clean: +ifeq ($(WITH_LUA),1) + @-rm GoogleURLNormalizerLua.cpp GoogleURLNormalizerLua.hpp +endif local_distclean: diff --git a/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp index ea04980..734afea 100755 --- a/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp +++ b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp @@ -6,6 +6,12 @@ #include "url_canon_stdstring.h" #include "url_parse.h" +#ifdef WITH_LUA +#include "tolua.h" +#include "GoogleURLNormalizerLua.hpp" +#include "LuaVM.hpp" +#endif + using namespace std; using namespace url_util; using namespace url_canon; @@ -106,4 +112,17 @@ URL GoogleURLNormalizer::normalize( const URL url, const string s ) "" ); } -REGISTER_MODULE( "google_urlnormalizer", 0, 0, URLNormalizer, GoogleURLNormalizer ) +static void initModule( void *user_data ) +{ +#ifdef WITH_LUA + LuaVM *luaVm = (LuaVM *)user_data; + + tolua_GoogleURLNormalizer_open( luaVm->handle( ) ); +#endif +} + +static void destroyModule( void * /* user_data */ ) +{ +} + +REGISTER_MODULE( "google_urlnormalizer", &initModule, &destroyModule, URLNormalizer, GoogleURLNormalizer ) diff --git a/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.pkg b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.pkg new file mode 100644 index 0000000..ca62fe3 --- /dev/null +++ b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.pkg @@ -0,0 +1,14 @@ +$#include "GoogleURLNormalizer.hpp" + +$using std::string; + +class GoogleURLNormalizer : public URLNormalizer +{ + GoogleURLNormalizer( ) { } + + virtual ~GoogleURLNormalizer( ) { } + + virtual URL parseUrl( string s ); + + virtual URL normalize( const URL url, const string s ); +}; |