author    Andreas Baumann <mail@andreasbaumann.cc>    2014-10-10 11:26:05 +0200
committer Andreas Baumann <mail@andreasbaumann.cc>    2014-10-10 11:26:05 +0200
commit    c5351f58bcf494a56ecfd17fe5e68eb3b17dac7d (patch)
tree      bca845c55a2a7f407a856950a6293f0759d2e391 /src/crawl
parent    5382e843f651f834c1df31bd494ed0638be07960 (diff)
added an execution function to LuaVm
reorganized the main lua.conf into functions (init, crawl, destroy) that the crawler now calls individually
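
The execution function itself lives in libcrawl and is not part of this diff. As a rough orientation only: calling a parameterless global Lua function such as init, crawl or destroy on a raw lua_State boils down to lua_getglobal( ) plus lua_pcall( ). The free-standing helper below is a sketch under that assumption, not the actual LuaVm code:

#include <lua.hpp>
#include <stdexcept>
#include <string>

// Call a parameterless global Lua function by name ("init", "crawl",
// "destroy"); throws if the function is missing or raises a Lua error.
static void executeFunction( lua_State *L, const std::string &name )
{
	lua_getglobal( L, name.c_str( ) );	// push the global onto the stack
	if( !lua_isfunction( L, -1 ) ) {
		lua_pop( L, 1 );
		throw std::runtime_error( "no global Lua function '" + name + "'" );
	}
	if( lua_pcall( L, 0, 0, 0 ) != 0 ) {	// 0 arguments, 0 results
		std::string msg = lua_tostring( L, -1 );
		lua_pop( L, 1 );
		throw std::runtime_error( msg );
	}
}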
Diffstat (limited to 'src/crawl')
-rw-r--r--  src/crawl/crawl.conf  33
-rwxr-xr-x  src/crawl/crawl.cpp   31
2 files changed, 45 insertions(+), 19 deletions(-)
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf
index 7c3fb80..816f23f 100644
--- a/src/crawl/crawl.conf
+++ b/src/crawl/crawl.conf
@@ -1,17 +1,12 @@
-local normalizer = GoogleURLNormalizer:new( )
-local baseUrl = normalizer:parseUrl( "http://www.base.com" )
-io.write( "base URL is: " .. baseUrl:str( ) .. "\n" )
-local url = normalizer:normalize( baseUrl, "/relativedir/relativefile.html" )
-io.write( "URL is: " .. url:str( ) .. "\n" )
-normalizer:delete( )
-
-- global setting
crawler = {
-- stop after N documents
- stop_after_N_operations = 10
+ stop_after_N_operations = 10,
+ module_path = "modules",
+ modules_search_recursive = true
}
logger = {
@@ -25,6 +20,11 @@ seeds = {
"http://wolframe.net"
}
+urlnormalizers = {
+ "mod_normalizer_simple",
+ "mod_normalizer_google"
+}
+
filters = {
-- allowed protocols to be fetched
protocols = {
@@ -39,6 +39,21 @@ filters = {
}
}
+
+function init( )
+ io.write( "Init..\n" )
+ normalizer = GoogleURLNormalizer:new( )
+end
+
+function destroy( )
+ io.write( "Destroy..\n" )
+ normalizer:delete( )
+end
+
function crawl( )
- io.write( "Crawling." )
+ io.write( "Crawling..\n" )
+ local baseUrl = normalizer:parseUrl( "http://www.base.com" )
+ io.write( "base URL is: " .. baseUrl:str( ) .. "\n" )
+ local url = normalizer:normalize( baseUrl, "/relativedir/relativefile.html" )
+ io.write( "URL is: " .. url:str( ) .. "\n" )
end
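
The conf now only declares which normalizer modules to use (urlnormalizers, module_path, modules_search_recursive); the crawl.cpp hunks below hand those to ModuleLoader<URLNormalizer>. The loader's internals are not part of this commit; on POSIX it presumably reduces to dlopen( )/dlsym( ). A minimal sketch under that assumption; the "create" factory symbol is hypothetical:

#include <dlfcn.h>
#include <stdexcept>
#include <string>

typedef void *(*CreateFunc)( void );

// Load one module shared object and resolve its factory function. The
// entry point name "create" is a placeholder, not ModuleLoader's real
// symbol contract.
static CreateFunc loadModule( const std::string &path )
{
	void *handle = dlopen( path.c_str( ), RTLD_NOW );
	if( !handle )
		throw std::runtime_error( dlerror( ) );

	CreateFunc create = (CreateFunc)dlsym( handle, "create" );
	if( !create )
		throw std::runtime_error( "no 'create' symbol in " + path );

	return create;
}

(Link with -ldl on Linux.)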
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index 3a8fdff..e26de0a 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -71,8 +71,11 @@ int main( int /* argc */, char *argv[] )
// Logger::instance( ).openConsoleLog( logINFO );
Logger::instance( ).openConsoleLog( logDEBUG );
+ // load the configuration (Lua) and execute main (to
+ // get the basic configuration in the form of global
+ // variables)
luaVm.loadSource( argv[1] );
- //luaVm.executeMain( );
+ luaVm.executeMain( );
#ifndef _WIN32
struct sigaction sa;
@@ -86,6 +89,7 @@ int main( int /* argc */, char *argv[] )
SetConsoleCtrlHandler( termHandler, TRUE );
#endif
+ // go through all types of modules and load them with the proper loader
LOG( logNOTICE ) << "Loading modules";
vector<string> normalizerModules;
@@ -98,6 +102,22 @@ int main( int /* argc */, char *argv[] )
#endif
ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules, CLOSE_DEFERRED, (void *)&luaVm );
+#ifdef WITH_LUA
+ // TODO: should be in the loading function of libcrawl
+ tolua_URL_open( luaVm.handle( ) );
+#endif
+
+ // initialize the crawler
+ luaVm.executeFunction( "init" );
+
+ // perform a crawl step
+ luaVm.executeFunction( "crawl" );
+
+ // cleaning up
+ luaVm.executeFunction( "destroy" );
+
+ return 0;
+
vector<string> filterModules;
#ifndef _WIN32
filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" );
@@ -287,15 +307,6 @@ int main( int /* argc */, char *argv[] )
LOG( logNOTICE ) << "Crawler stopped.. normal shutdown..";
-#ifdef WITH_LUA
- // TODO: should be in the loading function of libcrawl
- tolua_URL_open( luaVm.handle( ) );
-#endif
-
- luaVm.executeMain( );
- //luaVm.dumpState( );
-
- return 0;
} catch( exception &e ) {
LOG( logFATAL ) << "Crawler stopped: " << e.what( );
return 1;
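
A closing note on executeMain( ): as the comment added at the top of main( ) says, running the conf once makes its globals (crawler, logger, seeds, urlnormalizers, filters) visible to the C++ side. How LuaVm reads them back is not shown in this commit; with the plain Lua C API, fetching e.g. crawler.stop_after_N_operations would look roughly like this (sketch, helper name assumed):

#include <lua.hpp>

// Read crawler.stop_after_N_operations after the conf has been executed;
// falls back to 'def' if the table or the field is missing.
static int getStopAfter( lua_State *L, int def )
{
	int result = def;
	lua_getglobal( L, "crawler" );		// push the 'crawler' table (or nil)
	if( lua_istable( L, -1 ) ) {
		lua_getfield( L, -1, "stop_after_N_operations" );
		if( lua_isnumber( L, -1 ) )
			result = (int)lua_tointeger( L, -1 );
		lua_pop( L, 1 );		// pop the field
	}
	lua_pop( L, 1 );			// pop the table (or nil)
	return result;
}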