From 971d5d22e7117acb95c7903dd5b911b96fc97dcf Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Thu, 16 Oct 2014 13:29:01 +0200 Subject: creating all module constructors now from Lua configuration --- src/crawl/crawl.conf | 38 +++++++++++ src/crawl/crawl.cpp | 175 ++++++++++++++++++++++----------------------------- 2 files changed, 114 insertions(+), 99 deletions(-) diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf index bfcd07b..fd9776f 100644 --- a/src/crawl/crawl.conf +++ b/src/crawl/crawl.conf @@ -17,7 +17,45 @@ modules = { urlnormalizers = { "mod_urlnormalizer_simple", "mod_urlnormalizer_googleurl" + }, + + urlfilters = { + "mod_urlfilter_host", + "mod_urlfilter_protocol" + }, + + urlchainfilters = { + "mod_urlfilter_chain" + }, + + urlfrontiers = { + "mod_frontier_memory" + }, + + fetchers = { + "mod_fetcher_libfetch", + "mod_fetcher_libcurl", + "mod_fetcher_winhttp" + }, + + urlseens = { + "mod_urlseen_memory" + }, + + dedupers = { + "mod_deduper_null" + }, + + processors = { + "mod_processor_htmllinkextract", + "mod_processor_robotstxt", + "mod_processor_sitemap" + }, + + typedetects = { + "mod_typedetect_libmagic" } + } -- seeds: URLS which are fed in the beginning to the URL frontier diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp index 7d6622e..77b6876 100755 --- a/src/crawl/crawl.cpp +++ b/src/crawl/crawl.cpp @@ -61,6 +61,35 @@ BOOL WINAPI termHandler( DWORD ctrlType ) static int counter = 0; +static vector searchModuleFiles( const vector &modules, const vector &entries ) +{ + vector moduleFiles; + vector::const_iterator it2, end2 = modules.end( ); + for( it2 = modules.begin( ); it2 != end2; it2++ ) { +#ifndef _WIN32 + string module = *it2 + ".so"; +#else + string module = *it2 + ".dll"; +#endif + cout << "Searching for module '" << module << "'" << endl; + + bool found = false; + vector::const_iterator it, end = entries.end( ); + for( it = entries.begin( ); it != end; it++ ) { + if( endswith( *it, module ) ) { + cout << " Found in file '" << *it << "'" << endl; + moduleFiles.push_back( *it ); + found = true; + } + } + if( !found ) { + cout << " Module '" << module << "' not found" << endl; + } + } + + return moduleFiles; +} + int main( int /* argc */, char *argv[] ) { try { @@ -79,6 +108,8 @@ int main( int /* argc */, char *argv[] ) std::string logLevel = luaVm.getString( "logger.level" ); Logger::instance( ).openConsoleLog( Logger::fromString( logLevel ) ); + int stopAfterNOperations = luaVm.getInt( "crawler.stop_after_N_operations" ); + #ifndef _WIN32 struct sigaction sa; memset( &sa, 0, sizeof( struct sigaction ) ); @@ -91,119 +122,65 @@ int main( int /* argc */, char *argv[] ) SetConsoleCtrlHandler( termHandler, TRUE ); #endif - // go through all type of modules and load them with the proper loader + // go through all type of modules and load them with the proper loader + string modulePath = luaVm.getString( "crawler.module_path" ); bool modulesSearchRecursive = luaVm.getBoolean( "crawler.modules_search_recursive" ); LOG( logNOTICE ) << "Loading modules from path '" << modulePath << "' " << ( modulesSearchRecursive ? "(recursive)" : "" ); - vector entries = directory_entries( modulePath, true, modulesSearchRecursive ); - - vector modules = luaVm.getStringArray( "modules.urlnormalizers" ); - vector normalizerModules; - vector::const_iterator it2, end2 = modules.end( ); - for( it2 = modules.begin( ); it2 != end2; it2++ ) { -#ifndef _WIN32 - string module = *it2 + ".so"; -#else - string module = *it2 + ".dll"; -#endif - cout << "Searching for module '" << module << "'" << endl; - - vector::const_iterator it, end = entries.end( ); - for( it = entries.begin( ); it != end; it++ ) { - if( endswith( *it, module ) ) { - cout << " Found in file '" << *it << "'" << endl; - normalizerModules.push_back( *it ); - } - } - } - ModuleLoader urlNormalizers( normalizerModules, CLOSE_DEFERRED, (void *)&luaVm ); - - // initialize crawler function - luaVm.executeFunction( "init" ); - - // perform a crawl step - luaVm.executeFunction( "crawl" ); - - // cleaning up - luaVm.executeFunction( "destroy" ); + vector allModuleFiles = directory_entries( modulePath, true, modulesSearchRecursive ); + vector modules; - return 0; + modules = luaVm.getStringArray( "modules.urlnormalizers" ); + vector normalizerModules = searchModuleFiles( modules, allModuleFiles ); + ModuleLoader urlNormalizers( normalizerModules, CLOSE_DEFERRED, (void *)&luaVm ); - vector filterModules; -#ifndef _WIN32 - filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" ); - filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" ); -#else - filterModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" ); - filterModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" ); -#endif + modules = luaVm.getStringArray( "modules.urlfilters" ); + vector filterModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader ) > urlFilters( filterModules ); - vector filterChainModules; -#ifndef _WIN32 - filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" ); -#else - filterChainModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" ); -#endif + modules = luaVm.getStringArray( "modules.urlchainfilters" ); + vector filterChainModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader ) > urlChainFilter( filterChainModules ); - - vector frontierModules; -#ifndef _WIN32 - frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" ); -#else - frontierModules.push_back( ".\\modules\\frontier\\memory\\mod_frontier_memory.dll" ); -#endif + + modules = luaVm.getStringArray( "modules.urlfrontiers" ); + vector frontierModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader frontiers( frontierModules ); - - vector fetcherModules; -#ifndef _WIN32 - fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" ); - fetcherModules.push_back( "./modules/fetcher/libcurl/mod_fetcher_libcurl.so" ); -#else - fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" ); -#endif + + modules = luaVm.getStringArray( "modules.fetchers" ); + vector fetcherModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader fetchers( fetcherModules ); - - vector urlseenModules; -#ifndef _WIN32 - urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" ); -#else - urlseenModules.push_back( ".\\modules\\urlseen\\memory\\mod_urlseen_memory.dll" ); -#endif + + modules = luaVm.getStringArray( "modules.urlseens" ); + vector urlseenModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader urlSeens( urlseenModules ); - - vector deduperModules; -#ifndef _WIN32 - deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" ); -#else - deduperModules.push_back( ".\\modules\\deduper\\null\\mod_deduper_null.dll" ); -#endif + + modules = luaVm.getStringArray( "modules.dedupers" ); + vector deduperModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader dedupers( deduperModules ); - - vector processorModules; -#ifndef _WIN32 - processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" ); - processorModules.push_back( "./modules/processor/robotstxt/mod_processor_robotstxt.so" ); - processorModules.push_back( "./modules/processor/sitemap/mod_processor_sitemap.so" ); -#else - processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" ); - processorModules.push_back( ".\\modules\\processor\\robotstxt\\mod_processor_robotstxt.dll" ); - processorModules.push_back( ".\\modules\\processor\\sitemap\\mod_processor_sitemap.dll" ); -#endif + + modules = luaVm.getStringArray( "modules.processors" ); + vector processorModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader processors( processorModules ); - - vector typeDetectModules; -#ifndef _WIN32 - typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic.so" ); -#endif + + modules = luaVm.getStringArray( "modules.typedetects" ); + vector typeDetectModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader typeDetectors( typeDetectModules ); + // initialize crawler function + luaVm.executeFunction( "init" ); + + // perform a crawl step + luaVm.executeFunction( "crawl" ); + + // cleaning up + luaVm.executeFunction( "destroy" ); + Frontier *frontier = frontiers.create( "memory_frontier" ); #ifndef _WIN32 -// Fetcher *fetcher = fetchers.create( "libfetch_fetcher" ); - Fetcher *fetcher = fetchers.create( "libcurl_fetcher" ); + Fetcher *fetcher = fetchers.create( "libfetch_fetcher" ); +// Fetcher *fetcher = fetchers.create( "libcurl_fetcher" ); #else Fetcher *fetcher = fetchers.create( "winhttp_fetcher" ); #endif @@ -219,9 +196,9 @@ int main( int /* argc */, char *argv[] ) URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols ); set hosts; - hosts.insert( "andreasbaumann.dyndns.org" ); +// hosts.insert( "andreasbaumann.dyndns.org" ); // hosts.insert( "www.andreasbaumann.cc" ); -// hosts.insert( "relevancy.bger.ch" ); + hosts.insert( "relevancy.bger.ch" ); // hosts.insert( "wolframe.net" ); URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts ); @@ -246,9 +223,9 @@ int main( int /* argc */, char *argv[] ) LOG( logNOTICE ) << "Crawler started.."; // frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) ); -// frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) ); + frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) ); // frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) ); - frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/test/" ) ); +// frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/test/" ) ); URL url; while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) { @@ -293,7 +270,7 @@ int main( int /* argc */, char *argv[] ) //~ sleep( 2 ); counter++; - if( counter > 0 ) { + if( counter > stopAfterNOperations ) { term = true; } #else -- cgit v1.2.3-54-g00ecf