diff options
author | Andreas Baumann <mail@andreasbaumann.cc> | 2014-10-16 13:29:01 +0200 |
---|---|---|
committer | Andreas Baumann <mail@andreasbaumann.cc> | 2014-10-16 13:29:01 +0200 |
commit | 971d5d22e7117acb95c7903dd5b911b96fc97dcf (patch) | |
tree | 079fb0e064e1c4a35dbd27821e993b573d268ba2 /src/crawl | |
parent | ff403df10813717698dc47e0b22f19d62c007cff (diff) | |
download | crawler-971d5d22e7117acb95c7903dd5b911b96fc97dcf.tar.gz crawler-971d5d22e7117acb95c7903dd5b911b96fc97dcf.tar.bz2 |
creating all module constructors now from Lua configuration
Diffstat (limited to 'src/crawl')
-rw-r--r-- | src/crawl/crawl.conf | 38 | ||||
-rwxr-xr-x | src/crawl/crawl.cpp | 175 |
2 files changed, 114 insertions, 99 deletions
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf index bfcd07b..fd9776f 100644 --- a/src/crawl/crawl.conf +++ b/src/crawl/crawl.conf @@ -17,7 +17,45 @@ modules = { urlnormalizers = { "mod_urlnormalizer_simple", "mod_urlnormalizer_googleurl" + }, + + urlfilters = { + "mod_urlfilter_host", + "mod_urlfilter_protocol" + }, + + urlchainfilters = { + "mod_urlfilter_chain" + }, + + urlfrontiers = { + "mod_frontier_memory" + }, + + fetchers = { + "mod_fetcher_libfetch", + "mod_fetcher_libcurl", + "mod_fetcher_winhttp" + }, + + urlseens = { + "mod_urlseen_memory" + }, + + dedupers = { + "mod_deduper_null" + }, + + processors = { + "mod_processor_htmllinkextract", + "mod_processor_robotstxt", + "mod_processor_sitemap" + }, + + typedetects = { + "mod_typedetect_libmagic" } + } -- seeds: URLS which are fed in the beginning to the URL frontier diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp index 7d6622e..77b6876 100755 --- a/src/crawl/crawl.cpp +++ b/src/crawl/crawl.cpp @@ -61,6 +61,35 @@ BOOL WINAPI termHandler( DWORD ctrlType ) static int counter = 0; +static vector<string> searchModuleFiles( const vector<string> &modules, const vector<string> &entries ) +{ + vector<string> moduleFiles; + vector<string>::const_iterator it2, end2 = modules.end( ); + for( it2 = modules.begin( ); it2 != end2; it2++ ) { +#ifndef _WIN32 + string module = *it2 + ".so"; +#else + string module = *it2 + ".dll"; +#endif + cout << "Searching for module '" << module << "'" << endl; + + bool found = false; + vector<string>::const_iterator it, end = entries.end( ); + for( it = entries.begin( ); it != end; it++ ) { + if( endswith( *it, module ) ) { + cout << " Found in file '" << *it << "'" << endl; + moduleFiles.push_back( *it ); + found = true; + } + } + if( !found ) { + cout << " Module '" << module << "' not found" << endl; + } + } + + return moduleFiles; +} + int main( int /* argc */, char *argv[] ) { try { @@ -79,6 +108,8 @@ int main( int /* argc */, char *argv[] ) std::string logLevel = luaVm.getString( "logger.level" ); Logger::instance( ).openConsoleLog( Logger::fromString( logLevel ) ); + int stopAfterNOperations = luaVm.getInt( "crawler.stop_after_N_operations" ); + #ifndef _WIN32 struct sigaction sa; memset( &sa, 0, sizeof( struct sigaction ) ); @@ -91,119 +122,65 @@ int main( int /* argc */, char *argv[] ) SetConsoleCtrlHandler( termHandler, TRUE ); #endif - // go through all type of modules and load them with the proper loader + // go through all type of modules and load them with the proper loader + string modulePath = luaVm.getString( "crawler.module_path" ); bool modulesSearchRecursive = luaVm.getBoolean( "crawler.modules_search_recursive" ); LOG( logNOTICE ) << "Loading modules from path '" << modulePath << "' " << ( modulesSearchRecursive ? "(recursive)" : "" ); - vector<string> entries = directory_entries( modulePath, true, modulesSearchRecursive ); - - vector<string> modules = luaVm.getStringArray( "modules.urlnormalizers" ); - vector<string> normalizerModules; - vector<string>::const_iterator it2, end2 = modules.end( ); - for( it2 = modules.begin( ); it2 != end2; it2++ ) { -#ifndef _WIN32 - string module = *it2 + ".so"; -#else - string module = *it2 + ".dll"; -#endif - cout << "Searching for module '" << module << "'" << endl; - - vector<string>::const_iterator it, end = entries.end( ); - for( it = entries.begin( ); it != end; it++ ) { - if( endswith( *it, module ) ) { - cout << " Found in file '" << *it << "'" << endl; - normalizerModules.push_back( *it ); - } - } - } - ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules, CLOSE_DEFERRED, (void *)&luaVm ); - - // initialize crawler function - luaVm.executeFunction( "init" ); - - // perform a crawl step - luaVm.executeFunction( "crawl" ); - - // cleaning up - luaVm.executeFunction( "destroy" ); + vector<string> allModuleFiles = directory_entries( modulePath, true, modulesSearchRecursive ); + vector<string> modules; - return 0; + modules = luaVm.getStringArray( "modules.urlnormalizers" ); + vector<string> normalizerModules = searchModuleFiles( modules, allModuleFiles ); + ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules, CLOSE_DEFERRED, (void *)&luaVm ); - vector<string> filterModules; -#ifndef _WIN32 - filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" ); - filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" ); -#else - filterModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" ); - filterModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" ); -#endif + modules = luaVm.getStringArray( "modules.urlfilters" ); + vector<string> filterModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules ); - vector<string> filterChainModules; -#ifndef _WIN32 - filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" ); -#else - filterChainModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" ); -#endif + modules = luaVm.getStringArray( "modules.urlchainfilters" ); + vector<string> filterChainModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules ); - - vector<string> frontierModules; -#ifndef _WIN32 - frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" ); -#else - frontierModules.push_back( ".\\modules\\frontier\\memory\\mod_frontier_memory.dll" ); -#endif + + modules = luaVm.getStringArray( "modules.urlfrontiers" ); + vector<string> frontierModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader<Frontier> frontiers( frontierModules ); - - vector<string> fetcherModules; -#ifndef _WIN32 - fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" ); - fetcherModules.push_back( "./modules/fetcher/libcurl/mod_fetcher_libcurl.so" ); -#else - fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" ); -#endif + + modules = luaVm.getStringArray( "modules.fetchers" ); + vector<string> fetcherModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader<Fetcher> fetchers( fetcherModules ); - - vector<string> urlseenModules; -#ifndef _WIN32 - urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" ); -#else - urlseenModules.push_back( ".\\modules\\urlseen\\memory\\mod_urlseen_memory.dll" ); -#endif + + modules = luaVm.getStringArray( "modules.urlseens" ); + vector<string> urlseenModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader<URLSeen> urlSeens( urlseenModules ); - - vector<string> deduperModules; -#ifndef _WIN32 - deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" ); -#else - deduperModules.push_back( ".\\modules\\deduper\\null\\mod_deduper_null.dll" ); -#endif + + modules = luaVm.getStringArray( "modules.dedupers" ); + vector<string> deduperModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader<Deduper> dedupers( deduperModules ); - - vector<string> processorModules; -#ifndef _WIN32 - processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" ); - processorModules.push_back( "./modules/processor/robotstxt/mod_processor_robotstxt.so" ); - processorModules.push_back( "./modules/processor/sitemap/mod_processor_sitemap.so" ); -#else - processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" ); - processorModules.push_back( ".\\modules\\processor\\robotstxt\\mod_processor_robotstxt.dll" ); - processorModules.push_back( ".\\modules\\processor\\sitemap\\mod_processor_sitemap.dll" ); -#endif + + modules = luaVm.getStringArray( "modules.processors" ); + vector<string> processorModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules ); - - vector<string> typeDetectModules; -#ifndef _WIN32 - typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic.so" ); -#endif + + modules = luaVm.getStringArray( "modules.typedetects" ); + vector<string> typeDetectModules = searchModuleFiles( modules, allModuleFiles ); ModuleLoader<TypeDetect> typeDetectors( typeDetectModules ); + // initialize crawler function + luaVm.executeFunction( "init" ); + + // perform a crawl step + luaVm.executeFunction( "crawl" ); + + // cleaning up + luaVm.executeFunction( "destroy" ); + Frontier *frontier = frontiers.create( "memory_frontier" ); #ifndef _WIN32 -// Fetcher *fetcher = fetchers.create( "libfetch_fetcher" ); - Fetcher *fetcher = fetchers.create( "libcurl_fetcher" ); + Fetcher *fetcher = fetchers.create( "libfetch_fetcher" ); +// Fetcher *fetcher = fetchers.create( "libcurl_fetcher" ); #else Fetcher *fetcher = fetchers.create( "winhttp_fetcher" ); #endif @@ -219,9 +196,9 @@ int main( int /* argc */, char *argv[] ) URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols ); set<string> hosts; - hosts.insert( "andreasbaumann.dyndns.org" ); +// hosts.insert( "andreasbaumann.dyndns.org" ); // hosts.insert( "www.andreasbaumann.cc" ); -// hosts.insert( "relevancy.bger.ch" ); + hosts.insert( "relevancy.bger.ch" ); // hosts.insert( "wolframe.net" ); URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts ); @@ -246,9 +223,9 @@ int main( int /* argc */, char *argv[] ) LOG( logNOTICE ) << "Crawler started.."; // frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) ); -// frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) ); + frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) ); // frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) ); - frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/test/" ) ); +// frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/test/" ) ); URL url; while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) { @@ -293,7 +270,7 @@ int main( int /* argc */, char *argv[] ) //~ sleep( 2 ); counter++; - if( counter > 0 ) { + if( counter > stopAfterNOperations ) { term = true; } #else |