summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <mail@andreasbaumann.cc>2014-10-16 13:29:01 +0200
committerAndreas Baumann <mail@andreasbaumann.cc>2014-10-16 13:29:01 +0200
commit971d5d22e7117acb95c7903dd5b911b96fc97dcf (patch)
tree079fb0e064e1c4a35dbd27821e993b573d268ba2
parentff403df10813717698dc47e0b22f19d62c007cff (diff)
downloadcrawler-971d5d22e7117acb95c7903dd5b911b96fc97dcf.tar.gz
crawler-971d5d22e7117acb95c7903dd5b911b96fc97dcf.tar.bz2
creating all module constructors now from Lua configuration
-rw-r--r--src/crawl/crawl.conf38
-rwxr-xr-xsrc/crawl/crawl.cpp175
2 files changed, 114 insertions, 99 deletions
diff --git a/src/crawl/crawl.conf b/src/crawl/crawl.conf
index bfcd07b..fd9776f 100644
--- a/src/crawl/crawl.conf
+++ b/src/crawl/crawl.conf
@@ -17,7 +17,45 @@ modules = {
urlnormalizers = {
"mod_urlnormalizer_simple",
"mod_urlnormalizer_googleurl"
+ },
+
+ urlfilters = {
+ "mod_urlfilter_host",
+ "mod_urlfilter_protocol"
+ },
+
+ urlchainfilters = {
+ "mod_urlfilter_chain"
+ },
+
+ urlfrontiers = {
+ "mod_frontier_memory"
+ },
+
+ fetchers = {
+ "mod_fetcher_libfetch",
+ "mod_fetcher_libcurl",
+ "mod_fetcher_winhttp"
+ },
+
+ urlseens = {
+ "mod_urlseen_memory"
+ },
+
+ dedupers = {
+ "mod_deduper_null"
+ },
+
+ processors = {
+ "mod_processor_htmllinkextract",
+ "mod_processor_robotstxt",
+ "mod_processor_sitemap"
+ },
+
+ typedetects = {
+ "mod_typedetect_libmagic"
}
+
}
-- seeds: URLS which are fed in the beginning to the URL frontier
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index 7d6622e..77b6876 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -61,6 +61,35 @@ BOOL WINAPI termHandler( DWORD ctrlType )
static int counter = 0;
+/**
+ * Resolve configured module names to the module files found on disk.
+ *
+ * Every name in 'modules' gets the platform library suffix appended
+ * (".so", or ".dll" when _WIN32 is defined) and is then compared against
+ * every path in 'entries'. Progress and misses are reported on stdout.
+ *
+ * @param modules  module names without file extension
+ * @param entries  candidate file paths to search through
+ * @return all entries whose last path component is one of the requested
+ *         module files, grouped in the order of 'modules'. A module that
+ *         occurs in several entries contributes all of them; a module that
+ *         is not found is reported and contributes nothing.
+ *
+ * NOTE(review): endswith( ) is defined elsewhere; it is presumed to return
+ * true when its first argument ends with its second - confirm.
+ */
+static vector<string> searchModuleFiles( const vector<string> &modules, const vector<string> &entries )
+{
+	vector<string> moduleFiles;
+	vector<string>::const_iterator it2, end2 = modules.end( );
+	for( it2 = modules.begin( ); it2 != end2; ++it2 ) {
+#ifndef _WIN32
+		const string module = *it2 + ".so";
+#else
+		const string module = *it2 + ".dll";
+#endif
+		cout << "Searching for module '" << module << "'" << endl;
+
+		bool found = false;
+		vector<string>::const_iterator it, end = entries.end( );
+		for( it = entries.begin( ); it != end; ++it ) {
+			if( it->size( ) < module.size( ) || !endswith( *it, module ) ) {
+				continue;
+			}
+
+			// A bare suffix test would also accept 'xmod_foo.so' when
+			// 'mod_foo.so' is requested: the match has to start at the
+			// beginning of the entry or directly after a path separator.
+			const string::size_type start = it->size( ) - module.size( );
+			if( start > 0 && (*it)[start-1] != '/' && (*it)[start-1] != '\\' ) {
+				continue;
+			}
+
+			cout << " Found in file '" << *it << "'" << endl;
+			moduleFiles.push_back( *it );
+			found = true;
+		}
+		if( !found ) {
+			cout << " Module '" << module << "' not found" << endl;
+		}
+	}
+
+	return moduleFiles;
+}
+
int main( int /* argc */, char *argv[] )
{
try {
@@ -79,6 +108,8 @@ int main( int /* argc */, char *argv[] )
std::string logLevel = luaVm.getString( "logger.level" );
Logger::instance( ).openConsoleLog( Logger::fromString( logLevel ) );
+ int stopAfterNOperations = luaVm.getInt( "crawler.stop_after_N_operations" );
+
#ifndef _WIN32
struct sigaction sa;
memset( &sa, 0, sizeof( struct sigaction ) );
@@ -91,119 +122,65 @@ int main( int /* argc */, char *argv[] )
SetConsoleCtrlHandler( termHandler, TRUE );
#endif
- // go through all type of modules and load them with the proper loader
+ // go through all type of modules and load them with the proper loader
+
string modulePath = luaVm.getString( "crawler.module_path" );
bool modulesSearchRecursive = luaVm.getBoolean( "crawler.modules_search_recursive" );
LOG( logNOTICE ) << "Loading modules from path '" << modulePath << "' "
<< ( modulesSearchRecursive ? "(recursive)" : "" );
- vector<string> entries = directory_entries( modulePath, true, modulesSearchRecursive );
-
- vector<string> modules = luaVm.getStringArray( "modules.urlnormalizers" );
- vector<string> normalizerModules;
- vector<string>::const_iterator it2, end2 = modules.end( );
- for( it2 = modules.begin( ); it2 != end2; it2++ ) {
-#ifndef _WIN32
- string module = *it2 + ".so";
-#else
- string module = *it2 + ".dll";
-#endif
- cout << "Searching for module '" << module << "'" << endl;
-
- vector<string>::const_iterator it, end = entries.end( );
- for( it = entries.begin( ); it != end; it++ ) {
- if( endswith( *it, module ) ) {
- cout << " Found in file '" << *it << "'" << endl;
- normalizerModules.push_back( *it );
- }
- }
- }
- ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules, CLOSE_DEFERRED, (void *)&luaVm );
-
- // initialize crawler function
- luaVm.executeFunction( "init" );
-
- // perform a crawl step
- luaVm.executeFunction( "crawl" );
-
- // cleaning up
- luaVm.executeFunction( "destroy" );
+ vector<string> allModuleFiles = directory_entries( modulePath, true, modulesSearchRecursive );
+ vector<string> modules;
- return 0;
+ modules = luaVm.getStringArray( "modules.urlnormalizers" );
+ vector<string> normalizerModules = searchModuleFiles( modules, allModuleFiles );
+ ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules, CLOSE_DEFERRED, (void *)&luaVm );
- vector<string> filterModules;
-#ifndef _WIN32
- filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" );
- filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" );
-#else
- filterModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" );
- filterModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" );
-#endif
+ modules = luaVm.getStringArray( "modules.urlfilters" );
+ vector<string> filterModules = searchModuleFiles( modules, allModuleFiles );
ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules );
- vector<string> filterChainModules;
-#ifndef _WIN32
- filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" );
-#else
- filterChainModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" );
-#endif
+ modules = luaVm.getStringArray( "modules.urlchainfilters" );
+ vector<string> filterChainModules = searchModuleFiles( modules, allModuleFiles );
ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules );
-
- vector<string> frontierModules;
-#ifndef _WIN32
- frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" );
-#else
- frontierModules.push_back( ".\\modules\\frontier\\memory\\mod_frontier_memory.dll" );
-#endif
+
+ modules = luaVm.getStringArray( "modules.urlfrontiers" );
+ vector<string> frontierModules = searchModuleFiles( modules, allModuleFiles );
ModuleLoader<Frontier> frontiers( frontierModules );
-
- vector<string> fetcherModules;
-#ifndef _WIN32
- fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" );
- fetcherModules.push_back( "./modules/fetcher/libcurl/mod_fetcher_libcurl.so" );
-#else
- fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" );
-#endif
+
+ modules = luaVm.getStringArray( "modules.fetchers" );
+ vector<string> fetcherModules = searchModuleFiles( modules, allModuleFiles );
ModuleLoader<Fetcher> fetchers( fetcherModules );
-
- vector<string> urlseenModules;
-#ifndef _WIN32
- urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" );
-#else
- urlseenModules.push_back( ".\\modules\\urlseen\\memory\\mod_urlseen_memory.dll" );
-#endif
+
+ modules = luaVm.getStringArray( "modules.urlseens" );
+ vector<string> urlseenModules = searchModuleFiles( modules, allModuleFiles );
ModuleLoader<URLSeen> urlSeens( urlseenModules );
-
- vector<string> deduperModules;
-#ifndef _WIN32
- deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" );
-#else
- deduperModules.push_back( ".\\modules\\deduper\\null\\mod_deduper_null.dll" );
-#endif
+
+ modules = luaVm.getStringArray( "modules.dedupers" );
+ vector<string> deduperModules = searchModuleFiles( modules, allModuleFiles );
ModuleLoader<Deduper> dedupers( deduperModules );
-
- vector<string> processorModules;
-#ifndef _WIN32
- processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
- processorModules.push_back( "./modules/processor/robotstxt/mod_processor_robotstxt.so" );
- processorModules.push_back( "./modules/processor/sitemap/mod_processor_sitemap.so" );
-#else
- processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" );
- processorModules.push_back( ".\\modules\\processor\\robotstxt\\mod_processor_robotstxt.dll" );
- processorModules.push_back( ".\\modules\\processor\\sitemap\\mod_processor_sitemap.dll" );
-#endif
+
+ modules = luaVm.getStringArray( "modules.processors" );
+ vector<string> processorModules = searchModuleFiles( modules, allModuleFiles );
ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );
-
- vector<string> typeDetectModules;
-#ifndef _WIN32
- typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic.so" );
-#endif
+
+ modules = luaVm.getStringArray( "modules.typedetects" );
+ vector<string> typeDetectModules = searchModuleFiles( modules, allModuleFiles );
ModuleLoader<TypeDetect> typeDetectors( typeDetectModules );
+ // initialize crawler function
+ luaVm.executeFunction( "init" );
+
+ // perform a crawl step
+ luaVm.executeFunction( "crawl" );
+
+ // cleaning up
+ luaVm.executeFunction( "destroy" );
+
Frontier *frontier = frontiers.create( "memory_frontier" );
#ifndef _WIN32
-// Fetcher *fetcher = fetchers.create( "libfetch_fetcher" );
- Fetcher *fetcher = fetchers.create( "libcurl_fetcher" );
+ Fetcher *fetcher = fetchers.create( "libfetch_fetcher" );
+// Fetcher *fetcher = fetchers.create( "libcurl_fetcher" );
#else
Fetcher *fetcher = fetchers.create( "winhttp_fetcher" );
#endif
@@ -219,9 +196,9 @@ int main( int /* argc */, char *argv[] )
URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );
set<string> hosts;
- hosts.insert( "andreasbaumann.dyndns.org" );
+// hosts.insert( "andreasbaumann.dyndns.org" );
// hosts.insert( "www.andreasbaumann.cc" );
-// hosts.insert( "relevancy.bger.ch" );
+ hosts.insert( "relevancy.bger.ch" );
// hosts.insert( "wolframe.net" );
URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );
@@ -246,9 +223,9 @@ int main( int /* argc */, char *argv[] )
LOG( logNOTICE ) << "Crawler started..";
// frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
-// frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
+ frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
// frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) );
- frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/test/" ) );
+// frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/test/" ) );
URL url;
while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
@@ -293,7 +270,7 @@ int main( int /* argc */, char *argv[] )
//~ sleep( 2 );
counter++;
- if( counter > 0 ) {
+ if( counter > stopAfterNOperations ) {
term = true;
}
#else