/*
 * Crawler driver program.
 *
 * Loads a Lua configuration script (path taken from argv[1]), exposes a
 * "log" table of logging functions to it, loads the crawler plug-in modules
 * found below "crawler.module_path" and then runs a simple fetch/process
 * loop until the frontier returns URL::Null or termination is requested.
 *
 * NOTE(review): the text this file was recovered from had everything between
 * '<' and '>' stripped (system include names, template arguments). Those
 * parts were reconstructed from how the objects are used below; confirm the
 * ModuleLoader template arguments against ModuleLoader.hpp.
 */

#include "Fetcher.hpp"
#include "Frontier.hpp"
#include "Deduper.hpp"
#include "Processor.hpp"
#include "URLSeen.hpp"
#include "URLNormalizer.hpp"
#include "URLFilter.hpp"
#include "TypeDetect.hpp"

#include "libcrawler.hpp"

#include "ModuleLoader.hpp"

#include "Logger.hpp"

#include "LuaVM.hpp"

#include "StringUtils.hpp"
#include "FileUtils.hpp"

#include <csignal>
#include <cstring>
#include <exception>
#include <iostream>
#include <list>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#ifndef _WIN32
#include <signal.h>
#include <unistd.h>
#else
#define WIN32_LEAN_AND_MEAN
#endif

using namespace std;

// Termination request flag. It is written from the POSIX signal handler
// and from the Windows console control handler and polled by the crawl loop
// in main(), hence 'volatile sig_atomic_t' instead of a plain bool.
static volatile sig_atomic_t term = 0;

#ifndef _WIN32

// Signal handler installed for SIGINT: only records the request.
static void terminate_func( int sig )
{
	(void)sig;
	term = 1;
}

#else

// Console control handler: records a termination request for the listed
// control events and reports them as handled; everything else is left to
// the next handler.
BOOL WINAPI termHandler( DWORD ctrlType )
{
	switch( ctrlType ){
		case CTRL_C_EVENT:
		case CTRL_BREAK_EVENT:
		case CTRL_CLOSE_EVENT:
		case CTRL_LOGOFF_EVENT:
		case CTRL_SHUTDOWN_EVENT:
			term = 1;
			return TRUE;
		default:
			return FALSE;
	}
}

#endif

// Number of completed loop iterations (only counted in the non-Windows
// branch of the crawl loop).
static int counter = 0;

/*
 * Maps module base names to module files.
 *
 * For every name in 'modules' the platform suffix (".so", or ".dll" on
 * Windows) is appended and every entry of 'entries' ending in that file name
 * is collected. Progress is printed to stdout. A module matching several
 * entries contributes all of them; a module matching none is reported and
 * skipped.
 *
 * Returns the matching entries in the order they were found.
 */
static vector<string> searchModuleFiles( const vector<string> &modules, const vector<string> &entries )
{
	vector<string> moduleFiles;

	for( vector<string>::const_iterator mod = modules.begin( ); mod != modules.end( ); ++mod ) {
#ifndef _WIN32
		const string module = *mod + ".so";
#else
		const string module = *mod + ".dll";
#endif
		cout << "Searching for module '" << module << "'" << endl;

		bool found = false;
		for( vector<string>::const_iterator entry = entries.begin( ); entry != entries.end( ); ++entry ) {
			if( endswith( *entry, module ) ) {
				cout << "  Found in file '" << *entry << "'" << endl;
				moduleFiles.push_back( *entry );
				found = true;
			}
		}

		if( !found ) {
			cout << "  Module '" << module << "' not found" << endl;
		}
	}

	return moduleFiles;
}

/*
 * Common implementation of the Lua logging functions.
 *
 * Joins all arguments on the Lua stack with single spaces (strings and
 * numbers are printed, every other type contributes an empty string), writes
 * the result to the logger with 'logLevel' and pops the arguments.
 * Returns 0, i.e. no values are returned to Lua.
 */
static int lua_log_level( const LogLevel logLevel, lua_State *l )
{
	size_t nofParams = lua_gettop( l );
	if( nofParams == 0 ) return 0;

	ostringstream ss;

	for( size_t i = 1; i <= nofParams; i++ ) {
		int type = lua_type( l, i );

		switch( type ) {
			case LUA_TNIL:
				// NOTE(review): empty literal; possibly a placeholder
				// such as "<nil>" was lost - confirm
				ss << "";
				break;

			case LUA_TSTRING:
				ss << lua_tostring( l, i );
				break;

			case LUA_TNUMBER:
				ss << lua_tonumber( l, i );
				break;

			default:
				// NOTE(review): empty literal, see LUA_TNIL above
				ss << "";
				break;
		}

		if( i != nofParams ) {
			ss << " ";
		}
	}

	LOG( logLevel ) << ss.str( );

	lua_pop( l, (int)nofParams );

	return 0;
}

// Generates one Lua C function 'lua_log_<LEVEL>' per log level, each
// forwarding to lua_log_level with the matching 'log<LEVEL>' constant.
#define lua_log( level ) \
static int lua_log_ ## level( lua_State *l ) \
{ \
	return lua_log_level( log ## level, l ); \
}

lua_log( FATAL )
lua_log( CRITICAL )
lua_log( ERROR )
lua_log( WARNING )
lua_log( NOTICE )
lua_log( INFO )
lua_log( DEBUG )
lua_log( DEBUG1 )
lua_log( DEBUG2 )
lua_log( DEBUG3 )
lua_log( DEBUG4 )

/*
 * Program entry point.
 *
 * argv[1] must name the Lua configuration script. Returns 0 on normal
 * shutdown and 1 if the script argument is missing or an exception escaped.
 */
int main( int argc, char *argv[] )
{
	// the configuration script is mandatory; without this check argv[1]
	// would be read although it is a null pointer
	if( argc < 2 ) {
		cerr << "usage: " << ( argc > 0 ? argv[0] : "<program>" )
			<< " <lua configuration script>" << endl;
		return 1;
	}

	try {
		LuaVM luaVm;

		initialize_libcrawler( (void *)&luaVm );

		//Logger::instance( ).openConsoleLog( logDEBUG );

		// load configuration (Lua)
		luaVm.loadSource( argv[1] );

		// register logging function
		luaL_Reg reg[12] = {
			{ "fatal", lua_log_FATAL },
			{ "critical", lua_log_CRITICAL },
			{ "error", lua_log_ERROR },
			{ "warning", lua_log_WARNING },
			{ "notice", lua_log_NOTICE },
			{ "info", lua_log_INFO },
			{ "debug", lua_log_DEBUG },
			{ "debug1", lua_log_DEBUG1 },
			{ "debug2", lua_log_DEBUG2 },
			{ "debug3", lua_log_DEBUG3 },
			{ "debug4", lua_log_DEBUG4 },
			{ NULL, NULL }
		};
		lua_newtable( luaVm.handle( ) );
		luaL_setfuncs( luaVm.handle( ), reg, 0 );
		lua_setglobal( luaVm.handle( ), "log" );

		// execute main (to get basic configuration in form
		// of global variables)
		luaVm.executeMain( );

		std::string logLevel = luaVm.getString( "logger.level" );
		Logger::instance( ).openConsoleLog( Logger::fromString( logLevel ) );

		int stopAfterNOperations = luaVm.getInt( "crawler.stop_after_N_operations" );

#ifndef _WIN32
		struct sigaction sa;
		memset( &sa, 0, sizeof( struct sigaction ) );
		sa.sa_handler = terminate_func;
		sa.sa_flags = SA_RESTART;
		if( sigaction( SIGINT, &sa, NULL ) < 0 ) {
			cerr << "Unable to install termianation signal handler" << endl;
		}
#else
		SetConsoleCtrlHandler( termHandler, TRUE );
#endif

		// go through all type of modules and load them with the proper loader
		string modulePath = luaVm.getString( "crawler.module_path" );
		bool modulesSearchRecursive = luaVm.getBoolean( "crawler.modules_search_recursive" );

		LOG( logNOTICE ) << "Loading modules from path '" << modulePath << "' "
			<< ( modulesSearchRecursive ? "(recursive)" : "" );

		vector<string> allModuleFiles = directory_entries( modulePath, true, modulesSearchRecursive );

		vector<string> modules;

		// NOTE(review): all ModuleLoader template arguments below were
		// reconstructed from the create( ) calls further down - confirm
		modules = luaVm.getStringArray( "modules.urlnormalizers" );
		vector<string> normalizerModules = searchModuleFiles( modules, allModuleFiles );
		ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules, CLOSE_DEFERRED, (void *)&luaVm );

		// export 'urlnormalizers.create' to Lua
		lua_newtable( luaVm.handle( ) );
		lua_pushstring( luaVm.handle( ), "create" );
		lua_pushlightuserdata( luaVm.handle( ), &urlNormalizers );
#ifdef WITH_LUA
		lua_pushcclosure( luaVm.handle( ), (lua_CFunction)urlNormalizers.luaCreateFunc, 1 );
#endif
		lua_settable( luaVm.handle( ), -3 );
		lua_setglobal( luaVm.handle( ), "urlnormalizers" );

		modules = luaVm.getStringArray( "modules.urlfilters" );
		vector<string> filterModules = searchModuleFiles( modules, allModuleFiles );
		ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules, CLOSE_DEFERRED, (void *)&luaVm );

		modules = luaVm.getStringArray( "modules.urlchainfilters" );
		vector<string> filterChainModules = searchModuleFiles( modules, allModuleFiles );
		ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules, CLOSE_DEFERRED, (void *)&luaVm );

		modules = luaVm.getStringArray( "modules.urlfrontiers" );
		vector<string> frontierModules = searchModuleFiles( modules, allModuleFiles );
		ModuleLoader<Frontier> frontiers( frontierModules, CLOSE_DEFERRED, (void *)&luaVm );

		modules = luaVm.getStringArray( "modules.fetchers" );
		vector<string> fetcherModules = searchModuleFiles( modules, allModuleFiles );
		ModuleLoader<Fetcher> fetchers( fetcherModules, CLOSE_DEFERRED, (void *)&luaVm );

		modules = luaVm.getStringArray( "modules.urlseens" );
		vector<string> urlseenModules = searchModuleFiles( modules, allModuleFiles );
		ModuleLoader<URLSeen> urlSeens( urlseenModules, CLOSE_DEFERRED, (void *)&luaVm );

		modules = luaVm.getStringArray( "modules.dedupers" );
		vector<string> deduperModules = searchModuleFiles( modules, allModuleFiles );
		ModuleLoader<Deduper> dedupers( deduperModules, CLOSE_DEFERRED, (void *)&luaVm );

		modules = luaVm.getStringArray( "modules.processors" );
		vector<string> processorModules = searchModuleFiles( modules, allModuleFiles );
		ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules, CLOSE_DEFERRED, (void *)&luaVm );

		modules = luaVm.getStringArray( "modules.typedetects" );
		vector<string> typeDetectModules = searchModuleFiles( modules, allModuleFiles );
		ModuleLoader<TypeDetect> typeDetectors( typeDetectModules, CLOSE_DEFERRED, (void *)&luaVm );

		// initialize crawler function
		luaVm.executeFunction( "init" );

		// perform a crawl step
		luaVm.executeFunction( "crawl" );

		// cleaning up
		luaVm.executeFunction( "destroy" );

		Frontier *frontier = frontiers.create( "memory_frontier" );
#ifndef _WIN32
		Fetcher *fetcher = fetchers.create( "libfetch_fetcher" );
//		Fetcher *fetcher = fetchers.create( "libcurl_fetcher" );
#else
		Fetcher *fetcher = fetchers.create( "winhttp_fetcher" );
#endif
		Deduper *deduper = dedupers.create( "null_deduper" );
		URLSeen *urlSeen = urlSeens.create( "memory_urlseen" );
#ifndef _WIN32
		TypeDetect *typeDetect = typeDetectors.create( "libmagic_typedetect" );
#endif

		set<string> protocols;
		protocols.insert( "http" );
		protocols.insert( "https" );
		URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );

		set<string> hosts;
//		hosts.insert( "andreasbaumann.dyndns.org" );
//		hosts.insert( "www.andreasbaumann.cc" );
		hosts.insert( "relevancy.bger.ch" );
//		hosts.insert( "wolframe.net" );
		URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );

		list<URLFilter *> filters;
		filters.push_back( hostFilter );
		filters.push_back( protocolFilter );
		URLFilter *chainFilter = urlChainFilter.create( "chain_urlfilter", filters );

		URLNormalizer *normalizer = urlNormalizers.create( "google_urlnormalizer" );
//		URLNormalizer *normalizer = urlNormalizers.create( "simple_urlnormalizer" );

		Processor *htmlParser = processors.create( "htmllinkextract_processor",
			normalizer, frontier, chainFilter, urlSeen );

		Processor *robotsTxtParser = processors.create( "robotstxt_processor",
			normalizer, frontier, chainFilter, urlSeen );

		Processor *sitemapParser = processors.create( "sitemap_processor",
			normalizer, frontier, chainFilter, urlSeen );

		LOG( logNOTICE ) << "Crawler started..";

//		frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
		frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
//		frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) );
//		frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/test/" ) );

		URL url;
		while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
			LOG( logINFO ) << "Got URL " << url;
			RewindInputStream *s = fetcher->fetch( url );
			if( !s->good( ) ) {
				LOG( logERROR ) << "Fetching URL '" << url << "' failed!";
				// the stream is released on every other path out of
				// this iteration; release it here too
				delete s;
				continue;
			}

			if( deduper->contentSeen( url, s ) ) {
				LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen";
				delete s;
				continue;
			}

#ifndef _WIN32
			// dispatch on the detected MIME type; the stream is rewound
			// before it is handed to a processor
			MIMEType mimeType = typeDetect->detect( s );

			if( mimeType != MIMEType::Null ) {
				LOG( logDEBUG ) << "MIME type of '" << url << "' is '" << mimeType << "'";
				if( mimeType == "text/html" ) {
					s->rewind( );
					htmlParser->process( s );
				} else if( mimeType == "application/x-gzip" ) {
					s->rewind( );
					LOG( logINFO ) << "Storing archive " << url;
				} else if( mimeType == "text/plain" ) {
					if( url.path( ) == "/robots.txt" ) {
						LOG( logINFO ) << "Checking " << url.path( );
						s->rewind( );
						robotsTxtParser->process( s );
					}
				} else if( mimeType == "application/xml" ) {
					s->rewind( );
					sitemapParser->process( s );
				} else if( mimeType == "application/pdf" ) {
					s->rewind( );
				}
			}

			sleep( 1 );
			counter++;
			// stop once more than the configured number of operations
			// have been performed
			if( counter > stopAfterNOperations ) {
				term = 1;
			}
#else
			htmlParser->process( s );
#endif

			delete s;
		}

		// hand the objects back to the loaders that created them
		processors.destroy( sitemapParser );
		processors.destroy( robotsTxtParser );
		processors.destroy( htmlParser );
		urlNormalizers.destroy( normalizer );
		urlChainFilter.destroy( chainFilter );
		urlFilters.destroy( protocolFilter );
		urlFilters.destroy( hostFilter );
#ifndef _WIN32
		typeDetectors.destroy( typeDetect );
#endif
		urlSeens.destroy( urlSeen );
		dedupers.destroy( deduper );
		fetchers.destroy( fetcher );
		frontiers.destroy( frontier );

		LOG( logNOTICE ) << "Crawler stopped.. normal shutdown..";

	} catch( const exception &e ) {
		LOG( logFATAL ) << "Crawler stopped: " << e.what( );
		return 1;
	} catch( ... ) {
		LOG( logFATAL ) << "Crawler stopped due to unknown exception!";
		return 1;
	}

	return 0;
}