// Test driver for the crawler: loads the plugin modules, wires up one
// frontier / fetcher / deduper / URL-seen store / filter chain / normalizer and
// three processors, seeds the frontier with a single start URL and then
// fetches and processes URLs until the frontier is empty or termination is
// requested.
//
// NOTE(review): in the reviewed text every "<...>" span (system header names
// and template argument lists) was missing. They are reconstructed below from
// how the objects are used - confirm against the original file and against
// ModuleLoader.hpp before merging.

#include "Fetcher.hpp"
#include "Frontier.hpp"
#include "Deduper.hpp"
#include "Processor.hpp"
#include "URLSeen.hpp"
#include "URLNormalizer.hpp"
#include "URLFilter.hpp"
#include "TypeDetect.hpp"
#include "ModuleLoader.hpp"
#include "Logger.hpp"

#include <set>
#include <vector>
#include <list>
#include <string>
#include <iostream>
#include <cstring>

// sig_atomic_t is needed on every platform for the termination flag.
#include <signal.h>

#ifndef _WIN32
#include <unistd.h>
#else
// Was misspelled WIN32_MEAN_AND_LEAN, which made the define a no-op.
// NOTE(review): the macro only has an effect if it is seen before the first
// inclusion of the Windows headers - verify where those get pulled in.
#define WIN32_LEAN_AND_MEAN
#endif

using namespace std;

// Set asynchronously by the signal / console control handler and polled by
// the crawl loop. volatile sig_atomic_t is the type the language guarantees
// may be written from a signal handler; a plain bool gives no such guarantee
// and the loop's read could legally be hoisted out of the loop.
static volatile sig_atomic_t term = 0;

#ifndef _WIN32

// SIGINT handler: only requests termination, the crawl loop does the rest.
static void terminate_func( int sig )
{
	(void)sig;
	term = 1;
}

#else

// Console control handler: requests termination for Ctrl-C, Ctrl-Break,
// console close, logoff and shutdown events and reports them as handled;
// every other event is left to the next handler.
BOOL WINAPI termHandler( DWORD ctrlType )
{
	switch( ctrlType ) {
		case CTRL_C_EVENT:
		case CTRL_BREAK_EVENT:
		case CTRL_CLOSE_EVENT:
		case CTRL_LOGOFF_EVENT:
		case CTRL_SHUTDOWN_EVENT:
			term = 1;
			return TRUE;

		default:
			return FALSE;
	}
}

#endif

// Number of URLs that went through type detection (non-Windows builds only);
// used to stop the crawl after a handful of documents.
static int counter = 0;

// Entry point. Command line arguments are not evaluated.
// Returns 0 after a normal shutdown and 1 if an exception reached the top
// level (the exception is logged with level logFATAL).
int main( int /* argc */, char *argv[] )
{
	(void)argv;

	try {
//		Logger::instance( ).openConsoleLog( logINFO );
		Logger::instance( ).openConsoleLog( logDEBUG );

		// Install the termination hook before any work is started.
#ifndef _WIN32
		struct sigaction sa;
		memset( &sa, 0, sizeof( struct sigaction ) );
		sa.sa_handler = terminate_func;
		sa.sa_flags = SA_RESTART;
		if( sigaction( SIGINT, &sa, NULL ) < 0 ) {
			cerr << "Unable to install termianation signal handler" << endl;
		}
#else
		SetConsoleCtrlHandler( termHandler, TRUE );
#endif

		LOG( logNOTICE ) << "Loading modules";

		// NOTE(review): the ModuleLoader template arguments below are
		// reconstructed from the create( ) calls further down (result type and
		// extra constructor arguments); the TYPELIST_n spelling is an
		// assumption - confirm against ModuleLoader.hpp.

		vector<string> normalizerModules;
#ifndef _WIN32
		normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
		normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
#else
		normalizerModules.push_back( ".\\modules\\urlnormalizer\\simpleurl\\mod_urlnormalizer_simple.dll" );
		normalizerModules.push_back( ".\\modules\\urlnormalizer\\googleurl\\mod_urlnormalizer_googleurl.dll" );
#endif
		ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules );

		vector<string> filterModules;
#ifndef _WIN32
		filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" );
		filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" );
#else
		filterModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" );
		filterModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" );
#endif
		ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules );

		vector<string> filterChainModules;
#ifndef _WIN32
		filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" );
#else
		filterChainModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" );
#endif
		ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules );

		vector<string> frontierModules;
#ifndef _WIN32
		frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" );
#else
		frontierModules.push_back( ".\\modules\\frontier\\memory\\mod_frontier_memory.dll" );
#endif
		ModuleLoader<Frontier> frontiers( frontierModules );

		vector<string> fetcherModules;
#ifndef _WIN32
		fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" );
#else
		fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" );
#endif
		ModuleLoader<Fetcher> fetchers( fetcherModules );

		vector<string> urlseenModules;
#ifndef _WIN32
		urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" );
#else
		urlseenModules.push_back( ".\\modules\\urlseen\\memory\\mod_urlseen_memory.dll" );
#endif
		ModuleLoader<URLSeen> urlSeens( urlseenModules );

		vector<string> deduperModules;
#ifndef _WIN32
		deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" );
#else
		deduperModules.push_back( ".\\modules\\deduper\\null\\mod_deduper_null.dll" );
#endif
		ModuleLoader<Deduper> dedupers( deduperModules );

		vector<string> processorModules;
#ifndef _WIN32
		processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
		processorModules.push_back( "./modules/processor/robotstxt/mod_processor_robotstxt.so" );
		processorModules.push_back( "./modules/processor/sitemap/mod_processor_sitemap.so" );
#else
		processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" );
		processorModules.push_back( ".\\modules\\processor\\robotstxt\\mod_processor_robotstxt.dll" );
		processorModules.push_back( ".\\modules\\processor\\sitemap\\mod_processor_sitemap.dll" );
#endif
		ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );

		// No type detection module is registered on Windows; the loader is
		// still constructed (with an empty module list) on both platforms.
		vector<string> typeDetectModules;
#ifndef _WIN32
		typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic.so" );
#endif
		ModuleLoader<TypeDetect> typeDetectors( typeDetectModules );

		// Instantiate one object per role.
		Frontier *frontier = frontiers.create( "memory_frontier" );
#ifndef _WIN32
		Fetcher *fetcher = fetchers.create( "libfetch_fetcher" );
#else
		Fetcher *fetcher = fetchers.create( "winhttp_fetcher" );
#endif
		Deduper *deduper = dedupers.create( "null_deduper" );
		URLSeen *urlSeen = urlSeens.create( "memory_urlseen" );
#ifndef _WIN32
		TypeDetect *typeDetect = typeDetectors.create( "libmagic_typedetect" );
#endif

		set<string> protocols;
		protocols.insert( "http" );
		protocols.insert( "https" );
		URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );

		set<string> hosts;
		hosts.insert( "andreasbaumann.dyndns.org" );
//		hosts.insert( "www.andreasbaumann.cc" );
//		hosts.insert( "relevancy.bger.ch" );
//		hosts.insert( "wolframe.net" );
		URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );

		// The chain filter is built from the host filter followed by the
		// protocol filter.
		list<URLFilter *> filters;
		filters.push_back( hostFilter );
		filters.push_back( protocolFilter );
		URLFilter *chainFilter = urlChainFilter.create( "chain_urlfilter", filters );

		URLNormalizer *normalizer = urlNormalizers.create( "google_urlnormalizer" );
//		URLNormalizer *normalizer = urlNormalizers.create( "simple_urlnormalizer" );

		// All processors share the same normalizer, frontier, filter chain
		// and URL-seen store.
		Processor *htmlParser = processors.create( "htmllinkextract_processor",
			normalizer, frontier, chainFilter, urlSeen );
		Processor *robotsTxtParser = processors.create( "robotstxt_processor",
			normalizer, frontier, chainFilter, urlSeen );
		Processor *sitemapParser = processors.create( "sitemap_processor",
			normalizer, frontier, chainFilter, urlSeen );

		LOG( logNOTICE ) << "Crawler started..";

		// Seed URL.
//		frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
//		frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) );
//		frontier->addUrl( normalizer->parseUrl( "http://wolframe.net" ) );
		frontier->addUrl( normalizer->parseUrl( "http://andreasbaumann.dyndns.org/test/" ) );

		// Crawl loop: runs until termination is requested or the frontier
		// hands out URL::Null.
		URL url;
		while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
			LOG( logINFO ) << "Got URL " << url;

			// The stream returned by fetch( ) is released with delete on every
			// path out of this iteration.
			RewindInputStream *s = fetcher->fetch( url );
			if( !s->good( ) ) {
				LOG( logERROR ) << "Fetching URL '" << url << "' failed!";
				// Previously leaked: this path skipped the delete at the end of
				// the loop body.
				delete s;
				continue;
			}

			if( deduper->contentSeen( url, s ) ) {
				LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen";
				delete s;
				continue;
			}

#ifndef _WIN32
			// Dispatch on the detected MIME type; the stream is rewound before
			// it is handed to a processor because detection has read from it.
			MIMEType mimeType = typeDetect->detect( s );
			if( mimeType != MIMEType::Null ) {
				LOG( logDEBUG ) << "MIME type of '" << url << "' is '" << mimeType << "'";
				if( mimeType == "text/html" ) {
					s->rewind( );
					htmlParser->process( s );
				} else if( mimeType == "application/x-gzip" ) {
					s->rewind( );
					LOG( logINFO ) << "Storing archive " << url;
				} else if( mimeType == "text/plain" ) {
					// Plain text is only of interest when it is the robots.txt.
					if( url.path( ) == "/robots.txt" ) {
						LOG( logINFO ) << "Checking " << url.path( );
						s->rewind( );
						robotsTxtParser->process( s );
					}
				} else if( mimeType == "application/xml" ) {
					s->rewind( );
					sitemapParser->process( s );
				} else if( mimeType == "application/pdf" ) {
					s->rewind( );
				}
			}

			//~ sleep( 2 );

			// Stop once more than 10 documents got this far. Failed fetches
			// and duplicates leave the loop body earlier and are not counted.
			counter++;
			if( counter > 10 ) {
				term = 1;
			}
#else
			// No type detection on Windows: everything goes to the HTML parser.
			htmlParser->process( s );
#endif

			delete s;
		}

		// Tear down in reverse order of creation.
		processors.destroy( sitemapParser );
		processors.destroy( robotsTxtParser );
		processors.destroy( htmlParser );
		urlNormalizers.destroy( normalizer );
		urlChainFilter.destroy( chainFilter );
		urlFilters.destroy( protocolFilter );
		urlFilters.destroy( hostFilter );
#ifndef _WIN32
		typeDetectors.destroy( typeDetect );
#endif
		urlSeens.destroy( urlSeen );
		dedupers.destroy( deduper );
		fetchers.destroy( fetcher );
		frontiers.destroy( frontier );

		LOG( logNOTICE ) << "Crawler stopped.. normal shutdown..";

		return 0;

	} catch( const exception &e ) {
		LOG( logFATAL ) << "Crawler stopped: " << e.what( );
		return 1;

	} catch( ... ) {
		LOG( logFATAL ) << "Crawler stopped due to unknown exception!";
		return 1;
	}
}