summary | refs | log | tree | commit | diff
path: root/src/crawl/crawl.cpp
diff options
context:
space:
mode:
author: Andreas Baumann <abaumann@yahoo.com> 2012-09-06 22:18:23 +0200
committer: Andreas Baumann <abaumann@yahoo.com> 2012-09-06 22:18:23 +0200
commit: 13fc9a7da5111f4ddba942d3c6b6b8654ce395d6 (patch)
tree: e86210e3d939911e35f930a6dc73c3ebb591243b /src/crawl/crawl.cpp
parent: f5c586f7231f7e033c5528bcefea357e4e64441c (diff)
download: crawler-13fc9a7da5111f4ddba942d3c6b6b8654ce395d6.tar.gz
crawler-13fc9a7da5111f4ddba942d3c6b6b8654ce395d6.tar.bz2
more splitting into libcrawl, crawl binary
moved more public headers to 'include'; changed approach for dynamic linking on Windows
Diffstat (limited to 'src/crawl/crawl.cpp')
-rwxr-xr-x src/crawl/crawl.cpp 238
1 files changed, 238 insertions, 0 deletions
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
new file mode 100755
index 0000000..823ed02
--- /dev/null
+++ b/src/crawl/crawl.cpp
@@ -0,0 +1,238 @@
+#include "Fetcher.hpp"
+#include "Frontier.hpp"
+#include "Deduper.hpp"
+#include "Processor.hpp"
+#include "URLSeen.hpp"
+#include "URLNormalizer.hpp"
+#include "URLFilter.hpp"
+#include "TypeDetect.hpp"
+
+#include "ModuleLoader.hpp"
+
+#include "Logger.hpp"
+
+#include <csignal>
+#include <list>
+#include <set>
+#include <vector>
+
+#ifndef _WIN32
+#include <signal.h>
+#else
+// WIN32_LEAN_AND_MEAN (the original spelled it WIN32_MEAN_AND_LEAN, which
+// is not a macro <windows.h> looks at) only has an effect when it is defined
+// before <windows.h> is included. <windows.h> also declares BOOL, DWORD,
+// the CTRL_*_EVENT constants and SetConsoleCtrlHandler used further down.
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+using namespace std;
+
+// Termination request flag. It is set asynchronously by terminate_func( )
+// (POSIX signal handler) or termHandler( ) (Windows console control handler)
+// and polled by the crawl loop in main( ).
+// 'volatile sig_atomic_t' is the type the language guarantees can be written
+// from a signal handler; 'volatile' also forces the loop to re-read the flag
+// on every iteration instead of caching it.
+static volatile sig_atomic_t term = 0;
+
+#ifndef _WIN32
+
+/**
+ * Signal handler installed for SIGINT by main( ).
+ *
+ * Only raises the global 'term' flag; the crawl loop in main( ) checks the
+ * flag before fetching the next URL and leaves the loop when it is set.
+ *
+ * @param sig number of the delivered signal (not used)
+ */
+static void terminate_func( int sig )
+{
+	term = true;
+	static_cast<void>( sig );
+}
+
+#else
+
+/**
+ * Console control handler, registered with SetConsoleCtrlHandler in main( ).
+ *
+ * For Ctrl-C, Ctrl-Break, console close, logoff and shutdown events the
+ * global 'term' flag is raised, so that the crawl loop in main( ) stops
+ * before fetching the next URL.
+ *
+ * @param ctrlType type of the control event received
+ * @return TRUE if the event was one of the events listed above, FALSE for
+ *         any other event type
+ */
+BOOL WINAPI termHandler( DWORD ctrlType )
+{
+	const bool isTerminationEvent =
+		ctrlType == CTRL_C_EVENT ||
+		ctrlType == CTRL_BREAK_EVENT ||
+		ctrlType == CTRL_CLOSE_EVENT ||
+		ctrlType == CTRL_LOGOFF_EVENT ||
+		ctrlType == CTRL_SHUTDOWN_EVENT;
+
+	// everything else is reported as not handled
+	if( !isTerminationEvent ) {
+		return FALSE;
+	}
+
+	term = true;
+	return TRUE;
+}
+
+#endif
+
+/**
+ * Entry point of the crawl binary.
+ *
+ * Steps:
+ *  1. open the console log and install the termination handler
+ *     (SIGINT via sigaction, or a console control handler on Windows),
+ *  2. load the crawler modules (shared objects or DLLs) from hard-coded
+ *     paths below ./modules,
+ *  3. create one instance of each component and wire them together,
+ *  4. seed the frontier with the start URL and fetch/process URLs until
+ *     the frontier returns URL::Null or the 'term' flag gets set,
+ *  5. destroy all created components again.
+ *
+ * @return 0 on normal shutdown, 1 if a std::exception reached the top level
+ */
+int main( void )
+{
+	try {
+		Logger::instance( ).openConsoleLog( logINFO );
+
+		// install the handler which sets 'term'
+#ifndef _WIN32
+		struct sigaction sa;
+		memset( &sa, 0, sizeof( struct sigaction ) );
+		sa.sa_handler = terminate_func;
+		sa.sa_flags = SA_RESTART;
+		if( sigaction( SIGINT, &sa, NULL ) < 0 ) {
+			// not fatal: the crawler runs, it just cannot be interrupted cleanly
+			cerr << "Unable to install termianation signal handler" << endl;
+		}
+#else
+		SetConsoleCtrlHandler( termHandler, TRUE );
+#endif
+
+		LOG( logNOTICE ) << "Loading modules";
+
+		vector<string> normalizerModules;
+#ifndef _WIN32
+		normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
+		normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
+#else
+		normalizerModules.push_back( ".\\modules\\urlnormalizer\\simpleurl\\mod_urlnormalizer_simple.dll" );
+		normalizerModules.push_back( ".\\modules\\urlnormalizer\\googleurl\\mod_urlnormalizer_googleurl.dll" );
+#endif
+		ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules );
+
+		vector<string> filterModules;
+#ifndef _WIN32
+		filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" );
+		filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" );
+#else
+		filterModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" );
+		filterModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" );
+#endif
+		ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules );
+
+		vector<string> filterChainModules;
+#ifndef _WIN32
+		filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" );
+#else
+		filterChainModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" );
+#endif
+		ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules );
+
+		vector<string> frontierModules;
+#ifndef _WIN32
+		frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" );
+#else
+		frontierModules.push_back( ".\\modules\\frontier\\memory\\mod_frontier_memory.dll" );
+#endif
+		ModuleLoader<Frontier> frontiers( frontierModules );
+
+		// the fetcher implementation differs per platform
+		vector<string> fetcherModules;
+#ifndef _WIN32
+		fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" );
+#else
+		fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" );
+#endif
+		ModuleLoader<Fetcher> fetchers( fetcherModules );
+
+		vector<string> urlseenModules;
+#ifndef _WIN32
+		urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" );
+#else
+		urlseenModules.push_back( ".\\modules\\urlseen\\memory\\mod_urlseen_memory.dll" );
+#endif
+		ModuleLoader<URLSeen> urlSeens( urlseenModules );
+
+		vector<string> deduperModules;
+#ifndef _WIN32
+		deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" );
+#else
+		deduperModules.push_back( ".\\modules\\deduper\\null\\mod_deduper_null.dll" );
+#endif
+		ModuleLoader<Deduper> dedupers( deduperModules );
+
+		vector<string> processorModules;
+#ifndef _WIN32
+		processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
+#else
+		processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" );
+#endif
+		ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );
+
+		// no type detection module is loaded on Windows, the list stays empty there
+		vector<string> typeDetectModules;
+#ifndef _WIN32
+		typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic.so" );
+#endif
+		ModuleLoader<TypeDetect> typeDetectors( typeDetectModules );
+
+		// instantiate the components by their registered names
+		Frontier *frontier = frontiers.create( "memory_frontier" );
+#ifndef _WIN32
+		Fetcher *fetcher = fetchers.create( "libfetch_fetcher" );
+#else
+		Fetcher *fetcher = fetchers.create( "winhttp_fetcher" );
+#endif
+		Deduper *deduper = dedupers.create( "null_deduper" );
+		URLSeen *urlSeen = urlSeens.create( "memory_urlseen" );
+#ifndef _WIN32
+		TypeDetect *typeDetect = typeDetectors.create( "libmagic_typedetect" );
+#endif
+
+		set<string> protocols;
+		protocols.insert( "http" );
+		protocols.insert( "https" );
+		URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );
+
+		set<string> hosts;
+		hosts.insert( "www.andreasbaumann.cc" );
+		URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );
+
+		// combine host and protocol filter into one chain filter
+		list<URLFilter *> filters;
+		filters.push_back( hostFilter );
+		filters.push_back( protocolFilter );
+		URLFilter *chainFilter = urlChainFilter.create( "chain_urlfilter", filters );
+
+		URLNormalizer *normalizer = urlNormalizers.create( "google_urlnormalizer" );
+//		URLNormalizer *normalizer = urlNormalizers.create( "simple_urlnormalizer" );
+
+		Processor *htmlParser = processors.create( "htmllinkextract_processor",
+			normalizer, frontier, chainFilter, urlSeen );
+
+		LOG( logNOTICE ) << "Crawler started..";
+
+		// seed URL
+		frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
+
+		// crawl until the frontier is exhausted or termination was requested
+		URL url;
+		while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
+			LOG( logINFO ) << "Got URL " << url;
+			RewindInputStream *s = fetcher->fetch( url );
+			if( !s->good( ) ) {
+				LOG( logERROR ) << "Fetching URL '" << url << "' failed!";
+				// the stream is deleted by this loop on every other path,
+				// so it has to be released here as well (it used to leak)
+				delete s;
+				continue;
+			}
+
+			if( deduper->contentSeen( url, s ) ) {
+				LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen";
+				delete s;
+				continue;
+			}
+
+#ifndef _WIN32
+			MIMEType mimeType = typeDetect->detect( s );
+
+			if( mimeType != MIMEType::Null ) {
+				if( mimeType == "text/html" ) {
+					// rewind after type detection before handing the stream on
+					s->rewind( );
+					htmlParser->process( s );
+				} else if( mimeType == "application/x-gzip" ) {
+					// archives are only logged for now
+					s->rewind( );
+					LOG( logINFO ) << "Storing archive " << url;
+				}
+			}
+#else
+			// no type detection on Windows: every fetched stream goes to the HTML processor
+			htmlParser->process( s );
+#endif
+
+			delete s;
+		}
+
+		// NOTE(review): this cleanup is skipped when an exception is thrown
+		// above; the created components are then never passed to destroy( ).
+		processors.destroy( htmlParser );
+		urlNormalizers.destroy( normalizer );
+		urlChainFilter.destroy( chainFilter );
+		urlFilters.destroy( protocolFilter );
+		urlFilters.destroy( hostFilter );
+#ifndef _WIN32
+		typeDetectors.destroy( typeDetect );
+#endif
+		urlSeens.destroy( urlSeen );
+		dedupers.destroy( deduper );
+		fetchers.destroy( fetcher );
+		frontiers.destroy( frontier );
+
+		LOG( logNOTICE ) << "Crawler stopped.. normal shutdown..";
+
+		return 0;
+	} catch( const exception &e ) {
+		LOG( logFATAL ) << "Crawler stopped: " << e.what( );
+		return 1;
+	}
+}