author		Andreas Baumann <abaumann@yahoo.com>	2012-09-06 22:18:23 +0200
committer	Andreas Baumann <abaumann@yahoo.com>	2012-09-06 22:18:23 +0200
commit		13fc9a7da5111f4ddba942d3c6b6b8654ce395d6 (patch)
tree		e86210e3d939911e35f930a6dc73c3ebb591243b /src/crawl
parent		f5c586f7231f7e033c5528bcefea357e4e64441c (diff)
download	crawler-13fc9a7da5111f4ddba942d3c6b6b8654ce395d6.tar.gz
		crawler-13fc9a7da5111f4ddba942d3c6b6b8654ce395d6.tar.bz2
more splitting into libcrawler and the crawl binary
moved more public headers to 'include'
changed the approach to dynamic linking on Windows
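The diff below only shows the consumer side of the Windows linking change (the /DSHARED define and the crawler.lib import library in Makefile.W32). A common way to implement such a switch is an export/import macro in a shared header; the sketch below is an assumption about how that could look, not code from this commit, and the names DLL_PUBLIC and BUILDING_CRAWLER_DLL are made up for illustration:

// Hypothetical linkage macro, keyed off the /DSHARED flag from Makefile.W32;
// DLL_PUBLIC and BUILDING_CRAWLER_DLL are illustrative names, not from this commit.
#if defined( _WIN32 ) && defined( SHARED )
#if defined( BUILDING_CRAWLER_DLL )
#define DLL_PUBLIC __declspec( dllexport )	// building crawler.dll itself
#else
#define DLL_PUBLIC __declspec( dllimport )	// linking against crawler.lib
#endif
#else
#define DLL_PUBLIC	// static or non-Windows build: no decoration needed
#endif

// usage on a class that libcrawler would expose, e.g.:
// class DLL_PUBLIC Frontier { ... };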
Diffstat (limited to 'src/crawl')
-rwxr-xr-x	src/crawl/GNUmakefile	 55
-rwxr-xr-x	src/crawl/Makefile.W32	 39
-rwxr-xr-x	src/crawl/crawl.cpp	242
3 files changed, 336 insertions, 0 deletions
diff --git a/src/crawl/GNUmakefile b/src/crawl/GNUmakefile
new file mode 100755
index 0000000..6899fde
--- /dev/null
+++ b/src/crawl/GNUmakefile
@@ -0,0 +1,55 @@
+TOPDIR = ../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_CPPFLAGS = \
+
+INCLUDE_DIRS = \
+	-I. \
+	-I$(TOPDIR)/include/logger \
+	-I$(TOPDIR)/include/util \
+	-I$(TOPDIR)/include/module
+
+INCLUDE_LDFLAGS = \
+	-L$(TOPDIR)/src/logger
+
+INCLUDE_LIBS = \
+	-llogger
+
+# openssl
+ifeq ($(WITH_SSL),1)
+
+INCLUDE_CFLAGS += \
+	-DWITH_SSL
+
+INCLUDE_LIBS += \
+	$(OPENSSL_LIBS)
+endif
+
+CPP_OBJS = \
+
+CPP_BINS = \
+	crawl$(EXE)
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+	$(INSTALL) -d -m 0755 $(DESTDIR)$(bindir)
+	$(INSTALL) -m 0775 crawl$(EXE) $(DESTDIR)$(bindir)
+
+local_uninstall:
+	@-rm -f $(DESTDIR)$(bindir)/crawl
+	@-rmdir $(DESTDIR)$(bindir)
+
+local_test:
+
+run:
+	@LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/logger:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser ./crawl
diff --git a/src/crawl/Makefile.W32 b/src/crawl/Makefile.W32
new file mode 100755
index 0000000..74442dc
--- /dev/null
+++ b/src/crawl/Makefile.W32
@@ -0,0 +1,39 @@
+TOPDIR = ..\..
+
+SUBDIRS =
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
+
+INCLUDE_CXXFLAGS = \
+	/D_WIN32_WINNT=0x504 \
+	/DSHARED
+
+INCLUDE_DIRS = \
+	/I. \
+	/I$(TOPDIR)\include\logger \
+	/I$(TOPDIR)\include\module \
+	/I$(TOPDIR)\include\util \
+	/I$(TOPDIR)\include\crawler
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+	$(TOPDIR)\src\logger\logger.lib \
+	$(TOPDIR)\src\libcrawler\crawler.lib
+
+CPP_OBJS = \
+
+CPP_BINS = \
+	crawl.exe
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+crawl.exe: crawl.obj
+
+local_all: $(CPP_BINS)
+
+local_clean:
+
+local_distclean:
+
+local_test:
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
new file mode 100755
index 0000000..823ed02
--- /dev/null
+++ b/src/crawl/crawl.cpp
@@ -0,0 +1,242 @@
+#include "Fetcher.hpp"
+#include "Frontier.hpp"
+#include "Deduper.hpp"
+#include "Processor.hpp"
+#include "URLSeen.hpp"
+#include "URLNormalizer.hpp"
+#include "URLFilter.hpp"
+#include "TypeDetect.hpp"
+
+#include "ModuleLoader.hpp"
+
+#include "Logger.hpp"
+
+#include <set>
+#include <vector>
+#include <list>
+#include <iostream>	// cerr, endl
+#include <cstring>	// memset
+
+#ifndef _WIN32
+#include <signal.h>
+#else
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+using namespace std;
+
+static bool term = false;
+
+#ifndef _WIN32
+
+static void terminate_func( int sig )
+{
+	(void)sig;
+	term = true;
+}
+
+#else
+
+BOOL WINAPI termHandler( DWORD ctrlType )
+{
+	switch( ctrlType ) {
+		case CTRL_C_EVENT:
+		case CTRL_BREAK_EVENT:
+		case CTRL_CLOSE_EVENT:
+		case CTRL_LOGOFF_EVENT:
+		case CTRL_SHUTDOWN_EVENT:
+			term = true;
+			return TRUE;
+		default:
+			return FALSE;
+	}
+}
+
+#endif
+
+int main( void )
+{
+	try {
+		Logger::instance( ).openConsoleLog( logINFO );
+
+#ifndef _WIN32
+		struct sigaction sa;
+		memset( &sa, 0, sizeof( struct sigaction ) );
+		sa.sa_handler = terminate_func;
+		sa.sa_flags = SA_RESTART;
+		if( sigaction( SIGINT, &sa, NULL ) < 0 ) {
+			cerr << "Unable to install termination signal handler" << endl;
+		}
+#else
+		SetConsoleCtrlHandler( termHandler, TRUE );
+#endif
+
+		LOG( logNOTICE ) << "Loading modules";
+
+		vector<string> normalizerModules;
+#ifndef _WIN32
+		normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
+		normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
+#else
+		normalizerModules.push_back( ".\\modules\\urlnormalizer\\simpleurl\\mod_urlnormalizer_simple.dll" );
+		normalizerModules.push_back( ".\\modules\\urlnormalizer\\googleurl\\mod_urlnormalizer_googleurl.dll" );
+#endif
+		ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules );
+
+		vector<string> filterModules;
+#ifndef _WIN32
+		filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" );
+		filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" );
+#else
+		filterModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" );
+		filterModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" );
+#endif
+		ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules );
+
+		vector<string> filterChainModules;
+#ifndef _WIN32
+		filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" );
+#else
+		filterChainModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" );
+#endif
+		ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules );
+
+		vector<string> frontierModules;
+#ifndef _WIN32
+		frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" );
+#else
+		frontierModules.push_back( ".\\modules\\frontier\\memory\\mod_frontier_memory.dll" );
+#endif
+		ModuleLoader<Frontier> frontiers( frontierModules );
+
+		vector<string> fetcherModules;
+#ifndef _WIN32
+		fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" );
+#else
+		fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" );
+#endif
+		ModuleLoader<Fetcher> fetchers( fetcherModules );
+
+		vector<string> urlseenModules;
+#ifndef _WIN32
+		urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" );
+#else
+		urlseenModules.push_back( ".\\modules\\urlseen\\memory\\mod_urlseen_memory.dll" );
+#endif
+		ModuleLoader<URLSeen> urlSeens( urlseenModules );
+
+		vector<string> deduperModules;
+#ifndef _WIN32
+		deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" );
+#else
+		deduperModules.push_back( ".\\modules\\deduper\\null\\mod_deduper_null.dll" );
+#endif
+		ModuleLoader<Deduper> dedupers( deduperModules );
+
+		vector<string> processorModules;
+#ifndef _WIN32
+		processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
+#else
+		processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" );
+#endif
+		ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );
+
+		vector<string> typeDetectModules;
+#ifndef _WIN32
+		typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic.so" );
+#endif
+		ModuleLoader<TypeDetect> typeDetectors( typeDetectModules );
+
+		Frontier *frontier = frontiers.create( "memory_frontier" );
+#ifndef _WIN32
+		Fetcher *fetcher = fetchers.create( "libfetch_fetcher" );
+#else
+		Fetcher *fetcher = fetchers.create( "winhttp_fetcher" );
+#endif
+		Deduper *deduper = dedupers.create( "null_deduper" );
+		URLSeen *urlSeen = urlSeens.create( "memory_urlseen" );
+#ifndef _WIN32
+		TypeDetect *typeDetect = typeDetectors.create( "libmagic_typedetect" );
+#endif
+
+		set<string> protocols;
+		protocols.insert( "http" );
+		protocols.insert( "https" );
+		URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols );
+
+		set<string> hosts;
+		hosts.insert( "www.andreasbaumann.cc" );
+		URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts );
+
+		list<URLFilter *> filters;
+		filters.push_back( hostFilter );
+		filters.push_back( protocolFilter );
+		URLFilter *chainFilter = urlChainFilter.create( "chain_urlfilter", filters );
+
+		URLNormalizer *normalizer = urlNormalizers.create( "google_urlnormalizer" );
+//		URLNormalizer *normalizer = urlNormalizers.create( "simple_urlnormalizer" );
+
+		Processor *htmlParser = processors.create( "htmllinkextract_processor",
+			normalizer, frontier, chainFilter, urlSeen );
+
+		LOG( logNOTICE ) << "Crawler started..";
+
+		frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
+
+		URL url;
+		while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) {
+			LOG( logINFO ) << "Got URL " << url;
+			RewindInputStream *s = fetcher->fetch( url );
+			if( !s->good( ) ) {
+				LOG( logERROR ) << "Fetching URL '" << url << "' failed!";
+				delete s;	// release the failed stream before trying the next URL
+				continue;
+			}
+
+			if( deduper->contentSeen( url, s ) ) {
+				LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen";
+				delete s;
+				continue;
+			}
+
+#ifndef _WIN32
+			MIMEType mimeType = typeDetect->detect( s );
+
+			if( mimeType != MIMEType::Null ) {
+				if( mimeType == "text/html" ) {
+					s->rewind( );
+					htmlParser->process( s );
+				} else if( mimeType == "application/x-gzip" ) {
+					s->rewind( );
+					LOG( logINFO ) << "Storing archive " << url;
+				}
+			}
+#else
+			htmlParser->process( s );
+#endif
+
+			delete s;
+		}
+
+		processors.destroy( htmlParser );
+		urlNormalizers.destroy( normalizer );
+		urlChainFilter.destroy( chainFilter );
+		urlFilters.destroy( protocolFilter );
+		urlFilters.destroy( hostFilter );
+#ifndef _WIN32
+		typeDetectors.destroy( typeDetect );
+#endif
+		urlSeens.destroy( urlSeen );
+		dedupers.destroy( deduper );
+		fetchers.destroy( fetcher );
+		frontiers.destroy( frontier );
+
+		LOG( logNOTICE ) << "Crawler stopped.. normal shutdown..";
+
+		return 0;
+	} catch( exception &e ) {
+		LOG( logFATAL ) << "Crawler stopped: " << e.what( );
+		return 1;
+	}
+}
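crawl.cpp drives every component through ModuleLoader instances that load one shared object per module and construct instances by name. ModuleLoader.hpp itself is not part of this diff; as a rough sketch of the POSIX mechanism it presumably wraps (the factory symbol name "create" below is an assumption, not taken from the repository):

// Minimal sketch of dlopen()-based plugin loading as ModuleLoader presumably
// wraps it (POSIX only; the exported factory symbol name is an assumption).
#include <dlfcn.h>
#include <stdexcept>
#include <string>

template<typename Interface>
Interface *loadPlugin( const std::string &file, const std::string &symbol )
{
	// load the module and resolve all of its symbols immediately
	void *handle = dlopen( file.c_str( ), RTLD_NOW );
	if( !handle )
		throw std::runtime_error( dlerror( ) );

	// the module exports a C factory returning a heap-allocated implementation
	typedef Interface *(*factory_t)( );
	factory_t factory = reinterpret_cast<factory_t>( dlsym( handle, symbol.c_str( ) ) );
	if( !factory )
		throw std::runtime_error( dlerror( ) );

	return factory( );
}

The matching create/destroy pairs in crawl.cpp (e.g. frontiers.create( "memory_frontier" ) and frontiers.destroy( frontier )) fit this model: destruction is routed back through the module that allocated the object, which matters on Windows, where each DLL can have its own heap.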