diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2014-04-30 16:46:00 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2014-04-30 16:46:00 +0200 |
commit | 12c50867c04b2c2a11f5026466bbea02d5406b70 (patch) | |
tree | 4008a8d5e3660d823197f97b3c0b244fa37d3ea1 /src | |
parent | eb3771cafb98451116a4f0ec0e7a371800770de1 (diff) | |
download | crawler-12c50867c04b2c2a11f5026466bbea02d5406b70.tar.gz crawler-12c50867c04b2c2a11f5026466bbea02d5406b70.tar.bz2 |
started a robots.txt parser
Diffstat (limited to 'src')
-rwxr-xr-x | src/crawl/crawl.cpp | 18 | ||||
-rw-r--r-- | src/modules/processor/GNUmakefile | 2 | ||||
-rwxr-xr-x | src/modules/processor/Makefile.W32 | 2 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/GNUmakefile | 47 | ||||
-rwxr-xr-x | src/modules/processor/robotstxt/Makefile.W32 | 54 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.cpp | 30 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.hpp | 16 |
7 files changed, 165 insertions, 4 deletions
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp index 823ed02..ecc8f16 100755 --- a/src/crawl/crawl.cpp +++ b/src/crawl/crawl.cpp @@ -134,8 +134,10 @@ int main( void ) vector<string> processorModules; #ifndef _WIN32 processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" ); + processorModules.push_back( "./modules/processor/robotstxt/mod_processor_robotstxt.so" ); #else processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" ); + processorModules.push_back( ".\\modules\\processor\\robotstxt\\mod_processor_robotstxt.dll" ); #endif ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules ); @@ -163,7 +165,8 @@ int main( void ) URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols ); set<string> hosts; - hosts.insert( "www.andreasbaumann.cc" ); +// hosts.insert( "www.andreasbaumann.cc" ); + hosts.insert( "relevancy.bger.ch" ); URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts ); list<URLFilter *> filters; @@ -177,9 +180,13 @@ int main( void ) Processor *htmlParser = processors.create( "htmllinkextract_processor", normalizer, frontier, chainFilter, urlSeen ); + Processor *robotsTxtParser = processors.create( "robotstxt_processor", + normalizer, frontier, chainFilter, urlSeen ); + LOG( logNOTICE ) << "Crawler started.."; - frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) ); +// frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) ); + frontier->addUrl( normalizer->parseUrl( "http://relevancy.bger.ch/robots.txt" ) ); URL url; while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) { @@ -206,6 +213,12 @@ int main( void ) } else if( mimeType == "application/x-gzip" ) { s->rewind( ); LOG( logINFO ) << "Storing archive " << url; + } else if( mimeType == "text/plain" ) { + if( url.path( ) == "/robots.txt" ) { + LOG( logINFO ) << "Checking " << url.path( ); + s->rewind( ); + robotsTxtParser->process( s ); + } } } #else @@ -215,6 +228,7 @@ int main( void ) delete s; } + processors.destroy( robotsTxtParser ); processors.destroy( htmlParser ); urlNormalizers.destroy( normalizer ); urlChainFilter.destroy( chainFilter ); diff --git a/src/modules/processor/GNUmakefile b/src/modules/processor/GNUmakefile index 8bfd814..8b91967 100644 --- a/src/modules/processor/GNUmakefile +++ b/src/modules/processor/GNUmakefile @@ -1,6 +1,6 @@ TOPDIR = ../../.. -SUBDIRS = htmllinkextract +SUBDIRS = htmllinkextract robotstxt -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/src/modules/processor/Makefile.W32 b/src/modules/processor/Makefile.W32 index f98b918..530fd98 100755 --- a/src/modules/processor/Makefile.W32 +++ b/src/modules/processor/Makefile.W32 @@ -1,6 +1,6 @@ TOPDIR = ..\..\.. -SUBDIRS = htmllinkextract +SUBDIRS = htmllinkextract robotstxt !INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk diff --git a/src/modules/processor/robotstxt/GNUmakefile b/src/modules/processor/robotstxt/GNUmakefile new file mode 100644 index 0000000..d52c92e --- /dev/null +++ b/src/modules/processor/robotstxt/GNUmakefile @@ -0,0 +1,47 @@ +TOPDIR = ../../../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_DIRS = \ + -I. -I$(TOPDIR)/src \ + -I$(TOPDIR)/include/logger \ + -I$(TOPDIR)/include/util \ + -I$(TOPDIR)/include/module \ + -I$(TOPDIR)/include/crawler \ + -I$(TOPDIR)/streamhtmlparser + +INCLUDE_CXXFLAGS = \ + +INCLUDE_LDFLAGS = \ + -L$(TOPDIR)/src/libcrawler \ + -L$(TOPDIR)/streamhtmlparser + +INCLUDE_LIBS = \ + -lcrawler \ + -lstreamhtmlparser + +DYNAMIC_MODULE = \ + mod_processor_robotstxt.so + +STATIC_LIB = \ + librobotstxtprocessor.a + +CPP_OBJS = \ + RobotsTxtProcessor.o + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: + diff --git a/src/modules/processor/robotstxt/Makefile.W32 b/src/modules/processor/robotstxt/Makefile.W32 new file mode 100755 index 0000000..b67513a --- /dev/null +++ b/src/modules/processor/robotstxt/Makefile.W32 @@ -0,0 +1,54 @@ +TOPDIR = ..\..\..\.. + +SUBDIRS = + +!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk + +INCLUDE_CXXFLAGS = \ + /D_WIN32_WINNT=0x504 + +INCLUDE_DIRS = \ + /I. \ + /I$(TOPDIR)\src \ + /I$(TOPDIR)\include\module \ + /I$(TOPDIR)\include\util \ + /I$(TOPDIR)\include\logger \ + /I$(TOPDIR)\include\crawler \ + /I$(TOPDIR)\streamhtmlparser + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)\streamhtmlparser\streamhtmlparser.lib \ + $(TOPDIR)\src\libcrawler\crawler.lib \ + $(TOPDIR)\src\liblogger\logger.lib + +DYNAMIC_MODULE = \ + mod_processor_robotstxt.dll + +STATIC_LIB = \ + robotstxtprocessor.lib + +CPP_OBJS = \ + RobotsTxtProcessor.obj + +SHARED_CPP_OBJS = \ + RobotsTxtProcessor.dllobj + +!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk + +$(STATIC_LIB): $(CPP_OBJS) + $(LINK) /lib /nologo /out:$@ $(STATIC_LDFLAGS) $? + +$(DYNAMIC_MODULE): $(SHARED_CPP_OBJS) + $(LINK) /dll /nologo /out:$@ $(LDFLAGS) $(LIBS) $? + +local_all: $(STATIC_LIB) $(DYNAMIC_MODULE) + +local_clean: + @-erase $(LOCAL_STATIC_LIB) 2>NUL + @-erase $(CPP_OBJS) 2>NUL + +local_distclean: + +local_test: diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp new file mode 100644 index 0000000..1b7dbc8 --- /dev/null +++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp @@ -0,0 +1,30 @@ +#include "RobotsTxtProcessor.hpp" +#include "Logger.hpp" + +#include <string> +#include <cstring> + +using namespace std; + +RobotsTxtProcessor::RobotsTxtProcessor( ) +{ +} + +RobotsTxtProcessor::~RobotsTxtProcessor( ) +{ +} + +void RobotsTxtProcessor::process( RewindInputStream *s ) +{ + char buf[2] = {0, 0}; + URL url = s->getBaseUrl( ); + + while( s->good( ) && !s->eof( ) ) { + buf[0] = s->get( ); + if( buf[0] ) { + cout << buf; + } + } +} + +REGISTER_MODULE( "robotstxt_processor", Processor, RobotsTxtProcessor ) diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp new file mode 100644 index 0000000..a274f2b --- /dev/null +++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp @@ -0,0 +1,16 @@ +#ifndef __ROBOTS_TXT_PROCESSOR_H +#define __ROBOTS_TXT_PROCESSOR_H + +#include "Processor.hpp" +#include "ModuleRegistry.hpp" + +class RobotsTxtProcessor : public Processor { + public: + RobotsTxtProcessor( ); + virtual ~RobotsTxtProcessor( ); + virtual void process( RewindInputStream *s ); +}; + +DECLARE_MODULE( Processor ) + +#endif |