diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2014-07-23 16:44:07 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2014-07-23 16:44:07 +0200 |
commit | a57788acee59705418b96525410b84fbee2f405a (patch) | |
tree | 660a828a29f0f769638ce18c1b45c62bd602e012 /src/modules | |
parent | ce77513807f47bc7af59c8320932d3348aeb99ea (diff) | |
download | crawler-a57788acee59705418b96525410b84fbee2f405a.tar.gz crawler-a57788acee59705418b96525410b84fbee2f405a.tar.bz2 |
added parsing of Sitemap in robots.txt
Diffstat (limited to 'src/modules')
-rw-r--r-- | src/modules/processor/robotstxt/GNUmakefile | 7 | ||||
-rwxr-xr-x | src/modules/processor/robotstxt/Makefile.W32 | 2 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.cpp | 62 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.hpp | 14 |
4 files changed, 76 insertions, 9 deletions
diff --git a/src/modules/processor/robotstxt/GNUmakefile b/src/modules/processor/robotstxt/GNUmakefile index d52c92e..cbfe420 100644 --- a/src/modules/processor/robotstxt/GNUmakefile +++ b/src/modules/processor/robotstxt/GNUmakefile @@ -9,18 +9,17 @@ INCLUDE_DIRS = \ -I$(TOPDIR)/include/logger \ -I$(TOPDIR)/include/util \ -I$(TOPDIR)/include/module \ - -I$(TOPDIR)/include/crawler \ - -I$(TOPDIR)/streamhtmlparser + -I$(TOPDIR)/include/crawler INCLUDE_CXXFLAGS = \ INCLUDE_LDFLAGS = \ -L$(TOPDIR)/src/libcrawler \ - -L$(TOPDIR)/streamhtmlparser + -L$(TOPDIR)/src/libutil INCLUDE_LIBS = \ -lcrawler \ - -lstreamhtmlparser + -lutil DYNAMIC_MODULE = \ mod_processor_robotstxt.so diff --git a/src/modules/processor/robotstxt/Makefile.W32 b/src/modules/processor/robotstxt/Makefile.W32 index b67513a..ebf1e22 100755 --- a/src/modules/processor/robotstxt/Makefile.W32 +++ b/src/modules/processor/robotstxt/Makefile.W32 @@ -19,7 +19,7 @@ INCLUDE_DIRS = \ INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\streamhtmlparser\streamhtmlparser.lib \ + $(TOPDIR)\src\libutil\util.lib \ $(TOPDIR)\src\libcrawler\crawler.lib \ $(TOPDIR)\src\liblogger\logger.lib diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp index 1b7dbc8..7a91465 100644 --- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp +++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp @@ -1,8 +1,10 @@ #include "RobotsTxtProcessor.hpp" #include "Logger.hpp" +#include "StringUtils.hpp" #include <string> #include <cstring> +#include <algorithm> using namespace std; @@ -14,15 +16,67 @@ RobotsTxtProcessor::~RobotsTxtProcessor( ) { } +void RobotsTxtProcessor::skipSpaces( string::const_iterator &it, string::const_iterator end ) +{ + while( *it == ' ' && it != end ) it++; +} + +RobotsTxtProcessor::KeywordType RobotsTxtProcessor::getKeyword( string::const_iterator &it, string::const_iterator end ) +{ + string keyword; + while( *it != ':' && it != end ) { + keyword.push_back( *it ); + it++; + } + if( it == end ) return NoKeyword; + if( *it == ':' ) it++; + + if( stringicasecmp( keyword, "User-agent" ) ) return UserAgentKeyword; + if( stringicasecmp( keyword, "Disallow" ) ) return DisallowKeyword; + if( stringicasecmp( keyword, "Crawl-delay" ) ) return CrawlDelayKeyword; + if( stringicasecmp( keyword, "Sitemap" ) ) return SitemapKeyword; + return UnknownKeyword; +} + +void RobotsTxtProcessor::handleLine( const string &line ) +{ + string::const_iterator s = line.begin( ); + skipSpaces( s, line.end( ) ); + if( *s == '#' ) return; + KeywordType key = getKeyword( s, line.end( ) ); + switch( key ) { + case UserAgentKeyword: + case DisallowKeyword: + case CrawlDelayKeyword: + break; + + case SitemapKeyword: { + skipSpaces( s, line.end( ) ); + string sitemap = string( s, line.end( ) ); + LOG( logINFO ) << "Found Sitemap '" << sitemap << "'"; + } + break; + + case UnknownKeyword: + LOG( logWARNING ) << "Ignoring unknown keyword in '" << line << "'"; + break; + + case NoKeyword: + LOG( logWARNING ) << "Ingoring syntax error in '" << line << "'"; + break; + } +} + void RobotsTxtProcessor::process( RewindInputStream *s ) { - char buf[2] = {0, 0}; + string line; + URL url = s->getBaseUrl( ); while( s->good( ) && !s->eof( ) ) { - buf[0] = s->get( ); - if( buf[0] ) { - cout << buf; + getline( *s, line ); + if( s->good( ) ) { + handleLine( line ); } } } diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp index a274f2b..532c741 100644 --- a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp +++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp @@ -9,6 +9,20 @@ class RobotsTxtProcessor : public Processor { RobotsTxtProcessor( ); virtual ~RobotsTxtProcessor( ); virtual void process( RewindInputStream *s ); + + typedef enum { + UserAgentKeyword, + DisallowKeyword, + CrawlDelayKeyword, + SitemapKeyword, + UnknownKeyword, + NoKeyword + } KeywordType; + + private: + KeywordType getKeyword( string::const_iterator &it, string::const_iterator end ); + void skipSpaces( string::const_iterator &it, string::const_iterator end ); + void handleLine( const std::string &s ); }; DECLARE_MODULE( Processor ) |