diff options
Diffstat (limited to 'src/modules/processor/robotstxt/RobotsTxtProcessor.cpp')
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.cpp | 62 |
1 files changed, 58 insertions, 4 deletions
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp index 1b7dbc8..7a91465 100644 --- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp +++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp @@ -1,8 +1,10 @@ #include "RobotsTxtProcessor.hpp" #include "Logger.hpp" +#include "StringUtils.hpp" #include <string> #include <cstring> +#include <algorithm> using namespace std; @@ -14,15 +16,67 @@ RobotsTxtProcessor::~RobotsTxtProcessor( ) { } +void RobotsTxtProcessor::skipSpaces( string::const_iterator &it, string::const_iterator end ) +{ + while( *it == ' ' && it != end ) it++; +} + +RobotsTxtProcessor::KeywordType RobotsTxtProcessor::getKeyword( string::const_iterator &it, string::const_iterator end ) +{ + string keyword; + while( *it != ':' && it != end ) { + keyword.push_back( *it ); + it++; + } + if( it == end ) return NoKeyword; + if( *it == ':' ) it++; + + if( stringicasecmp( keyword, "User-agent" ) ) return UserAgentKeyword; + if( stringicasecmp( keyword, "Disallow" ) ) return DisallowKeyword; + if( stringicasecmp( keyword, "Crawl-delay" ) ) return CrawlDelayKeyword; + if( stringicasecmp( keyword, "Sitemap" ) ) return SitemapKeyword; + return UnknownKeyword; +} + +void RobotsTxtProcessor::handleLine( const string &line ) +{ + string::const_iterator s = line.begin( ); + skipSpaces( s, line.end( ) ); + if( *s == '#' ) return; + KeywordType key = getKeyword( s, line.end( ) ); + switch( key ) { + case UserAgentKeyword: + case DisallowKeyword: + case CrawlDelayKeyword: + break; + + case SitemapKeyword: { + skipSpaces( s, line.end( ) ); + string sitemap = string( s, line.end( ) ); + LOG( logINFO ) << "Found Sitemap '" << sitemap << "'"; + } + break; + + case UnknownKeyword: + LOG( logWARNING ) << "Ignoring unknown keyword in '" << line << "'"; + break; + + case NoKeyword: + LOG( logWARNING ) << "Ingoring syntax error in '" << line << "'"; + break; + } +} + void RobotsTxtProcessor::process( RewindInputStream *s ) { - char buf[2] = {0, 0}; + string line; + URL url = s->getBaseUrl( ); while( s->good( ) && !s->eof( ) ) { - buf[0] = s->get( ); - if( buf[0] ) { - cout << buf; + getline( *s, line ); + if( s->good( ) ) { + handleLine( line ); } } } |