summaryrefslogtreecommitdiff
path: root/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/modules/processor/robotstxt/RobotsTxtProcessor.cpp')
-rw-r--r--src/modules/processor/robotstxt/RobotsTxtProcessor.cpp62
1 files changed, 58 insertions, 4 deletions
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
index 1b7dbc8..7a91465 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
@@ -1,8 +1,10 @@
#include "RobotsTxtProcessor.hpp"
#include "Logger.hpp"
+#include "StringUtils.hpp"
#include <string>
#include <cstring>
+#include <algorithm>
using namespace std;
@@ -14,15 +16,67 @@ RobotsTxtProcessor::~RobotsTxtProcessor( )
{
}
+/// Advances `it` past a run of ' ' characters, stopping at `end`.
+///
+/// @param it   Cursor into the line being parsed; updated in place.
+/// @param end  One-past-the-last position of the line; never dereferenced.
+void RobotsTxtProcessor::skipSpaces( string::const_iterator &it, string::const_iterator end )
+{
+    // The bound is tested first: reading *it when it == end is undefined behaviour.
+    while( it != end && *it == ' ' ) ++it;
+}
+
+/// Extracts the text in front of the ':' separator and classifies it.
+///
+/// @param it   Cursor into the line. On return it points just past the ':'
+///             when one was found, and equals `end` otherwise.
+/// @param end  One-past-the-last position of the line; never dereferenced.
+/// @return NoKeyword when the range contains no ':', UnknownKeyword when the
+///         text before the ':' matches none of the keywords tested below,
+///         otherwise the KeywordType of the matching keyword.
+RobotsTxtProcessor::KeywordType RobotsTxtProcessor::getKeyword( string::const_iterator &it, string::const_iterator end )
+{
+    // std::find never reads past `end`, unlike a loop that tests *it first.
+    const string::const_iterator colon = std::find( it, end, ':' );
+    if( colon == end ) {
+        it = end;
+        return NoKeyword;
+    }
+
+    const string keyword( it, colon );
+    it = colon + 1;
+
+    // NOTE(review): stringicasecmp comes from StringUtils.hpp; it is used here
+    // as if it returns true on a case-insensitive match - confirm.
+    if( stringicasecmp( keyword, "User-agent" ) ) return UserAgentKeyword;
+    if( stringicasecmp( keyword, "Disallow" ) ) return DisallowKeyword;
+    if( stringicasecmp( keyword, "Crawl-delay" ) ) return CrawlDelayKeyword;
+    if( stringicasecmp( keyword, "Sitemap" ) ) return SitemapKeyword;
+    return UnknownKeyword;
+}
+
+/// Parses a single line of a robots.txt file and acts on its keyword.
+///
+/// Blank lines and lines whose first non-space character is '#' are skipped
+/// silently. Sitemap lines are logged; User-agent, Disallow and Crawl-delay
+/// are recognised but not acted on yet. Anything else is logged as a warning.
+///
+/// @param line  One line of input, without its line terminator.
+void RobotsTxtProcessor::handleLine( const string &line )
+{
+    string::const_iterator s = line.begin( );
+    skipSpaces( s, line.end( ) );
+
+    // Test for the end before looking at *s: on an empty or all-space line
+    // `s` equals line.end( ) and must not be dereferenced. Such lines hold no
+    // directive, so they are not reported as syntax errors either.
+    if( s == line.end( ) || *s == '#' ) return;
+
+    KeywordType key = getKeyword( s, line.end( ) );
+    switch( key ) {
+        case UserAgentKeyword:
+        case DisallowKeyword:
+        case CrawlDelayKeyword:
+            // Recognised, but nothing is done with these yet.
+            break;
+
+        case SitemapKeyword: {
+            // The value is everything after the ':' and any spaces following it.
+            skipSpaces( s, line.end( ) );
+            string sitemap = string( s, line.end( ) );
+            LOG( logINFO ) << "Found Sitemap '" << sitemap << "'";
+            }
+            break;
+
+        case UnknownKeyword:
+            LOG( logWARNING ) << "Ignoring unknown keyword in '" << line << "'";
+            break;
+
+        case NoKeyword:
+            // No ':' separator was found on the line.
+            LOG( logWARNING ) << "Ignoring syntax error in '" << line << "'";
+            break;
+    }
+}
+
void RobotsTxtProcessor::process( RewindInputStream *s )
{
- char buf[2] = {0, 0};
+ string line;
+
URL url = s->getBaseUrl( );
while( s->good( ) && !s->eof( ) ) {
- buf[0] = s->get( );
- if( buf[0] ) {
- cout << buf;
+ getline( *s, line );
+ if( s->good( ) ) {
+ handleLine( line );
}
}
}