summaryrefslogtreecommitdiff
path: root/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
blob: 6cc8a7e6d1f7c2406a13f83fc85b4d39b1d5dcae (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#include "RobotsTxtProcessor.hpp"
#include "Logger.hpp"
#include "StringUtils.hpp"

#include <string>
#include <cstring>
#include <algorithm> 

using namespace std;

// Builds a robots.txt processor around the four collaborators handed in.
// Each pointer is copied unchanged into the matching m_* member; nothing
// else happens during construction.
RobotsTxtProcessor::RobotsTxtProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *urlFilter, URLSeen *urlSeen )
	: m_normalizer( normalizer ),
	  m_frontier( frontier ),
	  m_urlFilter( urlFilter ),
	  m_urlSeen( urlSeen )
{
	// all work is done in the member initialiser list
}

// Destructor. The body is intentionally empty: the pointers received in
// the constructor are not deleted here.
RobotsTxtProcessor::~RobotsTxtProcessor( )
{
	// nothing to release
}

// Advances 'it' past any run of blank (' ') characters.
//
// it  - in/out: position to start from; on return it refers to the first
//       character that is not a blank, or equals 'end' if only blanks
//       remained.
// end - one-past-the-last position of the range being scanned.
//
// Only the space character is skipped; tabs and other whitespace are left
// in place.
void RobotsTxtProcessor::skipSpaces( string::const_iterator &it, string::const_iterator end )
{
	// The bound must be tested BEFORE the dereference: reading through
	// 'end' is undefined behaviour.
	while( it != end && *it == ' ' ) ++it;
}

// Reads the directive name at the start of a robots.txt line and
// classifies it.
//
// it  - in/out: position of the first character of the keyword. When a
//       ':' is found, 'it' is left just behind it (i.e. at the start of
//       the value); otherwise 'it' equals 'end' on return.
// end - one-past-the-last position of the line.
//
// Returns NoKeyword when the line contains no ':' at all, the matching
// *Keyword enumerator for the four known directives, and UnknownKeyword
// for anything else in front of the ':'.
RobotsTxtProcessor::KeywordType RobotsTxtProcessor::getKeyword( string::const_iterator &it, string::const_iterator end )
{
	string keyword;

	// Collect everything up to the separator. The bound is checked before
	// the dereference so that a line without ':' never reads through 'end'.
	while( it != end && *it != ':' ) {
		keyword.push_back( *it );
		++it;
	}

	// Ran off the line without seeing a ':' -> not a "key: value" line.
	if( it == end ) return NoKeyword;

	// The loop can only have stopped on the ':' here; step over it.
	++it;

	// NOTE(review): stringicasecmp is used as a predicate that is true on
	// a match, presumably ignoring case - confirm in StringUtils.hpp.
	if( stringicasecmp( keyword, "User-agent" ) ) return UserAgentKeyword;
	if( stringicasecmp( keyword, "Disallow" ) ) return DisallowKeyword;
	if( stringicasecmp( keyword, "Crawl-delay" ) ) return CrawlDelayKeyword;
	if( stringicasecmp( keyword, "Sitemap" ) ) return SitemapKeyword;
	return UnknownKeyword;
}

void RobotsTxtProcessor::handleLine( const string &line )
{
	string::const_iterator s = line.begin( );
	skipSpaces( s, line.end( ) );
	if( *s == '#' ) return;
	KeywordType key = getKeyword( s, line.end( ) );
	switch( key ) {
		case UserAgentKeyword:
		case DisallowKeyword:
		case CrawlDelayKeyword:
			break;
			
		case SitemapKeyword: {
			skipSpaces( s, line.end( ) );
			string sitemap = string( s, line.end( ) );
			LOG( logINFO ) << "Found sitemap '" << sitemap << "'";
			URL sitemapLink = m_normalizer->parseUrl( sitemap );
			if( !m_urlSeen->seen( sitemapLink ) ) {
				m_frontier->addUrl( sitemapLink );
			}
			}
			break;
			
		case UnknownKeyword:
			LOG( logWARNING ) << "Ignoring unknown keyword in '" << line << "'";
			break;
			
		case NoKeyword:
			LOG( logWARNING ) << "Ingoring syntax error in '" << line << "'";
			break;
	}
}

// Reads the robots.txt content from 's' line by line and hands every line
// to handleLine( ).
//
// s - stream positioned at the start of the robots.txt data; read until
//     end of input or until a read error occurs.
void RobotsTxtProcessor::process( RewindInputStream *s )
{
	string line;

	// Test the result of getline itself rather than good( ) afterwards:
	// when the last line has no trailing newline, getline delivers the
	// data AND sets eofbit, so a good( ) check would silently drop that
	// final line. The stream only converts to false once nothing could be
	// extracted at all.
	while( getline( *s, line ) ) {
		handleLine( line );
	}
}

// Module registration for RobotsTxtProcessor as a Processor under the
// name "robotstxt_processor"; the four trailing types mirror the
// constructor's parameter list.
// NOTE(review): the meaning of the two 0 arguments is defined by the
// REGISTER_MODULE_4 macro, which is not visible here - confirm there.
REGISTER_MODULE_4( "robotstxt_processor", 0, 0, Processor, RobotsTxtProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )