// RobotsTxtProcessor implementation: reads a robots.txt stream line by line,
// classifies each directive keyword and queues any Sitemap URLs for crawling.
#include "RobotsTxtProcessor.hpp"
#include "Logger.hpp"
#include "StringUtils.hpp"
#include <string>
#include <cstring>
#include <algorithm>
using namespace std;
/**
 * Builds a robots.txt processor wired to its collaborating components.
 *
 * The four pointers are copied verbatim into the matching m_* members;
 * no other work is done here.
 *
 * @param normalizer stored in m_normalizer (used to parse sitemap URLs)
 * @param frontier   stored in m_frontier (receives newly found sitemap URLs)
 * @param urlFilter  stored in m_urlFilter
 * @param urlSeen    stored in m_urlSeen (consulted before queueing a URL)
 */
RobotsTxtProcessor::RobotsTxtProcessor(
	URLNormalizer *normalizer,
	Frontier *frontier,
	URLFilter *urlFilter,
	URLSeen *urlSeen )
	: m_normalizer( normalizer ),
	  m_frontier( frontier ),
	  m_urlFilter( urlFilter ),
	  m_urlSeen( urlSeen )
{
	// Nothing beyond member initialisation.
}
/**
 * Destructor. Intentionally empty: it does not delete or otherwise release
 * the collaborator pointers held in the m_* members.
 */
RobotsTxtProcessor::~RobotsTxtProcessor( )
{
	// No cleanup performed here.
}
/**
 * Advances `it` past a run of space characters (' ').
 *
 * @param it  in/out position; on return it refers to the first character
 *            that is not a space, or equals `end` if only spaces remained
 * @param end one-past-the-last position that may be examined
 */
void RobotsTxtProcessor::skipSpaces( string::const_iterator &it, string::const_iterator end )
{
	// The bounds check must come first: dereferencing `it` when it equals
	// `end` is undefined behaviour.
	while( it != end && *it == ' ' ) {
		++it;
	}
}
/**
 * Reads the directive keyword that precedes the ':' separator and
 * classifies it.
 *
 * @param it  in/out position; on success it is left just behind the ':',
 *            when no ':' exists it is left equal to `end`
 * @param end one-past-the-last position of the line
 * @return NoKeyword when the range holds no ':'; otherwise the matching
 *         keyword type, or UnknownKeyword when the text before the ':' is
 *         none of User-agent, Disallow, Crawl-delay or Sitemap
 */
RobotsTxtProcessor::KeywordType RobotsTxtProcessor::getKeyword( string::const_iterator &it, string::const_iterator end )
{
	// Locate the separator without ever dereferencing `end`.
	const string::const_iterator colon = find( it, end, ':' );

	// Whitespace between the keyword and the ':' is permitted in
	// robots.txt, so it is not part of the keyword.
	string::const_iterator keywordEnd = colon;
	while( keywordEnd != it && ( *( keywordEnd - 1 ) == ' ' || *( keywordEnd - 1 ) == '\t' ) ) {
		--keywordEnd;
	}
	const string keyword( it, keywordEnd );

	it = colon;
	if( it == end ) return NoKeyword;

	// Step over the ':' so the caller continues at the value.
	++it;

	// stringicasecmp comes from StringUtils.hpp (not visible here); it is
	// used as "true on match" - presumably a case-insensitive equality test.
	if( stringicasecmp( keyword, "User-agent" ) ) return UserAgentKeyword;
	if( stringicasecmp( keyword, "Disallow" ) ) return DisallowKeyword;
	if( stringicasecmp( keyword, "Crawl-delay" ) ) return CrawlDelayKeyword;
	if( stringicasecmp( keyword, "Sitemap" ) ) return SitemapKeyword;

	return UnknownKeyword;
}
void RobotsTxtProcessor::handleLine( const string &line )
{
string::const_iterator s = line.begin( );
skipSpaces( s, line.end( ) );
if( *s == '#' ) return;
KeywordType key = getKeyword( s, line.end( ) );
switch( key ) {
case UserAgentKeyword:
case DisallowKeyword:
case CrawlDelayKeyword:
break;
case SitemapKeyword: {
skipSpaces( s, line.end( ) );
string sitemap = string( s, line.end( ) );
LOG( logINFO ) << "Found sitemap '" << sitemap << "'";
URL sitemapLink = m_normalizer->parseUrl( sitemap );
if( !m_urlSeen->seen( sitemapLink ) ) {
m_frontier->addUrl( sitemapLink );
}
}
break;
case UnknownKeyword:
LOG( logWARNING ) << "Ignoring unknown keyword in '" << line << "'";
break;
case NoKeyword:
LOG( logWARNING ) << "Ingoring syntax error in '" << line << "'";
break;
}
}
/**
 * Feeds every line of the given stream to handleLine().
 *
 * @param s stream holding the robots.txt content; read until extraction
 *          of a line fails (end of input or stream error)
 */
void RobotsTxtProcessor::process( RewindInputStream *s )
{
	string line;

	// Loop on the result of getline itself. When the final line has no
	// trailing newline, getline extracts it and sets eofbit without failbit;
	// testing good() afterwards would wrongly drop that line.
	while( getline( *s, line ) ) {
		handleLine( line );
	}
}
// Registers RobotsTxtProcessor under the name "robotstxt_processor" as a
// Processor whose constructor takes the four listed pointer types.
// NOTE(review): REGISTER_MODULE_4 is defined elsewhere; the meaning of the
// two 0 arguments is not visible from this file - confirm against the macro.
REGISTER_MODULE_4( "robotstxt_processor", 0, 0, Processor, RobotsTxtProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
// End of RobotsTxtProcessor implementation.