summaryrefslogtreecommitdiff
path: root/src/modules
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2014-07-23 16:44:07 +0200
committerAndreas Baumann <abaumann@yahoo.com>2014-07-23 16:44:07 +0200
commita57788acee59705418b96525410b84fbee2f405a (patch)
tree660a828a29f0f769638ce18c1b45c62bd602e012 /src/modules
parentce77513807f47bc7af59c8320932d3348aeb99ea (diff)
downloadcrawler-a57788acee59705418b96525410b84fbee2f405a.tar.gz
crawler-a57788acee59705418b96525410b84fbee2f405a.tar.bz2
added parsing of Sitemap in robots.txt
Diffstat (limited to 'src/modules')
-rw-r--r--src/modules/processor/robotstxt/GNUmakefile7
-rwxr-xr-xsrc/modules/processor/robotstxt/Makefile.W322
-rw-r--r--src/modules/processor/robotstxt/RobotsTxtProcessor.cpp62
-rw-r--r--src/modules/processor/robotstxt/RobotsTxtProcessor.hpp14
4 files changed, 76 insertions, 9 deletions
diff --git a/src/modules/processor/robotstxt/GNUmakefile b/src/modules/processor/robotstxt/GNUmakefile
index d52c92e..cbfe420 100644
--- a/src/modules/processor/robotstxt/GNUmakefile
+++ b/src/modules/processor/robotstxt/GNUmakefile
@@ -9,18 +9,17 @@ INCLUDE_DIRS = \
-I$(TOPDIR)/include/logger \
-I$(TOPDIR)/include/util \
-I$(TOPDIR)/include/module \
- -I$(TOPDIR)/include/crawler \
- -I$(TOPDIR)/streamhtmlparser
+ -I$(TOPDIR)/include/crawler
INCLUDE_CXXFLAGS = \
INCLUDE_LDFLAGS = \
-L$(TOPDIR)/src/libcrawler \
- -L$(TOPDIR)/streamhtmlparser
+ -L$(TOPDIR)/src/libutil
INCLUDE_LIBS = \
-lcrawler \
- -lstreamhtmlparser
+ -lutil
DYNAMIC_MODULE = \
mod_processor_robotstxt.so
diff --git a/src/modules/processor/robotstxt/Makefile.W32 b/src/modules/processor/robotstxt/Makefile.W32
index b67513a..ebf1e22 100755
--- a/src/modules/processor/robotstxt/Makefile.W32
+++ b/src/modules/processor/robotstxt/Makefile.W32
@@ -19,7 +19,7 @@ INCLUDE_DIRS = \
INCLUDE_LDFLAGS = \
INCLUDE_LIBS = \
- $(TOPDIR)\streamhtmlparser\streamhtmlparser.lib \
+ $(TOPDIR)\src\libutil\util.lib \
$(TOPDIR)\src\libcrawler\crawler.lib \
$(TOPDIR)\src\liblogger\logger.lib
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
index 1b7dbc8..7a91465 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
@@ -1,8 +1,10 @@
#include "RobotsTxtProcessor.hpp"
#include "Logger.hpp"
+#include "StringUtils.hpp"
#include <string>
#include <cstring>
+#include <algorithm>
using namespace std;
@@ -14,15 +16,67 @@ RobotsTxtProcessor::~RobotsTxtProcessor( )
{
}
+void RobotsTxtProcessor::skipSpaces( string::const_iterator &it, string::const_iterator end )
+{
+ while( *it == ' ' && it != end ) it++;
+}
+
+RobotsTxtProcessor::KeywordType RobotsTxtProcessor::getKeyword( string::const_iterator &it, string::const_iterator end )
+{
+ string keyword;
+ while( *it != ':' && it != end ) {
+ keyword.push_back( *it );
+ it++;
+ }
+ if( it == end ) return NoKeyword;
+ if( *it == ':' ) it++;
+
+ if( stringicasecmp( keyword, "User-agent" ) ) return UserAgentKeyword;
+ if( stringicasecmp( keyword, "Disallow" ) ) return DisallowKeyword;
+ if( stringicasecmp( keyword, "Crawl-delay" ) ) return CrawlDelayKeyword;
+ if( stringicasecmp( keyword, "Sitemap" ) ) return SitemapKeyword;
+ return UnknownKeyword;
+}
+
+void RobotsTxtProcessor::handleLine( const string &line )
+{
+ string::const_iterator s = line.begin( );
+ skipSpaces( s, line.end( ) );
+ if( *s == '#' ) return;
+ KeywordType key = getKeyword( s, line.end( ) );
+ switch( key ) {
+ case UserAgentKeyword:
+ case DisallowKeyword:
+ case CrawlDelayKeyword:
+ break;
+
+ case SitemapKeyword: {
+ skipSpaces( s, line.end( ) );
+ string sitemap = string( s, line.end( ) );
+ LOG( logINFO ) << "Found Sitemap '" << sitemap << "'";
+ }
+ break;
+
+ case UnknownKeyword:
+ LOG( logWARNING ) << "Ignoring unknown keyword in '" << line << "'";
+ break;
+
+ case NoKeyword:
+		LOG( logWARNING ) << "Ignoring syntax error in '" << line << "'";
+ break;
+ }
+}
+
void RobotsTxtProcessor::process( RewindInputStream *s )
{
- char buf[2] = {0, 0};
+ string line;
+
URL url = s->getBaseUrl( );
while( s->good( ) && !s->eof( ) ) {
- buf[0] = s->get( );
- if( buf[0] ) {
- cout << buf;
+ getline( *s, line );
+ if( s->good( ) ) {
+ handleLine( line );
}
}
}
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
index a274f2b..532c741 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
@@ -9,6 +9,20 @@ class RobotsTxtProcessor : public Processor {
RobotsTxtProcessor( );
virtual ~RobotsTxtProcessor( );
virtual void process( RewindInputStream *s );
+
+ typedef enum {
+ UserAgentKeyword,
+ DisallowKeyword,
+ CrawlDelayKeyword,
+ SitemapKeyword,
+ UnknownKeyword,
+ NoKeyword
+ } KeywordType;
+
+ private:
+ KeywordType getKeyword( string::const_iterator &it, string::const_iterator end );
+ void skipSpaces( string::const_iterator &it, string::const_iterator end );
+ void handleLine( const std::string &s );
};
DECLARE_MODULE( Processor )