From a57788acee59705418b96525410b84fbee2f405a Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Wed, 23 Jul 2014 16:44:07 +0200 Subject: added parsing of Sitemap in robots.txt --- include/util/StringUtils.hpp | 8 +++ src/GNUmakefile | 2 +- src/crawl/GNUmakefile | 5 +- src/libutil/GNUmakefile | 3 +- src/libutil/Makefile.W32 | 2 + src/libutil/StringUtils.cpp | 23 ++++++++ src/modules/processor/robotstxt/GNUmakefile | 7 ++- src/modules/processor/robotstxt/Makefile.W32 | 2 +- .../processor/robotstxt/RobotsTxtProcessor.cpp | 62 ++++++++++++++++++++-- .../processor/robotstxt/RobotsTxtProcessor.hpp | 14 +++++ 10 files changed, 115 insertions(+), 13 deletions(-) create mode 100644 include/util/StringUtils.hpp create mode 100644 src/libutil/StringUtils.cpp diff --git a/include/util/StringUtils.hpp b/include/util/StringUtils.hpp new file mode 100644 index 0000000..c6b9f87 --- /dev/null +++ b/include/util/StringUtils.hpp @@ -0,0 +1,8 @@ +#ifndef __UTIL_STRING_UTILS_H +#define __UTIL_STRING_UTILS_H + +#include <string> + +bool stringicasecmp( const std::string &s1, const std::string &s2 ); + +#endif diff --git a/src/GNUmakefile b/src/GNUmakefile index db6ace8..b339ffc 100755 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -17,4 +17,4 @@ local_uninstall: local_test: run: - @LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser $(TOPDIR)/src/crawl/crawl + @LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/libutil:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser $(TOPDIR)/src/crawl/crawl diff --git a/src/crawl/GNUmakefile b/src/crawl/GNUmakefile index 5c61625..cda7513 100755 --- a/src/crawl/GNUmakefile +++ b/src/crawl/GNUmakefile @@ -15,10 +15,11 @@ INCLUDE_DIRS = \ INCLUDE_LDFLAGS = \ -L$(TOPDIR)/src/liblogger \ - -L$(TOPDIR)/src/libcrawler + -L$(TOPDIR)/src/libcrawler \ + -L$(TOPDIR)/src/libutil INCLUDE_LIBS = \ - -llogger -lcrawler +
-llogger -lcrawler -lutil # openssl ifeq ($(WITH_SSL),1) diff --git a/src/libutil/GNUmakefile b/src/libutil/GNUmakefile index 9bb47f5..87cf711 100755 --- a/src/libutil/GNUmakefile +++ b/src/libutil/GNUmakefile @@ -9,7 +9,7 @@ INCLUDE_CPPFLAGS = \ INCLUDE_LDFLAGS = \ INCLUDE_DIRS = \ - -I$(TOPDIR)/include/util + -I$(TOPDIR)/include INCLUDE_LIBS = \ @@ -21,6 +21,7 @@ DYNAMIC_LIB_MINOR = 0 DYNAMIC_LIB_PATCH = 0 CPP_OBJS = \ + StringUtils.o -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/src/libutil/Makefile.W32 b/src/libutil/Makefile.W32 index 2a5630f..c4c8f83 100755 --- a/src/libutil/Makefile.W32 +++ b/src/libutil/Makefile.W32 @@ -17,10 +17,12 @@ INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ CPP_OBJS = \ + StringUtils.obj \ win32\errormsg.obj \ win32\stringutils.obj DYNAMIC_CPP_OBJS = \ + StringUtils.dllobj \ win32\errormsg.dllobj \ win32\stringutils.dllobj diff --git a/src/libutil/StringUtils.cpp b/src/libutil/StringUtils.cpp new file mode 100644 index 0000000..61769c9 --- /dev/null +++ b/src/libutil/StringUtils.cpp @@ -0,0 +1,23 @@ +#include "util/StringUtils.hpp" + +#include <string> +#include <cctype> + +using namespace std; + +bool stringicasecmp( const string &s1, const string &s2 ) +{ + string::const_iterator i1 = s1.begin( ), e1 = s1.end( ), + i2 = s2.begin( ), e2 = s2.end( ); + + while( i1 != e1 && i2 != e2 ) { + if( toupper( static_cast<unsigned char>( *i1 ) ) != toupper( static_cast<unsigned char>( *i2 ) ) ) return false; /* cast: toupper on a negative char is undefined */ + i1++; + i2++; + } + + if( i1 == e1 && i2 == e2 ) return true; + + return false; +} + diff --git a/src/modules/processor/robotstxt/GNUmakefile b/src/modules/processor/robotstxt/GNUmakefile index d52c92e..cbfe420 100644 --- a/src/modules/processor/robotstxt/GNUmakefile +++ b/src/modules/processor/robotstxt/GNUmakefile @@ -9,18 +9,17 @@ INCLUDE_DIRS = \ -I$(TOPDIR)/include/logger \ -I$(TOPDIR)/include/util \ -I$(TOPDIR)/include/module \ - -I$(TOPDIR)/include/crawler \ - -I$(TOPDIR)/streamhtmlparser + -I$(TOPDIR)/include/crawler INCLUDE_CXXFLAGS = \ INCLUDE_LDFLAGS = \ -L$(TOPDIR)/src/libcrawler \ -
-L$(TOPDIR)/streamhtmlparser + -L$(TOPDIR)/src/libutil INCLUDE_LIBS = \ -lcrawler \ - -lstreamhtmlparser + -lutil DYNAMIC_MODULE = \ mod_processor_robotstxt.so diff --git a/src/modules/processor/robotstxt/Makefile.W32 b/src/modules/processor/robotstxt/Makefile.W32 index b67513a..ebf1e22 100755 --- a/src/modules/processor/robotstxt/Makefile.W32 +++ b/src/modules/processor/robotstxt/Makefile.W32 @@ -19,7 +19,7 @@ INCLUDE_DIRS = \ INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\streamhtmlparser\streamhtmlparser.lib \ + $(TOPDIR)\src\libutil\util.lib \ $(TOPDIR)\src\libcrawler\crawler.lib \ $(TOPDIR)\src\liblogger\logger.lib diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp index 1b7dbc8..7a91465 100644 --- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp +++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp @@ -1,8 +1,10 @@ #include "RobotsTxtProcessor.hpp" #include "Logger.hpp" +#include "StringUtils.hpp" #include #include +#include using namespace std; @@ -14,15 +16,67 @@ RobotsTxtProcessor::~RobotsTxtProcessor( ) { } +void RobotsTxtProcessor::skipSpaces( string::const_iterator &it, string::const_iterator end ) +{ + while( it != end && *it == ' ' ) it++; /* bounds check first: never dereference end */ +} + +RobotsTxtProcessor::KeywordType RobotsTxtProcessor::getKeyword( string::const_iterator &it, string::const_iterator end ) +{ + string keyword; + while( it != end && *it != ':' ) { /* bounds check first: never dereference end */ + keyword.push_back( *it ); + it++; + } + if( it == end ) return NoKeyword; + if( *it == ':' ) it++; + + if( stringicasecmp( keyword, "User-agent" ) ) return UserAgentKeyword; + if( stringicasecmp( keyword, "Disallow" ) ) return DisallowKeyword; + if( stringicasecmp( keyword, "Crawl-delay" ) ) return CrawlDelayKeyword; + if( stringicasecmp( keyword, "Sitemap" ) ) return SitemapKeyword; + return UnknownKeyword; +} + +void RobotsTxtProcessor::handleLine( const string &line ) +{ + string::const_iterator s = line.begin( ); + skipSpaces( s, line.end(
) ); + if( s == line.end( ) || *s == '#' ) return; /* blank lines and comments carry no directive */ + KeywordType key = getKeyword( s, line.end( ) ); + switch( key ) { + case UserAgentKeyword: + case DisallowKeyword: + case CrawlDelayKeyword: + break; + + case SitemapKeyword: { + skipSpaces( s, line.end( ) ); + string sitemap = string( s, line.end( ) ); + LOG( logINFO ) << "Found Sitemap '" << sitemap << "'"; + } + break; + + case UnknownKeyword: + LOG( logWARNING ) << "Ignoring unknown keyword in '" << line << "'"; + break; + + case NoKeyword: + LOG( logWARNING ) << "Ignoring syntax error in '" << line << "'"; + break; + } +} + void RobotsTxtProcessor::process( RewindInputStream *s ) { - char buf[2] = {0, 0}; + string line; + URL url = s->getBaseUrl( ); while( s->good( ) && !s->eof( ) ) { - buf[0] = s->get( ); - if( buf[0] ) { - cout << buf; + getline( *s, line ); + if( !s->fail( ) ) { /* eofbit alone still means a final unterminated line was read */ + handleLine( line ); } } } diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp index a274f2b..532c741 100644 --- a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp +++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp @@ -9,6 +9,20 @@ class RobotsTxtProcessor : public Processor { RobotsTxtProcessor( ); virtual ~RobotsTxtProcessor( ); virtual void process( RewindInputStream *s ); + + typedef enum { + UserAgentKeyword, + DisallowKeyword, + CrawlDelayKeyword, + SitemapKeyword, + UnknownKeyword, + NoKeyword + } KeywordType; + + private: + KeywordType getKeyword( std::string::const_iterator &it, std::string::const_iterator end ); + void skipSpaces( std::string::const_iterator &it, std::string::const_iterator end ); + void handleLine( const std::string &s ); }; DECLARE_MODULE( Processor ) -- cgit v1.2.3-54-g00ecf