diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2014-07-23 16:44:07 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2014-07-23 16:44:07 +0200 |
commit | a57788acee59705418b96525410b84fbee2f405a (patch) | |
tree | 660a828a29f0f769638ce18c1b45c62bd602e012 | |
parent | ce77513807f47bc7af59c8320932d3348aeb99ea (diff) | |
download | crawler-a57788acee59705418b96525410b84fbee2f405a.tar.gz crawler-a57788acee59705418b96525410b84fbee2f405a.tar.bz2 |
added parsing of Sitemap in robots.txt
-rw-r--r-- | include/util/StringUtils.hpp | 8 | ||||
-rwxr-xr-x | src/GNUmakefile | 2 | ||||
-rwxr-xr-x | src/crawl/GNUmakefile | 5 | ||||
-rwxr-xr-x | src/libutil/GNUmakefile | 3 | ||||
-rwxr-xr-x | src/libutil/Makefile.W32 | 2 | ||||
-rw-r--r-- | src/libutil/StringUtils.cpp | 23 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/GNUmakefile | 7 | ||||
-rwxr-xr-x | src/modules/processor/robotstxt/Makefile.W32 | 2 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.cpp | 62 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.hpp | 14 |
10 files changed, 115 insertions, 13 deletions
```diff
diff --git a/include/util/StringUtils.hpp b/include/util/StringUtils.hpp
new file mode 100644
index 0000000..c6b9f87
--- /dev/null
+++ b/include/util/StringUtils.hpp
@@ -0,0 +1,8 @@
+#ifndef __UTIL_STRING_UTILS_H
+#define __UTIL_STRING_UTILS_H
+
+#include <string>
+
+bool stringicasecmp( const std::string &s1, const std::string &s2 );
+
+#endif
diff --git a/src/GNUmakefile b/src/GNUmakefile
index db6ace8..b339ffc 100755
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -17,4 +17,4 @@ local_uninstall:
 local_test:
 
 run:
-	@LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser $(TOPDIR)/src/crawl/crawl
+	@LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/libutil:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser $(TOPDIR)/src/crawl/crawl
diff --git a/src/crawl/GNUmakefile b/src/crawl/GNUmakefile
index 5c61625..cda7513 100755
--- a/src/crawl/GNUmakefile
+++ b/src/crawl/GNUmakefile
@@ -15,10 +15,11 @@ INCLUDE_DIRS = \
 
 INCLUDE_LDFLAGS = \
 	-L$(TOPDIR)/src/liblogger \
-	-L$(TOPDIR)/src/libcrawler
+	-L$(TOPDIR)/src/libcrawler \
+	-L$(TOPDIR)/src/libutil
 
 INCLUDE_LIBS = \
-	-llogger -lcrawler
+	-llogger -lcrawler -lutil
 
 # openssl
 ifeq ($(WITH_SSL),1)
diff --git a/src/libutil/GNUmakefile b/src/libutil/GNUmakefile
index 9bb47f5..87cf711 100755
--- a/src/libutil/GNUmakefile
+++ b/src/libutil/GNUmakefile
@@ -9,7 +9,7 @@ INCLUDE_CPPFLAGS = \
 INCLUDE_LDFLAGS = \
 
 INCLUDE_DIRS = \
-	-I$(TOPDIR)/include/util
+	-I$(TOPDIR)/include
 
 INCLUDE_LIBS = \
 
@@ -21,6 +21,7 @@ DYNAMIC_LIB_MINOR = 0
 DYNAMIC_LIB_PATCH = 0
 
 CPP_OBJS = \
+	StringUtils.o
 
 -include $(TOPDIR)/makefiles/gmake/sub.mk
 
diff --git a/src/libutil/Makefile.W32 b/src/libutil/Makefile.W32
index 2a5630f..c4c8f83 100755
--- a/src/libutil/Makefile.W32
+++ b/src/libutil/Makefile.W32
@@ -17,10 +17,12 @@ INCLUDE_LDFLAGS = \
 INCLUDE_LIBS = \
 
 CPP_OBJS = \
+	StringUtils.obj \
 	win32\errormsg.obj \
 	win32\stringutils.obj
 
 DYNAMIC_CPP_OBJS = \
+	StringUtils.dllobj \
 	win32\errormsg.dllobj \
 	win32\stringutils.dllobj
 
diff --git a/src/libutil/StringUtils.cpp b/src/libutil/StringUtils.cpp
new file mode 100644
index 0000000..61769c9
--- /dev/null
+++ b/src/libutil/StringUtils.cpp
@@ -0,0 +1,23 @@
+#include "util/StringUtils.hpp"
+
+#include <algorithm>
+#include <cctype>
+
+using namespace std;
+
+bool stringicasecmp( const string &s1, const string &s2 )
+{
+	string::const_iterator i1 = s1.begin( ), e1 = s1.end( ),
+		i2 = s2.begin( ), e2 = s2.end( );
+
+	while( i1 != e1 && i2 != e2 ) {
+		if( toupper( *i1 ) != toupper( *i2 ) ) return false;
+		i1++;
+		i2++;
+	}
+
+	if( i1 == e1 && i2 == e2 ) return true;
+
+	return false;
+}
+
diff --git a/src/modules/processor/robotstxt/GNUmakefile b/src/modules/processor/robotstxt/GNUmakefile
index d52c92e..cbfe420 100644
--- a/src/modules/processor/robotstxt/GNUmakefile
+++ b/src/modules/processor/robotstxt/GNUmakefile
@@ -9,18 +9,17 @@ INCLUDE_DIRS = \
 	-I$(TOPDIR)/include/logger \
 	-I$(TOPDIR)/include/util \
 	-I$(TOPDIR)/include/module \
-	-I$(TOPDIR)/include/crawler \
-	-I$(TOPDIR)/streamhtmlparser
+	-I$(TOPDIR)/include/crawler
 
 INCLUDE_CXXFLAGS = \
 
 INCLUDE_LDFLAGS = \
 	-L$(TOPDIR)/src/libcrawler \
-	-L$(TOPDIR)/streamhtmlparser
+	-L$(TOPDIR)/src/libutil
 
 INCLUDE_LIBS = \
 	-lcrawler \
-	-lstreamhtmlparser
+	-lutil
 
 DYNAMIC_MODULE = \
 	mod_processor_robotstxt.so
diff --git a/src/modules/processor/robotstxt/Makefile.W32 b/src/modules/processor/robotstxt/Makefile.W32
index b67513a..ebf1e22 100755
--- a/src/modules/processor/robotstxt/Makefile.W32
+++ b/src/modules/processor/robotstxt/Makefile.W32
@@ -19,7 +19,7 @@ INCLUDE_DIRS = \
 INCLUDE_LDFLAGS = \
 
 INCLUDE_LIBS = \
-	$(TOPDIR)\streamhtmlparser\streamhtmlparser.lib \
+	$(TOPDIR)\src\libutil\util.lib \
 	$(TOPDIR)\src\libcrawler\crawler.lib \
 	$(TOPDIR)\src\liblogger\logger.lib
 
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
index 1b7dbc8..7a91465 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
@@ -1,8 +1,10 @@
 #include "RobotsTxtProcessor.hpp"
 #include "Logger.hpp"
+#include "StringUtils.hpp"
 
 #include <string>
 #include <cstring>
+#include <algorithm>
 
 using namespace std;
 
@@ -14,15 +16,67 @@ RobotsTxtProcessor::~RobotsTxtProcessor( )
 {
 }
 
+void RobotsTxtProcessor::skipSpaces( string::const_iterator &it, string::const_iterator end )
+{
+	while( *it == ' ' && it != end ) it++;
+}
+
+RobotsTxtProcessor::KeywordType RobotsTxtProcessor::getKeyword( string::const_iterator &it, string::const_iterator end )
+{
+	string keyword;
+	while( *it != ':' && it != end ) {
+		keyword.push_back( *it );
+		it++;
+	}
+	if( it == end ) return NoKeyword;
+	if( *it == ':' ) it++;
+
+	if( stringicasecmp( keyword, "User-agent" ) ) return UserAgentKeyword;
+	if( stringicasecmp( keyword, "Disallow" ) ) return DisallowKeyword;
+	if( stringicasecmp( keyword, "Crawl-delay" ) ) return CrawlDelayKeyword;
+	if( stringicasecmp( keyword, "Sitemap" ) ) return SitemapKeyword;
+	return UnknownKeyword;
+}
+
+void RobotsTxtProcessor::handleLine( const string &line )
+{
+	string::const_iterator s = line.begin( );
+	skipSpaces( s, line.end( ) );
+	if( *s == '#' ) return;
+	KeywordType key = getKeyword( s, line.end( ) );
+	switch( key ) {
+		case UserAgentKeyword:
+		case DisallowKeyword:
+		case CrawlDelayKeyword:
+			break;
+
+		case SitemapKeyword: {
+			skipSpaces( s, line.end( ) );
+			string sitemap = string( s, line.end( ) );
+			LOG( logINFO ) << "Found Sitemap '" << sitemap << "'";
+			}
+			break;
+
+		case UnknownKeyword:
+			LOG( logWARNING ) << "Ignoring unknown keyword in '" << line << "'";
+			break;
+
+		case NoKeyword:
+			LOG( logWARNING ) << "Ingoring syntax error in '" << line << "'";
+			break;
+	}
+}
+
 void RobotsTxtProcessor::process( RewindInputStream *s )
 {
-	char buf[2] = {0, 0};
+	string line;
+
 	URL url = s->getBaseUrl( );
 
 	while( s->good( ) && !s->eof( ) ) {
-		buf[0] = s->get( );
-		if( buf[0] ) {
-			cout << buf;
+		getline( *s, line );
+		if( s->good( ) ) {
+			handleLine( line );
 		}
 	}
 }
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
index a274f2b..532c741 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
@@ -9,6 +9,20 @@ class RobotsTxtProcessor : public Processor {
 		RobotsTxtProcessor( );
 		virtual ~RobotsTxtProcessor( );
 		virtual void process( RewindInputStream *s );
+
+		typedef enum {
+			UserAgentKeyword,
+			DisallowKeyword,
+			CrawlDelayKeyword,
+			SitemapKeyword,
+			UnknownKeyword,
+			NoKeyword
+		} KeywordType;
+
+	private:
+		KeywordType getKeyword( string::const_iterator &it, string::const_iterator end );
+		void skipSpaces( string::const_iterator &it, string::const_iterator end );
+		void handleLine( const std::string &s );
 };
 
 DECLARE_MODULE( Processor )
```