summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2014-07-23 16:44:07 +0200
committerAndreas Baumann <abaumann@yahoo.com>2014-07-23 16:44:07 +0200
commita57788acee59705418b96525410b84fbee2f405a (patch)
tree660a828a29f0f769638ce18c1b45c62bd602e012
parentce77513807f47bc7af59c8320932d3348aeb99ea (diff)
downloadcrawler-a57788acee59705418b96525410b84fbee2f405a.tar.gz
crawler-a57788acee59705418b96525410b84fbee2f405a.tar.bz2
added parsing of Sitemap in robots.txt
-rw-r--r--include/util/StringUtils.hpp8
-rwxr-xr-xsrc/GNUmakefile2
-rwxr-xr-xsrc/crawl/GNUmakefile5
-rwxr-xr-xsrc/libutil/GNUmakefile3
-rwxr-xr-xsrc/libutil/Makefile.W322
-rw-r--r--src/libutil/StringUtils.cpp23
-rw-r--r--src/modules/processor/robotstxt/GNUmakefile7
-rwxr-xr-xsrc/modules/processor/robotstxt/Makefile.W322
-rw-r--r--src/modules/processor/robotstxt/RobotsTxtProcessor.cpp62
-rw-r--r--src/modules/processor/robotstxt/RobotsTxtProcessor.hpp14
10 files changed, 115 insertions, 13 deletions
diff --git a/include/util/StringUtils.hpp b/include/util/StringUtils.hpp
new file mode 100644
index 0000000..c6b9f87
--- /dev/null
+++ b/include/util/StringUtils.hpp
@@ -0,0 +1,8 @@
+#ifndef UTIL_STRING_UTILS_H
+#define UTIL_STRING_UTILS_H
+
+#include <string>
+
+/**
+ * Compares two strings for equality without regard to the case of letters.
+ *
+ * @param s1 first string
+ * @param s2 second string
+ * @return true if both strings have the same length and all characters
+ *         match after upper-casing, false otherwise
+ */
+bool stringicasecmp( const std::string &s1, const std::string &s2 );
+
+#endif
diff --git a/src/GNUmakefile b/src/GNUmakefile
index db6ace8..b339ffc 100755
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -17,4 +17,4 @@ local_uninstall:
local_test:
run:
- @LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser $(TOPDIR)/src/crawl/crawl
+ @LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/libutil:$(TOPDIR)/src/liblogger:$(TOPDIR)/src/libcrawler:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser $(TOPDIR)/src/crawl/crawl
diff --git a/src/crawl/GNUmakefile b/src/crawl/GNUmakefile
index 5c61625..cda7513 100755
--- a/src/crawl/GNUmakefile
+++ b/src/crawl/GNUmakefile
@@ -15,10 +15,11 @@ INCLUDE_DIRS = \
INCLUDE_LDFLAGS = \
-L$(TOPDIR)/src/liblogger \
- -L$(TOPDIR)/src/libcrawler
+ -L$(TOPDIR)/src/libcrawler \
+ -L$(TOPDIR)/src/libutil
INCLUDE_LIBS = \
- -llogger -lcrawler
+ -llogger -lcrawler -lutil
# openssl
ifeq ($(WITH_SSL),1)
diff --git a/src/libutil/GNUmakefile b/src/libutil/GNUmakefile
index 9bb47f5..87cf711 100755
--- a/src/libutil/GNUmakefile
+++ b/src/libutil/GNUmakefile
@@ -9,7 +9,7 @@ INCLUDE_CPPFLAGS = \
INCLUDE_LDFLAGS = \
INCLUDE_DIRS = \
- -I$(TOPDIR)/include/util
+ -I$(TOPDIR)/include
INCLUDE_LIBS = \
@@ -21,6 +21,7 @@ DYNAMIC_LIB_MINOR = 0
DYNAMIC_LIB_PATCH = 0
CPP_OBJS = \
+ StringUtils.o
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/libutil/Makefile.W32 b/src/libutil/Makefile.W32
index 2a5630f..c4c8f83 100755
--- a/src/libutil/Makefile.W32
+++ b/src/libutil/Makefile.W32
@@ -17,10 +17,12 @@ INCLUDE_LDFLAGS = \
INCLUDE_LIBS = \
CPP_OBJS = \
+ StringUtils.obj \
win32\errormsg.obj \
win32\stringutils.obj
DYNAMIC_CPP_OBJS = \
+ StringUtils.dllobj \
win32\errormsg.dllobj \
win32\stringutils.dllobj
diff --git a/src/libutil/StringUtils.cpp b/src/libutil/StringUtils.cpp
new file mode 100644
index 0000000..61769c9
--- /dev/null
+++ b/src/libutil/StringUtils.cpp
@@ -0,0 +1,23 @@
+#include "util/StringUtils.hpp"
+
+#include <algorithm>
+#include <cctype>
+
+using namespace std;
+
+/**
+ * Compares two strings for equality without regard to the case of letters.
+ *
+ * @param s1 first string
+ * @param s2 second string
+ * @return true if both strings have the same length and all characters
+ *         match after upper-casing, false otherwise
+ */
+bool stringicasecmp( const std::string &s1, const std::string &s2 )
+{
+    // strings of different length can never be equal
+    if( s1.size( ) != s2.size( ) ) return false;
+
+    for( std::string::size_type i = 0; i < s1.size( ); ++i ) {
+        // toupper is undefined for negative arguments other than EOF, so
+        // go through unsigned char (matters for bytes >= 0x80 when char
+        // is a signed type)
+        if( std::toupper( static_cast<unsigned char>( s1[i] ) ) !=
+            std::toupper( static_cast<unsigned char>( s2[i] ) ) ) return false;
+    }
+
+    return true;
+}
+
diff --git a/src/modules/processor/robotstxt/GNUmakefile b/src/modules/processor/robotstxt/GNUmakefile
index d52c92e..cbfe420 100644
--- a/src/modules/processor/robotstxt/GNUmakefile
+++ b/src/modules/processor/robotstxt/GNUmakefile
@@ -9,18 +9,17 @@ INCLUDE_DIRS = \
-I$(TOPDIR)/include/logger \
-I$(TOPDIR)/include/util \
-I$(TOPDIR)/include/module \
- -I$(TOPDIR)/include/crawler \
- -I$(TOPDIR)/streamhtmlparser
+ -I$(TOPDIR)/include/crawler
INCLUDE_CXXFLAGS = \
INCLUDE_LDFLAGS = \
-L$(TOPDIR)/src/libcrawler \
- -L$(TOPDIR)/streamhtmlparser
+ -L$(TOPDIR)/src/libutil
INCLUDE_LIBS = \
-lcrawler \
- -lstreamhtmlparser
+ -lutil
DYNAMIC_MODULE = \
mod_processor_robotstxt.so
diff --git a/src/modules/processor/robotstxt/Makefile.W32 b/src/modules/processor/robotstxt/Makefile.W32
index b67513a..ebf1e22 100755
--- a/src/modules/processor/robotstxt/Makefile.W32
+++ b/src/modules/processor/robotstxt/Makefile.W32
@@ -19,7 +19,7 @@ INCLUDE_DIRS = \
INCLUDE_LDFLAGS = \
INCLUDE_LIBS = \
- $(TOPDIR)\streamhtmlparser\streamhtmlparser.lib \
+ $(TOPDIR)\src\libutil\util.lib \
$(TOPDIR)\src\libcrawler\crawler.lib \
$(TOPDIR)\src\liblogger\logger.lib
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
index 1b7dbc8..7a91465 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
@@ -1,8 +1,10 @@
#include "RobotsTxtProcessor.hpp"
#include "Logger.hpp"
+#include "StringUtils.hpp"
#include <string>
#include <cstring>
+#include <algorithm>
using namespace std;
@@ -14,15 +16,67 @@ RobotsTxtProcessor::~RobotsTxtProcessor( )
{
}
+/**
+ * Advances 'it' over consecutive blanks (spaces and tabs).
+ *
+ * @param it  position to start at; on return points to the first
+ *            non-blank character or equals 'end'
+ * @param end end of the range to scan
+ */
+void RobotsTxtProcessor::skipSpaces( string::const_iterator &it, string::const_iterator end )
+{
+    // the end check must come first, 'it' may only be dereferenced
+    // while it is inside the range
+    while( it != end && ( *it == ' ' || *it == '\t' ) ) ++it;
+}
+
+/**
+ * Reads the keyword in front of the ':' of a robots.txt line.
+ *
+ * @param it  start of the keyword; on return points behind the ':' or
+ *            equals 'end' if the line contains no ':'
+ * @param end end of the line
+ * @return the type of the keyword (compared case-insensitively),
+ *         UnknownKeyword for an unrecognized keyword or NoKeyword if
+ *         there is no ':' in the line
+ */
+RobotsTxtProcessor::KeywordType RobotsTxtProcessor::getKeyword( string::const_iterator &it, string::const_iterator end )
+{
+    string keyword;
+    // the end check must come first, 'it' may only be dereferenced
+    // while it is inside the range
+    while( it != end && *it != ':' ) {
+        keyword.push_back( *it );
+        ++it;
+    }
+    if( it == end ) return NoKeyword;
+    ++it; // step over the ':'
+
+    // blanks between the keyword and the ':' are not part of the keyword
+    string::size_type last = keyword.find_last_not_of( " \t" );
+    keyword.erase( last == string::npos ? 0 : last + 1 );
+
+    if( stringicasecmp( keyword, "User-agent" ) ) return UserAgentKeyword;
+    if( stringicasecmp( keyword, "Disallow" ) ) return DisallowKeyword;
+    if( stringicasecmp( keyword, "Crawl-delay" ) ) return CrawlDelayKeyword;
+    if( stringicasecmp( keyword, "Sitemap" ) ) return SitemapKeyword;
+    return UnknownKeyword;
+}
+
+/**
+ * Handles one line of a robots.txt file.
+ *
+ * Empty lines, blank lines and comment lines (first non-blank character
+ * is '#') are ignored silently. For a 'Sitemap' line the value is
+ * logged; 'User-agent', 'Disallow' and 'Crawl-delay' are recognized but
+ * not evaluated. Everything else is reported as a warning.
+ *
+ * @param line the line without its terminating newline
+ */
+void RobotsTxtProcessor::handleLine( const string &line )
+{
+    // cut off trailing blanks and the '\r' left over from CRLF line ends
+    string::const_iterator end = line.end( );
+    while( end != line.begin( ) &&
+        ( *( end - 1 ) == ' ' || *( end - 1 ) == '\t' || *( end - 1 ) == '\r' ) ) --end;
+
+    // nothing left: empty or blank line
+    if( end == line.begin( ) ) return;
+
+    string::const_iterator s = line.begin( );
+    skipSpaces( s, end );
+    if( s == end || *s == '#' ) return;
+
+    KeywordType key = getKeyword( s, end );
+    switch( key ) {
+        case UserAgentKeyword:
+        case DisallowKeyword:
+        case CrawlDelayKeyword:
+            break;
+
+        case SitemapKeyword: {
+            // nothing to skip when the value is empty
+            if( s != end ) skipSpaces( s, end );
+            string sitemap( s, end );
+            LOG( logINFO ) << "Found Sitemap '" << sitemap << "'";
+            }
+            break;
+
+        case UnknownKeyword:
+            LOG( logWARNING ) << "Ignoring unknown keyword in '" << line << "'";
+            break;
+
+        case NoKeyword:
+            LOG( logWARNING ) << "Ignoring syntax error in '" << line << "'";
+            break;
+    }
+}
+
+/**
+ * Reads the stream line by line and passes every line to handleLine.
+ *
+ * @param s stream to read the robots.txt content from
+ */
void RobotsTxtProcessor::process( RewindInputStream *s )
{
- char buf[2] = {0, 0};
+ string line;
+
URL url = s->getBaseUrl( );
- while( s->good( ) && !s->eof( ) ) {
- buf[0] = s->get( );
- if( buf[0] ) {
- cout << buf;
- }
+ // loop on the result of getline itself: for a last line without a
+ // terminating newline getline sets eofbit but still delivers the
+ // line, so testing good( ) afterwards would drop that line
+ while( getline( *s, line ) ) {
+ handleLine( line );
}
}
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
index a274f2b..532c741 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
@@ -9,6 +9,20 @@ class RobotsTxtProcessor : public Processor {
RobotsTxtProcessor( );
virtual ~RobotsTxtProcessor( );
virtual void process( RewindInputStream *s );
+
+ /** Result of parsing the keyword of a robots.txt line. */
+ typedef enum {
+ UserAgentKeyword, /**< 'User-agent' */
+ DisallowKeyword, /**< 'Disallow' */
+ CrawlDelayKeyword, /**< 'Crawl-delay' */
+ SitemapKeyword, /**< 'Sitemap' */
+ UnknownKeyword, /**< a keyword not listed above */
+ NoKeyword /**< the line contains no ':' */
+ } KeywordType;
+
+ private:
+ /** Reads the keyword up to the ':' starting at 'it' and classifies it. */
+ KeywordType getKeyword( std::string::const_iterator &it, std::string::const_iterator end );
+ /** Advances 'it' over blanks, at most up to 'end'. */
+ void skipSpaces( std::string::const_iterator &it, std::string::const_iterator end );
+ /** Handles one line of a robots.txt file. */
+ void handleLine( const std::string &s );
};
DECLARE_MODULE( Processor )