summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Andreas Baumann <abaumann@yahoo.com> 2014-07-24 13:12:37 +0200
committer Andreas Baumann <abaumann@yahoo.com> 2014-07-24 13:12:37 +0200
commit aa7fa3ef71b4e9193088b67c9b34448c00a8f949 (patch)
tree 47c88c4189f025a809a61a906bd636e5a05372ed /src
parent a57788acee59705418b96525410b84fbee2f405a (diff)
download crawler-aa7fa3ef71b4e9193088b67c9b34448c00a8f949.tar.gz
crawler-aa7fa3ef71b4e9193088b67c9b34448c00a8f949.tar.bz2
sitemap processing (work in progress)
Diffstat (limited to 'src')
-rwxr-xr-x src/crawl/crawl.cpp 16
-rw-r--r-- src/modules/processor/GNUmakefile 2
-rwxr-xr-x src/modules/processor/Makefile.W32 2
-rw-r--r-- src/modules/processor/robotstxt/RobotsTxtProcessor.cpp 11
-rw-r--r-- src/modules/processor/robotstxt/RobotsTxtProcessor.hpp 14
-rw-r--r-- src/modules/processor/sitemap/GNUmakefile 44
-rwxr-xr-x src/modules/processor/sitemap/Makefile.W32 52
-rw-r--r-- src/modules/processor/sitemap/SitemapProcessor.cpp 71
-rw-r--r-- src/modules/processor/sitemap/SitemapProcessor.hpp 30
9 files changed, 232 insertions, 10 deletions
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index ecc8f16..4899d0f 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -55,7 +55,8 @@ BOOL WINAPI termHandler( DWORD ctrlType )
int main( void )
{
try {
- Logger::instance( ).openConsoleLog( logINFO );
+// Logger::instance( ).openConsoleLog( logINFO );
+ Logger::instance( ).openConsoleLog( logDEBUG );
#ifndef _WIN32
struct sigaction sa;
@@ -135,9 +136,11 @@ int main( void )
#ifndef _WIN32
processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
processorModules.push_back( "./modules/processor/robotstxt/mod_processor_robotstxt.so" );
+ processorModules.push_back( "./modules/processor/sitemap/mod_processor_sitemap.so" );
#else
processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" );
processorModules.push_back( ".\\modules\\processor\\robotstxt\\mod_processor_robotstxt.dll" );
+ processorModules.push_back( ".\\modules\\processor\\sitemap\\mod_processor_sitemap.dll" );
#endif
ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );
@@ -179,9 +182,12 @@ int main( void )
Processor *htmlParser = processors.create( "htmllinkextract_processor",
normalizer, frontier, chainFilter, urlSeen );
-
+
Processor *robotsTxtParser = processors.create( "robotstxt_processor",
normalizer, frontier, chainFilter, urlSeen );
+
+ Processor *sitemapParser = processors.create( "sitemap_processor",
+ normalizer, frontier, chainFilter, urlSeen );
LOG( logNOTICE ) << "Crawler started..";
@@ -206,7 +212,8 @@ int main( void )
#ifndef _WIN32
MIMEType mimeType = typeDetect->detect( s );
- if( mimeType != MIMEType::Null ) {
+ if( mimeType != MIMEType::Null ) {
+ LOG( logDEBUG ) << "MIME type of '" << url << "' is '" << mimeType << "'";
if( mimeType == "text/html" ) {
s->rewind( );
htmlParser->process( s );
@@ -219,6 +226,9 @@ int main( void )
s->rewind( );
robotsTxtParser->process( s );
}
+ } else if( mimeType == "text/xml" ) {
+ s->rewind( );
+ sitemapParser->process( s );
}
}
#else
diff --git a/src/modules/processor/GNUmakefile b/src/modules/processor/GNUmakefile
index 8b91967..48a733a 100644
--- a/src/modules/processor/GNUmakefile
+++ b/src/modules/processor/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ../../..
-SUBDIRS = htmllinkextract robotstxt
+SUBDIRS = htmllinkextract robotstxt sitemap
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/modules/processor/Makefile.W32 b/src/modules/processor/Makefile.W32
index 530fd98..0888560 100755
--- a/src/modules/processor/Makefile.W32
+++ b/src/modules/processor/Makefile.W32
@@ -1,6 +1,6 @@
TOPDIR = ..\..\..
-SUBDIRS = htmllinkextract robotstxt
+SUBDIRS = htmllinkextract robotstxt sitemap
!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
index 7a91465..ec5bf20 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
@@ -8,7 +8,8 @@
using namespace std;
-RobotsTxtProcessor::RobotsTxtProcessor( )
+RobotsTxtProcessor::RobotsTxtProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *urlFilter, URLSeen *urlSeen ) // Keeps the four collaborator pointers; handleLine() uses them to queue Sitemap URLs.
+ : m_normalizer( normalizer ), m_frontier( frontier ), m_urlFilter( urlFilter ), m_urlSeen( urlSeen ) // NOTE(review): m_urlFilter is stored but not consulted in the hunks shown here - confirm this is intended
{
}
@@ -53,7 +54,11 @@ void RobotsTxtProcessor::handleLine( const string &line )
case SitemapKeyword: {
skipSpaces( s, line.end( ) );
string sitemap = string( s, line.end( ) );
- LOG( logINFO ) << "Found Sitemap '" << sitemap << "'";
+ LOG( logINFO ) << "Found sitemap '" << sitemap << "'";
+ URL sitemapLink = m_normalizer->parseUrl( sitemap );
+ if( !m_urlSeen->seen( sitemapLink ) ) {
+ m_frontier->addUrl( sitemapLink );
+ }
}
break;
@@ -81,4 +86,4 @@ void RobotsTxtProcessor::process( RewindInputStream *s )
}
}
-REGISTER_MODULE( "robotstxt_processor", Processor, RobotsTxtProcessor )
+REGISTER_MODULE_4( "robotstxt_processor", Processor, RobotsTxtProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
index 532c741..7db6302 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp
@@ -3,10 +3,14 @@
#include "Processor.hpp"
#include "ModuleRegistry.hpp"
+#include "URLNormalizer.hpp"
+#include "Frontier.hpp"
+#include "URLFilter.hpp"
+#include "URLSeen.hpp"
class RobotsTxtProcessor : public Processor {
public:
- RobotsTxtProcessor( );
+ RobotsTxtProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *urlFilter, URLSeen *urlSeen );
virtual ~RobotsTxtProcessor( );
virtual void process( RewindInputStream *s );
@@ -19,12 +23,18 @@ class RobotsTxtProcessor : public Processor {
NoKeyword
} KeywordType;
+ protected:
+ URLNormalizer *m_normalizer;
+ Frontier *m_frontier;
+ URLFilter *m_urlFilter;
+ URLSeen *m_urlSeen;
+
private:
KeywordType getKeyword( string::const_iterator &it, string::const_iterator end );
void skipSpaces( string::const_iterator &it, string::const_iterator end );
void handleLine( const std::string &s );
};
-DECLARE_MODULE( Processor )
+DECLARE_MODULE_4( Processor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
#endif
diff --git a/src/modules/processor/sitemap/GNUmakefile b/src/modules/processor/sitemap/GNUmakefile
new file mode 100644
index 0000000..f206b35
--- /dev/null
+++ b/src/modules/processor/sitemap/GNUmakefile
@@ -0,0 +1,44 @@
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src \
+ -I$(TOPDIR)/include/logger \
+ -I$(TOPDIR)/include/util \
+ -I$(TOPDIR)/include/module \
+ -I$(TOPDIR)/include/crawler
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+ -L$(TOPDIR)/src/libcrawler
+
+INCLUDE_LIBS = \
+ -lcrawler
+
+DYNAMIC_MODULE = \
+ mod_processor_sitemap.so
+
+STATIC_LIB = \
+ libsitemapprocessor.a
+
+CPP_OBJS = \
+ SitemapProcessor.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/processor/sitemap/Makefile.W32 b/src/modules/processor/sitemap/Makefile.W32
new file mode 100755
index 0000000..115b4c8
--- /dev/null
+++ b/src/modules/processor/sitemap/Makefile.W32
@@ -0,0 +1,52 @@
+TOPDIR = ..\..\..\..
+
+SUBDIRS =
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
+
+INCLUDE_CXXFLAGS = \
+ /D_WIN32_WINNT=0x504
+
+INCLUDE_DIRS = \
+ /I. \
+ /I$(TOPDIR)\src \
+ /I$(TOPDIR)\include\module \
+ /I$(TOPDIR)\include\util \
+ /I$(TOPDIR)\include\logger \
+ /I$(TOPDIR)\include\crawler
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)\src\libcrawler\crawler.lib \
+ $(TOPDIR)\src\liblogger\logger.lib
+
+DYNAMIC_MODULE = \
+ mod_processor_sitemap.dll
+
+STATIC_LIB = \
+ sitemapprocessor.lib
+
+CPP_OBJS = \
+ SitemapProcessor.obj
+
+SHARED_CPP_OBJS = \
+ SitemapProcessor.dllobj
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+$(STATIC_LIB): $(CPP_OBJS)
+ $(LINK) /lib /nologo /out:$@ $(STATIC_LDFLAGS) $?
+
+$(DYNAMIC_MODULE): $(SHARED_CPP_OBJS)
+ $(LINK) /dll /nologo /out:$@ $(LDFLAGS) $(LIBS) $?
+
+local_all: $(STATIC_LIB) $(DYNAMIC_MODULE)
+
+local_clean:
+ @-erase $(LOCAL_STATIC_LIB) 2>NUL
+ @-erase $(CPP_OBJS) 2>NUL
+
+local_distclean:
+
+local_test:
diff --git a/src/modules/processor/sitemap/SitemapProcessor.cpp b/src/modules/processor/sitemap/SitemapProcessor.cpp
new file mode 100644
index 0000000..f575a0c
--- /dev/null
+++ b/src/modules/processor/sitemap/SitemapProcessor.cpp
@@ -0,0 +1,71 @@
+#include "SitemapProcessor.hpp" // this module's own header; the sitemap directory ships no HTMLLinkExtractProcessor.hpp
+#include "Logger.hpp"
+
+#include <string>
+#include <cstring>
+
+using namespace std;
+using namespace streamhtmlparser;
+
+HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ) // Keeps the four collaborator pointers for later process() calls.
+ : m_normalizer( normalizer ), m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen ),
+ m_parser( ), m_baseUrl( URL::Null ) // no base URL until process() takes one from the stream
+{
+}
+
+HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( ) // Empty: none of the pointers received in the constructor is deleted here.
+{
+}
+
+void HTMLLinkExtractProcessor::process( RewindInputStream *s ) // Scans the stream for href/src attribute values and adds the resulting URLs to the frontier.
+{
+ string link; // most recent candidate attribute value reported by the parser
+ char buf[1] = {0}; // the parser is fed one character per Parse() call
+ bool in_link = false; // true while 'link' holds a value that has not been handed on yet
+
+ m_baseUrl = s->getBaseUrl( ); // starting base for resolving links; a base/href attribute below replaces it
+
+ while( s->good( ) && !s->eof( ) ) {
+ buf[0] = s->get( ); // NOTE(review): the result of get() is passed on unchecked - confirm what it yields at end of stream
+ m_parser.Parse( buf, 1 );
+
+ if( m_parser.state( ) == HtmlParser::STATE_VALUE && m_parser.tag( ) != NULL && m_parser.attribute( ) != NULL && m_parser.value( ) != NULL ) {
+ if( strcmp( m_parser.tag( ), "base" ) == 0 &&
+ strcmp( m_parser.attribute( ), "href" ) == 0 ) {
+ m_baseUrl = m_normalizer->parseUrl( m_parser.value( ) ); // document-supplied base overrides the stream's base URL
+ }
+ if( ( ( strcmp( m_parser.tag( ), "a" ) == 0 || // href on a, area or link ...
+ strcmp( m_parser.tag( ), "area" ) == 0 ||
+ strcmp( m_parser.tag( ), "link" ) == 0 ) &&
+ strcmp( m_parser.attribute( ), "href" ) == 0 ) ||
+ ( ( strcmp( m_parser.tag( ), "img" ) == 0 || // ... or src on img, frame, iframe or embed
+ strcmp( m_parser.tag( ), "frame" ) == 0 ||
+ strcmp( m_parser.tag( ), "iframe" ) == 0 ||
+ strcmp( m_parser.tag( ), "embed" ) == 0 ) &&
+ strcmp( m_parser.attribute( ), "src" ) == 0 )
+ ) {
+ link = m_parser.value( ); // overwritten on every character while the parser stays in STATE_VALUE
+ in_link = true;
+ }
+ } else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) { // presumably the attribute value is complete once the parser reports STATE_TAG - TODO confirm
+ URL absoluteLink = m_normalizer->normalize( m_baseUrl, link ); // resolve the link against the current base URL
+ if( m_filter->filter( absoluteLink ) ) { // only URLs for which the filter returns true go on
+ if( !m_urlSeen->seen( absoluteLink ) ) { // URLs reported as seen are not added again
+ m_frontier->addUrl( absoluteLink );
+ }
+ }
+
+ link.clear( );
+ in_link = false;
+ } else if( m_parser.state( ) == HtmlParser::STATE_ERROR ) {
+ // TODO: proper error handling
+ cerr << endl << "ERROR at " << endl;
+ m_parser.Reset( ); // leave the parser reset before abandoning the rest of the stream
+ return;
+ }
+ }
+
+ m_parser.Reset( ); // the parser is reset at the end of every call
+}
+
+REGISTER_MODULE_4( "sitemap_processor", Processor, HTMLLinkExtractProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) // name matches processors.create( "sitemap_processor", ... ) in crawl.cpp
diff --git a/src/modules/processor/sitemap/SitemapProcessor.hpp b/src/modules/processor/sitemap/SitemapProcessor.hpp
new file mode 100644
index 0000000..d5c6bc6
--- /dev/null
+++ b/src/modules/processor/sitemap/SitemapProcessor.hpp
@@ -0,0 +1,30 @@
+#ifndef __HTML_LINK_EXTRACT_PROCESSOR_H
+#define __HTML_LINK_EXTRACT_PROCESSOR_H
+
+#include "Processor.hpp"
+#include "URLNormalizer.hpp"
+#include "Frontier.hpp"
+#include "URLFilter.hpp"
+#include "URLSeen.hpp"
+#include "ModuleRegistry.hpp"
+
+#include "htmlparser_cpp.h"
+
+class HTMLLinkExtractProcessor : public Processor { // Processor that pulls link URLs out of a stream and adds them to the frontier. NOTE(review): class name does not match SitemapProcessor.hpp - confirm a rename is planned
+ public:
+ HTMLLinkExtractProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ); // stores the four pointers as given
+ virtual ~HTMLLinkExtractProcessor( );
+ virtual void process( RewindInputStream *s ); // parses s and adds the links it finds to the frontier
+
+ protected:
+ URLNormalizer *m_normalizer; // parses URLs and resolves links against the base URL
+ Frontier *m_frontier; // receives the extracted URLs via addUrl()
+ URLFilter *m_filter; // only URLs for which filter() returns true are added
+ URLSeen *m_urlSeen; // URLs for which seen() returns true are not added
+ streamhtmlparser::HtmlParser m_parser; // fed one character at a time; Reset() at the end of process()
+ URL m_baseUrl; // base for resolving links; set anew at the start of each process() call
+};
+
+DECLARE_MODULE_4( Processor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
+
+#endif