diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2014-07-24 13:12:37 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2014-07-24 13:12:37 +0200 |
commit | aa7fa3ef71b4e9193088b67c9b34448c00a8f949 (patch) | |
tree | 47c88c4189f025a809a61a906bd636e5a05372ed /src | |
parent | a57788acee59705418b96525410b84fbee2f405a (diff) | |
download | crawler-aa7fa3ef71b4e9193088b67c9b34448c00a8f949.tar.gz crawler-aa7fa3ef71b4e9193088b67c9b34448c00a8f949.tar.bz2 |
sitemap processing (work in progress)
Diffstat (limited to 'src')
-rwxr-xr-x | src/crawl/crawl.cpp | 16 | ||||
-rw-r--r-- | src/modules/processor/GNUmakefile | 2 | ||||
-rwxr-xr-x | src/modules/processor/Makefile.W32 | 2 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.cpp | 11 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.hpp | 14 | ||||
-rw-r--r-- | src/modules/processor/sitemap/GNUmakefile | 44 | ||||
-rwxr-xr-x | src/modules/processor/sitemap/Makefile.W32 | 52 | ||||
-rw-r--r-- | src/modules/processor/sitemap/SitemapProcessor.cpp | 71 | ||||
-rw-r--r-- | src/modules/processor/sitemap/SitemapProcessor.hpp | 30 |
9 files changed, 232 insertions, 10 deletions
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp index ecc8f16..4899d0f 100755 --- a/src/crawl/crawl.cpp +++ b/src/crawl/crawl.cpp @@ -55,7 +55,8 @@ BOOL WINAPI termHandler( DWORD ctrlType ) int main( void ) { try { - Logger::instance( ).openConsoleLog( logINFO ); +// Logger::instance( ).openConsoleLog( logINFO ); + Logger::instance( ).openConsoleLog( logDEBUG ); #ifndef _WIN32 struct sigaction sa; @@ -135,9 +136,11 @@ int main( void ) #ifndef _WIN32 processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" ); processorModules.push_back( "./modules/processor/robotstxt/mod_processor_robotstxt.so" ); + processorModules.push_back( "./modules/processor/sitemap/mod_processor_sitemap.so" ); #else processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" ); processorModules.push_back( ".\\modules\\processor\\robotstxt\\mod_processor_robotstxt.dll" ); + processorModules.push_back( ".\\modules\\processor\\sitemap\\mod_processor_sitemap.dll" ); #endif ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules ); @@ -179,9 +182,12 @@ int main( void ) Processor *htmlParser = processors.create( "htmllinkextract_processor", normalizer, frontier, chainFilter, urlSeen ); - + Processor *robotsTxtParser = processors.create( "robotstxt_processor", normalizer, frontier, chainFilter, urlSeen ); + + Processor *sitemapParser = processors.create( "sitemap_processor", + normalizer, frontier, chainFilter, urlSeen ); LOG( logNOTICE ) << "Crawler started.."; @@ -206,7 +212,8 @@ int main( void ) #ifndef _WIN32 MIMEType mimeType = typeDetect->detect( s ); - if( mimeType != MIMEType::Null ) { + if( mimeType != MIMEType::Null ) { + LOG( logDEBUG ) << "MIME type of '" << url << "' is '" << mimeType << "'"; if( mimeType == "text/html" ) { s->rewind( ); htmlParser->process( s ); @@ -219,6 +226,9 @@ int main( void ) s->rewind( ); robotsTxtParser->process( s ); } + } else if( mimeType == "text/xml" ) { + s->rewind( ); + sitemapParser->process( s ); } } #else diff --git a/src/modules/processor/GNUmakefile b/src/modules/processor/GNUmakefile index 8b91967..48a733a 100644 --- a/src/modules/processor/GNUmakefile +++ b/src/modules/processor/GNUmakefile @@ -1,6 +1,6 @@ TOPDIR = ../../.. -SUBDIRS = htmllinkextract robotstxt +SUBDIRS = htmllinkextract robotstxt sitemap -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/src/modules/processor/Makefile.W32 b/src/modules/processor/Makefile.W32 index 530fd98..0888560 100755 --- a/src/modules/processor/Makefile.W32 +++ b/src/modules/processor/Makefile.W32 @@ -1,6 +1,6 @@ TOPDIR = ..\..\.. -SUBDIRS = htmllinkextract robotstxt +SUBDIRS = htmllinkextract robotstxt sitemap !INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp index 7a91465..ec5bf20 100644 --- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp +++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp @@ -8,7 +8,8 @@ using namespace std; -RobotsTxtProcessor::RobotsTxtProcessor( ) +RobotsTxtProcessor::RobotsTxtProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *urlFilter, URLSeen *urlSeen ) + : m_normalizer( normalizer ), m_frontier( frontier ), m_urlFilter( urlFilter ), m_urlSeen( urlSeen ) { } @@ -53,7 +54,11 @@ void RobotsTxtProcessor::handleLine( const string &line ) case SitemapKeyword: { skipSpaces( s, line.end( ) ); string sitemap = string( s, line.end( ) ); - LOG( logINFO ) << "Found Sitemap '" << sitemap << "'"; + LOG( logINFO ) << "Found sitemap '" << sitemap << "'"; + URL sitemapLink = m_normalizer->parseUrl( sitemap ); + if( !m_urlSeen->seen( sitemapLink ) ) { + m_frontier->addUrl( sitemapLink ); + } } break; @@ -81,4 +86,4 @@ void RobotsTxtProcessor::process( RewindInputStream *s ) } } -REGISTER_MODULE( "robotstxt_processor", Processor, RobotsTxtProcessor ) +REGISTER_MODULE_4( "robotstxt_processor", Processor, RobotsTxtProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp index 532c741..7db6302 100644 --- a/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp +++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.hpp @@ -3,10 +3,14 @@ #include "Processor.hpp" #include "ModuleRegistry.hpp" +#include "URLNormalizer.hpp" +#include "Frontier.hpp" +#include "URLFilter.hpp" +#include "URLSeen.hpp" class RobotsTxtProcessor : public Processor { public: - RobotsTxtProcessor( ); + RobotsTxtProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *urlFilter, URLSeen *urlSeen ); virtual ~RobotsTxtProcessor( ); virtual void process( RewindInputStream *s ); @@ -19,12 +23,18 @@ class RobotsTxtProcessor : public Processor { NoKeyword } KeywordType; + protected: + URLNormalizer *m_normalizer; + Frontier *m_frontier; + URLFilter *m_urlFilter; + URLSeen *m_urlSeen; + private: KeywordType getKeyword( string::const_iterator &it, string::const_iterator end ); void skipSpaces( string::const_iterator &it, string::const_iterator end ); void handleLine( const std::string &s ); }; -DECLARE_MODULE( Processor ) +DECLARE_MODULE_4( Processor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) #endif diff --git a/src/modules/processor/sitemap/GNUmakefile b/src/modules/processor/sitemap/GNUmakefile new file mode 100644 index 0000000..f206b35 --- /dev/null +++ b/src/modules/processor/sitemap/GNUmakefile @@ -0,0 +1,44 @@ +TOPDIR = ../../../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_DIRS = \ + -I. -I$(TOPDIR)/src \ + -I$(TOPDIR)/include/logger \ + -I$(TOPDIR)/include/util \ + -I$(TOPDIR)/include/module \ + -I$(TOPDIR)/include/crawler + +INCLUDE_CXXFLAGS = \ + +INCLUDE_LDFLAGS = \ + -L$(TOPDIR)/src/libcrawler + +INCLUDE_LIBS = \ + -lcrawler + +DYNAMIC_MODULE = \ + mod_processor_sitemap.so + +STATIC_LIB = \ + libsitemapprocessor.a + +CPP_OBJS = \ + SitemapProcessor.o + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: + diff --git a/src/modules/processor/sitemap/Makefile.W32 b/src/modules/processor/sitemap/Makefile.W32 new file mode 100755 index 0000000..115b4c8 --- /dev/null +++ b/src/modules/processor/sitemap/Makefile.W32 @@ -0,0 +1,52 @@ +TOPDIR = ..\..\..\.. + +SUBDIRS = + +!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk + +INCLUDE_CXXFLAGS = \ + /D_WIN32_WINNT=0x504 + +INCLUDE_DIRS = \ + /I. \ + /I$(TOPDIR)\src \ + /I$(TOPDIR)\include\module \ + /I$(TOPDIR)\include\util \ + /I$(TOPDIR)\include\logger \ + /I$(TOPDIR)\include\crawler + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)\src\libcrawler\crawler.lib \ + $(TOPDIR)\src\liblogger\logger.lib + +DYNAMIC_MODULE = \ + mod_processor_sitemap.dll + +STATIC_LIB = \ + sitemapprocessor.lib + +CPP_OBJS = \ + SitemapProcessor.obj + +SHARED_CPP_OBJS = \ + HTMLLinkExtractProcessor.dllobj + +!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk + +$(STATIC_LIB): $(CPP_OBJS) + $(LINK) /lib /nologo /out:$@ $(STATIC_LDFLAGS) $? + +$(DYNAMIC_MODULE): $(SHARED_CPP_OBJS) + $(LINK) /dll /nologo /out:$@ $(LDFLAGS) $(LIBS) $? + +local_all: $(STATIC_LIB) $(DYNAMIC_MODULE) + +local_clean: + @-erase $(LOCAL_STATIC_LIB) 2>NUL + @-erase $(CPP_OBJS) 2>NUL + +local_distclean: + +local_test: diff --git a/src/modules/processor/sitemap/SitemapProcessor.cpp b/src/modules/processor/sitemap/SitemapProcessor.cpp new file mode 100644 index 0000000..f575a0c --- /dev/null +++ b/src/modules/processor/sitemap/SitemapProcessor.cpp @@ -0,0 +1,71 @@ +#include "HTMLLinkExtractProcessor.hpp" +#include "Logger.hpp" + +#include <string> +#include <cstring> + +using namespace std; +using namespace streamhtmlparser; + +HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ) + : m_normalizer( normalizer ), m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen ), + m_parser( ), m_baseUrl( URL::Null ) +{ +} + +HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( ) +{ +} + +void HTMLLinkExtractProcessor::process( RewindInputStream *s ) +{ + string link; + char buf[1] = {0}; + bool in_link = false; + + m_baseUrl = s->getBaseUrl( ); + + while( s->good( ) && !s->eof( ) ) { + buf[0] = s->get( ); + m_parser.Parse( buf, 1 ); + + if( m_parser.state( ) == HtmlParser::STATE_VALUE && m_parser.tag( ) != NULL && m_parser.attribute( ) != NULL && m_parser.value( ) != NULL ) { + if( strcmp( m_parser.tag( ), "base" ) == 0 && + strcmp( m_parser.attribute( ), "href" ) == 0 ) { + m_baseUrl = m_normalizer->parseUrl( m_parser.value( ) ); + } + if( ( ( strcmp( m_parser.tag( ), "a" ) == 0 || + strcmp( m_parser.tag( ), "area" ) == 0 || + strcmp( m_parser.tag( ), "link" ) == 0 ) && + strcmp( m_parser.attribute( ), "href" ) == 0 ) || + ( ( strcmp( m_parser.tag( ), "img" ) == 0 || + strcmp( m_parser.tag( ), "frame" ) == 0 || + strcmp( m_parser.tag( ), "iframe" ) == 0 || + strcmp( m_parser.tag( ), "embed" ) == 0 ) && + strcmp( m_parser.attribute( ), "src" ) == 0 ) + ) { + link = m_parser.value( ); + in_link = true; + } + } else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) { + URL absoluteLink = m_normalizer->normalize( m_baseUrl, link ); + if( m_filter->filter( absoluteLink ) ) { + if( !m_urlSeen->seen( absoluteLink ) ) { + m_frontier->addUrl( absoluteLink ); + } + } + + link.clear( ); + in_link = false; + } else if( m_parser.state( ) == HtmlParser::STATE_ERROR ) { + // TODO: proper error handling + cerr << endl << "ERROR at " << endl; + m_parser.Reset( ); + return; + } + } + + m_parser.Reset( ); +} + +REGISTER_MODULE_4( "htmllinkextract_processor", Processor, HTMLLinkExtractProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) diff --git a/src/modules/processor/sitemap/SitemapProcessor.hpp b/src/modules/processor/sitemap/SitemapProcessor.hpp new file mode 100644 index 0000000..d5c6bc6 --- /dev/null +++ b/src/modules/processor/sitemap/SitemapProcessor.hpp @@ -0,0 +1,30 @@ +#ifndef __HTML_LINK_EXTRACT_PROCESSOR_H +#define __HTML_LINK_EXTRACT_PROCESSOR_H + +#include "Processor.hpp" +#include "URLNormalizer.hpp" +#include "Frontier.hpp" +#include "URLFilter.hpp" +#include "URLSeen.hpp" +#include "ModuleRegistry.hpp" + +#include "htmlparser_cpp.h" + +class HTMLLinkExtractProcessor : public Processor { + public: + HTMLLinkExtractProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ); + virtual ~HTMLLinkExtractProcessor( ); + virtual void process( RewindInputStream *s ); + + protected: + URLNormalizer *m_normalizer; + Frontier *m_frontier; + URLFilter *m_filter; + URLSeen *m_urlSeen; + streamhtmlparser::HtmlParser m_parser; + URL m_baseUrl; +}; + +DECLARE_MODULE_4( Processor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) + +#endif |