diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2014-07-24 14:29:17 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2014-07-24 14:29:17 +0200 |
commit | 6e531b500b87c633f82fee085e0240c752cc8af0 (patch) | |
tree | 89e9214f550b24a9afa42e581b955e47ef501432 /src | |
parent | 53b6b08956e45975c95401b5e3db0bc6a65e4c59 (diff) | |
download | crawler-6e531b500b87c633f82fee085e0240c752cc8af0.tar.gz crawler-6e531b500b87c633f82fee085e0240c752cc8af0.tar.bz2 |
reading complete sitemap indexes and sitemaps
Diffstat (limited to 'src')
-rwxr-xr-x | src/crawl/crawl.cpp | 5 | ||||
-rw-r--r-- | src/modules/processor/sitemap/GNUmakefile | 3 | ||||
-rw-r--r-- | src/modules/processor/sitemap/SitemapProcessor.cpp | 74 |
3 files changed, 75 insertions, 7 deletions
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp index ae610a2..c5b893d 100755 --- a/src/crawl/crawl.cpp +++ b/src/crawl/crawl.cpp @@ -17,6 +17,7 @@ #ifndef _WIN32 #include <signal.h> +#include <unistd.h> #else #define WIN32_MEAN_AND_LEAN #endif @@ -231,10 +232,12 @@ int main( void ) sitemapParser->process( s ); } } + + sleep( 2 ); #else htmlParser->process( s ); #endif - + delete s; } diff --git a/src/modules/processor/sitemap/GNUmakefile b/src/modules/processor/sitemap/GNUmakefile index f206b35..d591999 100644 --- a/src/modules/processor/sitemap/GNUmakefile +++ b/src/modules/processor/sitemap/GNUmakefile @@ -9,7 +9,8 @@ INCLUDE_DIRS = \ -I$(TOPDIR)/include/logger \ -I$(TOPDIR)/include/util \ -I$(TOPDIR)/include/module \ - -I$(TOPDIR)/include/crawler + -I$(TOPDIR)/include/crawler \ + -I$(TOPDIR)/textwolf/include INCLUDE_CXXFLAGS = \ diff --git a/src/modules/processor/sitemap/SitemapProcessor.cpp b/src/modules/processor/sitemap/SitemapProcessor.cpp index e168beb..825010d 100644 --- a/src/modules/processor/sitemap/SitemapProcessor.cpp +++ b/src/modules/processor/sitemap/SitemapProcessor.cpp @@ -1,10 +1,14 @@ #include "SitemapProcessor.hpp" #include "Logger.hpp" +#include "textwolf.hpp" +#include "textwolf/istreamiterator.hpp" + #include <string> #include <cstring> using namespace std; +using namespace textwolf; SitemapProcessor::SitemapProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ) : m_normalizer( normalizer ), m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen ) @@ -17,14 +21,74 @@ SitemapProcessor::~SitemapProcessor( ) void SitemapProcessor::process( RewindInputStream *s ) { - string line; + IStreamIterator isitr( *s ); - while( s->good( ) && !s->eof( ) ) { - getline( *s, line ); - if( s->good( ) ) { - cout << line << endl; + typedef enum { + SITEMAPINDEX_XMLNS = 1, + SITEMAPINDEX_SITEMAP_LOC = 2, + URLSET_URL_LOC = 3 + } XmlNodes; + typedef XMLPathSelectAutomaton<charset::UTF8> Automaton; + Automaton atm; + // /sitemapindex@xmlns + (*atm)["sitemapindex"]("xmlns") = SITEMAPINDEX_XMLNS; + // /sitemapindex/sitemap/loc::text + (*atm)["sitemapindex"]["sitemap"]["loc"]( ) = SITEMAPINDEX_SITEMAP_LOC; + // /urlset/url/loc::text + (*atm)["urlset"]["url"]["loc"]( ) = URLSET_URL_LOC; + + typedef XMLPathSelect<charset::UTF8> PathSelect; + typedef XMLScanner<IStreamIterator, charset::UTF8, charset::UTF8, std::string> Scanner; + Scanner xsc( isitr ); + PathSelect xsel( &atm ); + + Scanner::iterator ci, ce; + for( ci = xsc.begin( ), ce = xsc.end( ); ci != ce; ci++ ) { + if( ci->type( ) == Scanner::ErrorOccurred ) { + throw std::runtime_error( ci->content( ) ); + } + + PathSelect::iterator itr = xsel.push( ci->type( ), ci->content( ), ci->size( ) ); + PathSelect::iterator end = xsel.end( ); + for( ; itr != end; itr++ ) { + switch( *itr ) { + case SITEMAPINDEX_XMLNS: { + string schema = ci->content( ); + string expectedSchema( "http://www.sitemaps.org/schemas/sitemap/0.9" ); + if( schema.compare( expectedSchema ) != 0 ) { + LOG( logERROR ) << "Sitemap XML has invalid doctype '" << schema << "' (expected '" + << expectedSchema << "')"; + return; + } + break; + } + + case SITEMAPINDEX_SITEMAP_LOC: { + string sitemap = ci->content( ); + LOG( logINFO ) << "Found sitemap '" << sitemap << "'"; + URL sitemapLink = m_normalizer->parseUrl( sitemap ); + if( !m_urlSeen->seen( sitemapLink ) ) { + m_frontier->addUrl( sitemapLink ); + } + break; + } + + case URLSET_URL_LOC: { + string urlString = ci->content( ); + LOG( logINFO ) << "Found URL in sitemap '" << urlString << "'"; + URL url = m_normalizer->parseUrl( urlString ); + if( !m_urlSeen->seen( url ) ) { + m_frontier->addUrl( url ); + } + break; + } + + default: + LOG( logERROR ) << "Unexpected value in XML parser: " << *itr; + } } } + } REGISTER_MODULE_4( "sitemap_processor", Processor, SitemapProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) |