diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2014-07-24 13:26:48 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2014-07-24 13:26:48 +0200 |
commit | 53b6b08956e45975c95401b5e3db0bc6a65e4c59 (patch) | |
tree | d48dfa92b113db61d7dcca81e764198bd6902129 /src | |
parent | aa7fa3ef71b4e9193088b67c9b34448c00a8f949 (diff) | |
download | crawler-53b6b08956e45975c95401b5e3db0bc6a65e4c59.tar.gz crawler-53b6b08956e45975c95401b5e3db0bc6a65e4c59.tar.bz2 |
..
Diffstat (limited to 'src')
-rwxr-xr-x | src/crawl/crawl.cpp | 2 | ||||
-rw-r--r-- | src/modules/processor/robotstxt/RobotsTxtProcessor.cpp | 2 | ||||
-rw-r--r-- | src/modules/processor/sitemap/SitemapProcessor.cpp | 63 | ||||
-rw-r--r-- | src/modules/processor/sitemap/SitemapProcessor.hpp | 14 |
4 files changed, 17 insertions, 64 deletions
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp index 4899d0f..ae610a2 100755 --- a/src/crawl/crawl.cpp +++ b/src/crawl/crawl.cpp @@ -226,7 +226,7 @@ int main( void ) s->rewind( ); robotsTxtParser->process( s ); } - } else if( mimeType == "text/xml" ) { + } else if( mimeType == "application/xml" ) { s->rewind( ); sitemapParser->process( s ); } diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp index ec5bf20..7c28544 100644 --- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp +++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp @@ -76,8 +76,6 @@ void RobotsTxtProcessor::process( RewindInputStream *s ) { string line; - URL url = s->getBaseUrl( ); - while( s->good( ) && !s->eof( ) ) { getline( *s, line ); if( s->good( ) ) { diff --git a/src/modules/processor/sitemap/SitemapProcessor.cpp b/src/modules/processor/sitemap/SitemapProcessor.cpp index f575a0c..e168beb 100644 --- a/src/modules/processor/sitemap/SitemapProcessor.cpp +++ b/src/modules/processor/sitemap/SitemapProcessor.cpp @@ -1,71 +1,30 @@ -#include "HTMLLinkExtractProcessor.hpp" +#include "SitemapProcessor.hpp" #include "Logger.hpp" #include <string> #include <cstring> using namespace std; -using namespace streamhtmlparser; -HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ) - : m_normalizer( normalizer ), m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen ), - m_parser( ), m_baseUrl( URL::Null ) +SitemapProcessor::SitemapProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ) + : m_normalizer( normalizer ), m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen ) { } -HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( ) +SitemapProcessor::~SitemapProcessor( ) { } -void HTMLLinkExtractProcessor::process( RewindInputStream *s ) +void SitemapProcessor::process( RewindInputStream *s ) { - string link; - char buf[1] = {0}; - bool in_link = false; - - m_baseUrl = s->getBaseUrl( ); - + string line; + while( s->good( ) && !s->eof( ) ) { - buf[0] = s->get( ); - m_parser.Parse( buf, 1 ); - - if( m_parser.state( ) == HtmlParser::STATE_VALUE && m_parser.tag( ) != NULL && m_parser.attribute( ) != NULL && m_parser.value( ) != NULL ) { - if( strcmp( m_parser.tag( ), "base" ) == 0 && - strcmp( m_parser.attribute( ), "href" ) == 0 ) { - m_baseUrl = m_normalizer->parseUrl( m_parser.value( ) ); - } - if( ( ( strcmp( m_parser.tag( ), "a" ) == 0 || - strcmp( m_parser.tag( ), "area" ) == 0 || - strcmp( m_parser.tag( ), "link" ) == 0 ) && - strcmp( m_parser.attribute( ), "href" ) == 0 ) || - ( ( strcmp( m_parser.tag( ), "img" ) == 0 || - strcmp( m_parser.tag( ), "frame" ) == 0 || - strcmp( m_parser.tag( ), "iframe" ) == 0 || - strcmp( m_parser.tag( ), "embed" ) == 0 ) && - strcmp( m_parser.attribute( ), "src" ) == 0 ) - ) { - link = m_parser.value( ); - in_link = true; - } - } else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) { - URL absoluteLink = m_normalizer->normalize( m_baseUrl, link ); - if( m_filter->filter( absoluteLink ) ) { - if( !m_urlSeen->seen( absoluteLink ) ) { - m_frontier->addUrl( absoluteLink ); - } - } - - link.clear( ); - in_link = false; - } else if( m_parser.state( ) == HtmlParser::STATE_ERROR ) { - // TODO: proper error handling - cerr << endl << "ERROR at " << endl; - m_parser.Reset( ); - return; + getline( *s, line ); + if( s->good( ) ) { + cout << line << endl; } } - - m_parser.Reset( ); } -REGISTER_MODULE_4( "htmllinkextract_processor", Processor, HTMLLinkExtractProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) +REGISTER_MODULE_4( "sitemap_processor", Processor, SitemapProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) diff --git a/src/modules/processor/sitemap/SitemapProcessor.hpp b/src/modules/processor/sitemap/SitemapProcessor.hpp index d5c6bc6..7d317cf 100644 --- a/src/modules/processor/sitemap/SitemapProcessor.hpp +++ b/src/modules/processor/sitemap/SitemapProcessor.hpp @@ -1,5 +1,5 @@ -#ifndef __HTML_LINK_EXTRACT_PROCESSOR_H -#define __HTML_LINK_EXTRACT_PROCESSOR_H +#ifndef __SITEMAP_PROCESSOR_H +#define __SITEMAP_PROCESSOR_H #include "Processor.hpp" #include "URLNormalizer.hpp" @@ -8,12 +8,10 @@ #include "URLSeen.hpp" #include "ModuleRegistry.hpp" -#include "htmlparser_cpp.h" - -class HTMLLinkExtractProcessor : public Processor { +class SitemapProcessor : public Processor { public: - HTMLLinkExtractProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ); - virtual ~HTMLLinkExtractProcessor( ); + SitemapProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ); + virtual ~SitemapProcessor( ); virtual void process( RewindInputStream *s ); protected: @@ -21,8 +19,6 @@ class HTMLLinkExtractProcessor : public Processor { Frontier *m_frontier; URLFilter *m_filter; URLSeen *m_urlSeen; - streamhtmlparser::HtmlParser m_parser; - URL m_baseUrl; }; DECLARE_MODULE_4( Processor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) |