summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Andreas Baumann <abaumann@yahoo.com> 2014-07-24 13:26:48 +0200
committer Andreas Baumann <abaumann@yahoo.com> 2014-07-24 13:26:48 +0200
commit 53b6b08956e45975c95401b5e3db0bc6a65e4c59 (patch)
tree d48dfa92b113db61d7dcca81e764198bd6902129 /src
parent aa7fa3ef71b4e9193088b67c9b34448c00a8f949 (diff)
download crawler-53b6b08956e45975c95401b5e3db0bc6a65e4c59.tar.gz
crawler-53b6b08956e45975c95401b5e3db0bc6a65e4c59.tar.bz2
..
Diffstat (limited to 'src')
-rwxr-xr-x src/crawl/crawl.cpp 2
-rw-r--r-- src/modules/processor/robotstxt/RobotsTxtProcessor.cpp 2
-rw-r--r-- src/modules/processor/sitemap/SitemapProcessor.cpp 63
-rw-r--r-- src/modules/processor/sitemap/SitemapProcessor.hpp 14
4 files changed, 17 insertions, 64 deletions
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index 4899d0f..ae610a2 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -226,7 +226,7 @@ int main( void )
s->rewind( );
robotsTxtParser->process( s );
}
- } else if( mimeType == "text/xml" ) {
+ } else if( mimeType == "application/xml" ) {
s->rewind( );
sitemapParser->process( s );
}
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
index ec5bf20..7c28544 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
@@ -76,8 +76,6 @@ void RobotsTxtProcessor::process( RewindInputStream *s )
{
string line;
- URL url = s->getBaseUrl( );
-
while( s->good( ) && !s->eof( ) ) {
getline( *s, line );
if( s->good( ) ) {
diff --git a/src/modules/processor/sitemap/SitemapProcessor.cpp b/src/modules/processor/sitemap/SitemapProcessor.cpp
index f575a0c..e168beb 100644
--- a/src/modules/processor/sitemap/SitemapProcessor.cpp
+++ b/src/modules/processor/sitemap/SitemapProcessor.cpp
@@ -1,71 +1,30 @@
-#include "HTMLLinkExtractProcessor.hpp"
+#include "SitemapProcessor.hpp"
#include "Logger.hpp"
#include <string>
#include <cstring>
using namespace std;
-using namespace streamhtmlparser;
-HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen )
- : m_normalizer( normalizer ), m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen ),
- m_parser( ), m_baseUrl( URL::Null )
+SitemapProcessor::SitemapProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen )
+ : m_normalizer( normalizer ), m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen )
{
}
-HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( )
+SitemapProcessor::~SitemapProcessor( )
{
}
-void HTMLLinkExtractProcessor::process( RewindInputStream *s )
+void SitemapProcessor::process( RewindInputStream *s )
{
- string link;
- char buf[1] = {0};
- bool in_link = false;
-
- m_baseUrl = s->getBaseUrl( );
-
+ string line;
+
while( s->good( ) && !s->eof( ) ) {
- buf[0] = s->get( );
- m_parser.Parse( buf, 1 );
-
- if( m_parser.state( ) == HtmlParser::STATE_VALUE && m_parser.tag( ) != NULL && m_parser.attribute( ) != NULL && m_parser.value( ) != NULL ) {
- if( strcmp( m_parser.tag( ), "base" ) == 0 &&
- strcmp( m_parser.attribute( ), "href" ) == 0 ) {
- m_baseUrl = m_normalizer->parseUrl( m_parser.value( ) );
- }
- if( ( ( strcmp( m_parser.tag( ), "a" ) == 0 ||
- strcmp( m_parser.tag( ), "area" ) == 0 ||
- strcmp( m_parser.tag( ), "link" ) == 0 ) &&
- strcmp( m_parser.attribute( ), "href" ) == 0 ) ||
- ( ( strcmp( m_parser.tag( ), "img" ) == 0 ||
- strcmp( m_parser.tag( ), "frame" ) == 0 ||
- strcmp( m_parser.tag( ), "iframe" ) == 0 ||
- strcmp( m_parser.tag( ), "embed" ) == 0 ) &&
- strcmp( m_parser.attribute( ), "src" ) == 0 )
- ) {
- link = m_parser.value( );
- in_link = true;
- }
- } else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) {
- URL absoluteLink = m_normalizer->normalize( m_baseUrl, link );
- if( m_filter->filter( absoluteLink ) ) {
- if( !m_urlSeen->seen( absoluteLink ) ) {
- m_frontier->addUrl( absoluteLink );
- }
- }
-
- link.clear( );
- in_link = false;
- } else if( m_parser.state( ) == HtmlParser::STATE_ERROR ) {
- // TODO: proper error handling
- cerr << endl << "ERROR at " << endl;
- m_parser.Reset( );
- return;
+ getline( *s, line );
+ if( s->good( ) ) {
+ cout << line << endl;
}
}
-
- m_parser.Reset( );
}
-REGISTER_MODULE_4( "htmllinkextract_processor", Processor, HTMLLinkExtractProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
+REGISTER_MODULE_4( "sitemap_processor", Processor, SitemapProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
diff --git a/src/modules/processor/sitemap/SitemapProcessor.hpp b/src/modules/processor/sitemap/SitemapProcessor.hpp
index d5c6bc6..7d317cf 100644
--- a/src/modules/processor/sitemap/SitemapProcessor.hpp
+++ b/src/modules/processor/sitemap/SitemapProcessor.hpp
@@ -1,5 +1,5 @@
-#ifndef __HTML_LINK_EXTRACT_PROCESSOR_H
-#define __HTML_LINK_EXTRACT_PROCESSOR_H
+#ifndef __SITEMAP_PROCESSOR_H
+#define __SITEMAP_PROCESSOR_H
#include "Processor.hpp"
#include "URLNormalizer.hpp"
@@ -8,12 +8,10 @@
#include "URLSeen.hpp"
#include "ModuleRegistry.hpp"
-#include "htmlparser_cpp.h"
-
-class HTMLLinkExtractProcessor : public Processor {
+class SitemapProcessor : public Processor {
public:
- HTMLLinkExtractProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen );
- virtual ~HTMLLinkExtractProcessor( );
+ SitemapProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen );
+ virtual ~SitemapProcessor( );
virtual void process( RewindInputStream *s );
protected:
@@ -21,8 +19,6 @@ class HTMLLinkExtractProcessor : public Processor {
Frontier *m_frontier;
URLFilter *m_filter;
URLSeen *m_urlSeen;
- streamhtmlparser::HtmlParser m_parser;
- URL m_baseUrl;
};
DECLARE_MODULE_4( Processor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )