..

author: Andreas Baumann <abaumann@yahoo.com> 2014-07-24 13:26:48 +0200
committer: Andreas Baumann <abaumann@yahoo.com> 2014-07-24 13:26:48 +0200
commit: 53b6b08956e45975c95401b5e3db0bc6a65e4c59 (patch)
tree: d48dfa92b113db61d7dcca81e764198bd6902129 /src
parent: aa7fa3ef71b4e9193088b67c9b34448c00a8f949 (diff)
download: crawler-53b6b08956e45975c95401b5e3db0bc6a65e4c59.tar.gz crawler-53b6b08956e45975c95401b5e3db0bc6a65e4c59.tar.bz2
4 files changed, 17 insertions, 64 deletions
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index 4899d0f..ae610a2 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -226,7 +226,7 @@ int main( void )
 						s->rewind( );
 						robotsTxtParser->process( s );
 					}
-				} else if( mimeType == "text/xml" ) {
+				} else if( mimeType == "application/xml" ) {
 					s->rewind( );
 					sitemapParser->process( s );
 				}
diff --git a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
index ec5bf20..7c28544 100644
--- a/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
+++ b/src/modules/processor/robotstxt/RobotsTxtProcessor.cpp
@@ -76,8 +76,6 @@ void RobotsTxtProcessor::process( RewindInputStream *s )
 {
 	string line;
 
-	URL url = s->getBaseUrl( );
-	
 	while( s->good( ) && !s->eof( ) ) {
 		getline( *s, line );
 		if( s->good( ) ) {
diff --git a/src/modules/processor/sitemap/SitemapProcessor.cpp b/src/modules/processor/sitemap/SitemapProcessor.cpp
index f575a0c..e168beb 100644
--- a/src/modules/processor/sitemap/SitemapProcessor.cpp
+++ b/src/modules/processor/sitemap/SitemapProcessor.cpp
@@ -1,71 +1,30 @@
-#include "HTMLLinkExtractProcessor.hpp"
+#include "SitemapProcessor.hpp"
 #include "Logger.hpp"
 
 #include <string>
 #include <cstring>
 
 using namespace std;
-using namespace streamhtmlparser;
 
-HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen )
-	: m_normalizer( normalizer ), m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen ),
-	m_parser( ), m_baseUrl( URL::Null )
+SitemapProcessor::SitemapProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen )
+	: m_normalizer( normalizer ), m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen )
 {
 }
 
-HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( )
+SitemapProcessor::~SitemapProcessor( )
 {
 }
 
-void HTMLLinkExtractProcessor::process( RewindInputStream *s )
+void SitemapProcessor::process( RewindInputStream *s )
 {
-	string link;
-	char buf[1] = {0};
-	bool in_link = false;
-	
-	m_baseUrl = s->getBaseUrl( );
-	
+	string line;
+
 	while( s->good( ) && !s->eof( ) ) {
-		buf[0] = s->get( );
-		m_parser.Parse( buf, 1 );
-				
-		if( m_parser.state( ) == HtmlParser::STATE_VALUE && m_parser.tag( ) != NULL && m_parser.attribute( ) != NULL && m_parser.value( ) != NULL ) {
-			if( 	strcmp( m_parser.tag( ), "base" ) == 0 &&
-				strcmp( m_parser.attribute( ), "href" ) == 0 ) {
-				m_baseUrl = m_normalizer->parseUrl( m_parser.value( ) );
-			}
-			if( 	( ( 	strcmp( m_parser.tag( ), "a" ) == 0 ||
-					strcmp( m_parser.tag( ), "area" ) == 0 ||
-					strcmp( m_parser.tag( ), "link" ) == 0 ) &&
-					strcmp( m_parser.attribute( ), "href" ) == 0 ) || 
-				( (	strcmp( m_parser.tag( ), "img" ) == 0 ||
-					strcmp( m_parser.tag( ), "frame" ) == 0 ||
-					strcmp( m_parser.tag( ), "iframe" ) == 0 ||
-					strcmp( m_parser.tag( ), "embed" ) == 0 ) &&
-					strcmp( m_parser.attribute( ), "src" ) == 0 )
-				) {				
-				link = m_parser.value( );
-				in_link = true;
-			}
-		} else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) {
-			URL absoluteLink = m_normalizer->normalize( m_baseUrl, link );
-			if( m_filter->filter( absoluteLink ) ) {
-				if( !m_urlSeen->seen( absoluteLink ) ) {
-					m_frontier->addUrl( absoluteLink );
-				}
-			}
-			
-			link.clear( );
-			in_link = false;
-		} else if( m_parser.state( ) == HtmlParser::STATE_ERROR ) {
-			// TODO: proper error handling
-			cerr << endl << "ERROR at " << endl;
-			m_parser.Reset( );
-			return;
+		getline( *s, line );
+		if( s->good( ) ) {
+			cout << line << endl;
 		}
 	}
-
-	m_parser.Reset( );
 }
 
-REGISTER_MODULE_4( "htmllinkextract_processor", Processor, HTMLLinkExtractProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
+REGISTER_MODULE_4( "sitemap_processor", Processor, SitemapProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
diff --git a/src/modules/processor/sitemap/SitemapProcessor.hpp b/src/modules/processor/sitemap/SitemapProcessor.hpp
index d5c6bc6..7d317cf 100644
--- a/src/modules/processor/sitemap/SitemapProcessor.hpp
+++ b/src/modules/processor/sitemap/SitemapProcessor.hpp
@@ -1,5 +1,5 @@
-#ifndef __HTML_LINK_EXTRACT_PROCESSOR_H
-#define __HTML_LINK_EXTRACT_PROCESSOR_H
+#ifndef __SITEMAP_PROCESSOR_H
+#define __SITEMAP_PROCESSOR_H
 
 #include "Processor.hpp"
 #include "URLNormalizer.hpp"
@@ -8,12 +8,10 @@
 #include "URLSeen.hpp"
 #include "ModuleRegistry.hpp"
 
-#include "htmlparser_cpp.h"
-
-class HTMLLinkExtractProcessor : public Processor {
+class SitemapProcessor : public Processor {
 	public:
-		HTMLLinkExtractProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen );
-		virtual ~HTMLLinkExtractProcessor( );
+		SitemapProcessor( URLNormalizer *normalizer, Frontier *frontier, URLFilter *filter, URLSeen *urlSeen );
+		virtual ~SitemapProcessor( );
 		virtual void process( RewindInputStream *s );
 	
 	protected:
@@ -21,8 +19,6 @@ class HTMLLinkExtractProcessor : public Processor {
 		Frontier *m_frontier;
 		URLFilter *m_filter;
 		URLSeen *m_urlSeen;
-		streamhtmlparser::HtmlParser m_parser;
-		URL m_baseUrl;
 };
 
 DECLARE_MODULE_4( Processor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
author	Andreas Baumann <abaumann@yahoo.com>	2014-07-24 13:26:48 +0200
committer	Andreas Baumann <abaumann@yahoo.com>	2014-07-24 13:26:48 +0200
commit	53b6b08956e45975c95401b5e3db0bc6a65e4c59 (patch)
tree	d48dfa92b113db61d7dcca81e764198bd6902129 /src
parent	aa7fa3ef71b4e9193088b67c9b34448c00a8f949 (diff)
download	crawler-53b6b08956e45975c95401b5e3db0bc6a65e4c59.tar.gz crawler-53b6b08956e45975c95401b5e3db0bc6a65e4c59.tar.bz2