diff options
-rw-r--r-- | README.3rdPARTY | 3 | ||||
-rw-r--r-- | src/GNUmakefile | 6 | ||||
-rw-r--r-- | src/HTMLLinkExtractProcessor.cpp | 35 | ||||
-rw-r--r-- | src/HTMLLinkExtractProcessor.hpp | 3 | ||||
-rw-r--r-- | src/LibFetchRewindInputStream.cpp | 2 | ||||
-rw-r--r-- | src/MemoryFrontier.hpp | 3 | ||||
-rw-r--r-- | src/RewindInputStream.hpp | 15 | ||||
-rw-r--r-- | src/URL.hpp | 4 | ||||
-rw-r--r-- | src/crawlingwolf.cpp | 1 | ||||
-rw-r--r-- | streamhtmlparser/GNUmakefile | 2 | ||||
-rw-r--r-- | streamhtmlparser/htmlparser.h (renamed from streamhtmlparser/include/htmlparser.h) | 0 | ||||
-rw-r--r-- | streamhtmlparser/htmlparser_cpp.h (renamed from streamhtmlparser/include/htmlparser_cpp.h) | 0 | ||||
-rw-r--r-- | streamhtmlparser/jsparser.h (renamed from streamhtmlparser/include/jsparser.h) | 0 | ||||
-rw-r--r-- | streamhtmlparser/statemachine.h (renamed from streamhtmlparser/include/statemachine.h) | 0 | ||||
-rw-r--r-- | tests/streamhtmlparser/GNUmakefile | 2 |
15 files changed, 61 insertions, 15 deletions
diff --git a/README.3rdPARTY b/README.3rdPARTY index 11e5abd..15d237c 100644 --- a/README.3rdPARTY +++ b/README.3rdPARTY @@ -19,8 +19,7 @@ Copyright (c) 2006, Google Inc. http://code.google.com/p/streamhtmlparser/ - removed all python stuff, autoconf and automake/libtool -- slighlty rearanged directory structure (public header files - in 'include', library sources in main subdir) +- slighlty rearanged directory structure (all files in one directory) - eliminated some unused parameter warnings - removed some C++-ism in C-code (like empty function parameter lists) - made local functions actually 'static', e. g.: diff --git a/src/GNUmakefile b/src/GNUmakefile index b7d8651..8108a0c 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -9,10 +9,12 @@ INCLUDE_CPPFLAGS = \ INCLUDE_LDFLAGS = \ INCLUDE_DIRS = \ - -I$(TOPDIR)/libfetch + -I$(TOPDIR)/libfetch \ + -I$(TOPDIR)/streamhtmlparser INCLUDE_LIBS = \ - $(TOPDIR)/libfetch/libfetch.a + $(TOPDIR)/libfetch/libfetch.a \ + $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a # openssl ifeq ($(WITH_SSL),1) diff --git a/src/HTMLLinkExtractProcessor.cpp b/src/HTMLLinkExtractProcessor.cpp index d982923..ff22562 100644 --- a/src/HTMLLinkExtractProcessor.cpp +++ b/src/HTMLLinkExtractProcessor.cpp @@ -1,11 +1,14 @@ #include "HTMLLinkExtractProcessor.hpp" +#include "Logger.hpp" #include <string> +#include <cstring> using namespace std; +using namespace streamhtmlparser; HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier ) - : m_frontier( frontier ) + : m_frontier( frontier ), m_parser( ) { } @@ -15,10 +18,32 @@ HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( ) void HTMLLinkExtractProcessor::process( RewindInputStream *s ) { - string line; - + string link; + char buf[1] = {0}; + bool in_link = false; + while( s->good( ) && !s->eof( ) ) { - getline( *s, line ); - cout << line << endl; + buf[0] = s->get( ); + m_parser.Parse( buf, 1 ); + + if( m_parser.state( ) == HtmlParser::STATE_VALUE && m_parser.tag( ) != NULL && m_parser.attribute( ) != NULL && m_parser.value( ) != NULL ) { + if( strcmp( m_parser.tag( ), "a" ) == 0 && strcmp( m_parser.attribute( ), "href" ) == 0 ) { + link = m_parser.value( ); + in_link = true; + } + } else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) { + string absLink( s->getBaseUrl( ).str( ) ); + absLink.append( link ); + m_frontier->addUrl( absLink ); + link.clear( ); + in_link = false; + } else if( m_parser.state( ) == HtmlParser::STATE_ERROR ) { + // TODO: proper error handling + cerr << endl << "ERROR at " << endl; + m_parser.Reset( ); + return; + } } + + m_parser.Reset( ); } diff --git a/src/HTMLLinkExtractProcessor.hpp b/src/HTMLLinkExtractProcessor.hpp index e1fa05c..2777521 100644 --- a/src/HTMLLinkExtractProcessor.hpp +++ b/src/HTMLLinkExtractProcessor.hpp @@ -4,6 +4,8 @@ #include "Processor.hpp" #include "Frontier.hpp" +#include "htmlparser_cpp.h" + class HTMLLinkExtractProcessor : public Processor { public: HTMLLinkExtractProcessor( Frontier *frontier ); @@ -12,6 +14,7 @@ class HTMLLinkExtractProcessor : public Processor { protected: Frontier *m_frontier; + streamhtmlparser::HtmlParser m_parser; }; #endif diff --git a/src/LibFetchRewindInputStream.cpp b/src/LibFetchRewindInputStream.cpp index 79d1b24..4e837c8 100644 --- a/src/LibFetchRewindInputStream.cpp +++ b/src/LibFetchRewindInputStream.cpp @@ -60,7 +60,7 @@ streambuf::int_type libfetch_buffer::underflow( ) } LibFetchRewindInputStream::LibFetchRewindInputStream( const URL &url ) - : m_io( 0 ), m_buf( 0 ) + : RewindInputStream( url ), m_io( 0 ), m_buf( 0 ) { m_io = fetchGetURL( url.str( ).c_str( ), "" ); if( m_io == NULL ) { diff --git a/src/MemoryFrontier.hpp b/src/MemoryFrontier.hpp index d9d7647..46f8367 100644 --- a/src/MemoryFrontier.hpp +++ b/src/MemoryFrontier.hpp @@ -2,6 +2,8 @@ #define __MEMORY_FRONTIER_H #include "Frontier.hpp" +#include "Logger.hpp" + #include <queue> class MemoryFrontier : public Frontier { @@ -19,6 +21,7 @@ class MemoryFrontier : public Frontier { } void addUrl( const URL &url ) { + LOG( logINFO ) << "Adding to frontier " << url; m_urls.push( url ); } diff --git a/src/RewindInputStream.hpp b/src/RewindInputStream.hpp index 4a66ebd..9daafe4 100644 --- a/src/RewindInputStream.hpp +++ b/src/RewindInputStream.hpp @@ -1,9 +1,24 @@ #ifndef __REWIND_INPUT_STREAM_H #define __REWIND_INPUT_STREAM_H +#include "URL.hpp" + #include <iostream> class RewindInputStream : public std::istream { + public: + RewindInputStream( const URL &url ) + : m_baseUrl( url ) + { + } + + const URL getBaseUrl( ) const + { + return m_baseUrl; + } + + private: + URL m_baseUrl; }; #endif diff --git a/src/URL.hpp b/src/URL.hpp index 49a0a6e..b7c72c8 100644 --- a/src/URL.hpp +++ b/src/URL.hpp @@ -43,14 +43,14 @@ class URL { } template< typename CharT, typename TraitsT > friend - basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, URL& u ); + basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u ); protected: string m_url; }; template< typename CharT, typename TraitsT > -inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, URL& u ) { +inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u ) { s << u.m_url; return s; } diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index 0dca3f2..e96a855 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -2,7 +2,6 @@ #include "MemoryFrontier.hpp" #include "MD5Deduper.hpp" #include "HTMLLinkExtractProcessor.hpp" -#include "Logger.hpp" int main( void ) { diff --git a/streamhtmlparser/GNUmakefile b/streamhtmlparser/GNUmakefile index 100d8b2..ea5380d 100644 --- a/streamhtmlparser/GNUmakefile +++ b/streamhtmlparser/GNUmakefile @@ -9,7 +9,7 @@ INCLUDE_CFLAGS = INCLUDE_LDFLAGS = \ INCLUDE_DIRS = \ - -Iinclude + -I. INCLUDE_LIBS = diff --git a/streamhtmlparser/include/htmlparser.h b/streamhtmlparser/htmlparser.h index 58db4a5..58db4a5 100644 --- a/streamhtmlparser/include/htmlparser.h +++ b/streamhtmlparser/htmlparser.h diff --git a/streamhtmlparser/include/htmlparser_cpp.h b/streamhtmlparser/htmlparser_cpp.h index 3802233..3802233 100644 --- a/streamhtmlparser/include/htmlparser_cpp.h +++ b/streamhtmlparser/htmlparser_cpp.h diff --git a/streamhtmlparser/include/jsparser.h b/streamhtmlparser/jsparser.h index 4077aa4..4077aa4 100644 --- a/streamhtmlparser/include/jsparser.h +++ b/streamhtmlparser/jsparser.h diff --git a/streamhtmlparser/include/statemachine.h b/streamhtmlparser/statemachine.h index a05ffe7..a05ffe7 100644 --- a/streamhtmlparser/include/statemachine.h +++ b/streamhtmlparser/statemachine.h diff --git a/tests/streamhtmlparser/GNUmakefile b/tests/streamhtmlparser/GNUmakefile index de0c9a9..ad99173 100644 --- a/tests/streamhtmlparser/GNUmakefile +++ b/tests/streamhtmlparser/GNUmakefile @@ -3,7 +3,7 @@ TOPDIR = ../.. SUBDIRS = INCLUDE_DIRS = \ - -I$(TOPDIR)/streamhtmlparser/include + -I$(TOPDIR)/streamhtmlparser INCLUDE_LDFLAGS = |