summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- README.3rdPARTY 3
-rw-r--r-- src/GNUmakefile 6
-rw-r--r-- src/HTMLLinkExtractProcessor.cpp 35
-rw-r--r-- src/HTMLLinkExtractProcessor.hpp 3
-rw-r--r-- src/LibFetchRewindInputStream.cpp 2
-rw-r--r-- src/MemoryFrontier.hpp 3
-rw-r--r-- src/RewindInputStream.hpp 15
-rw-r--r-- src/URL.hpp 4
-rw-r--r-- src/crawlingwolf.cpp 1
-rw-r--r-- streamhtmlparser/GNUmakefile 2
-rw-r--r-- streamhtmlparser/htmlparser.h (renamed from streamhtmlparser/include/htmlparser.h) 0
-rw-r--r-- streamhtmlparser/htmlparser_cpp.h (renamed from streamhtmlparser/include/htmlparser_cpp.h) 0
-rw-r--r-- streamhtmlparser/jsparser.h (renamed from streamhtmlparser/include/jsparser.h) 0
-rw-r--r-- streamhtmlparser/statemachine.h (renamed from streamhtmlparser/include/statemachine.h) 0
-rw-r--r-- tests/streamhtmlparser/GNUmakefile 2
15 files changed, 61 insertions, 15 deletions
diff --git a/README.3rdPARTY b/README.3rdPARTY
index 11e5abd..15d237c 100644
--- a/README.3rdPARTY
+++ b/README.3rdPARTY
@@ -19,8 +19,7 @@ Copyright (c) 2006, Google Inc.
http://code.google.com/p/streamhtmlparser/
- removed all python stuff, autoconf and automake/libtool
-- slighlty rearanged directory structure (public header files
- in 'include', library sources in main subdir)
+- slighlty rearanged directory structure (all files in one directory)
- eliminated some unused parameter warnings
- removed some C++-ism in C-code (like empty function parameter lists)
- made local functions actually 'static', e. g.:
diff --git a/src/GNUmakefile b/src/GNUmakefile
index b7d8651..8108a0c 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -9,10 +9,12 @@ INCLUDE_CPPFLAGS = \
INCLUDE_LDFLAGS = \
INCLUDE_DIRS = \
- -I$(TOPDIR)/libfetch
+ -I$(TOPDIR)/libfetch \
+ -I$(TOPDIR)/streamhtmlparser
INCLUDE_LIBS = \
- $(TOPDIR)/libfetch/libfetch.a
+ $(TOPDIR)/libfetch/libfetch.a \
+ $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a
# openssl
ifeq ($(WITH_SSL),1)
diff --git a/src/HTMLLinkExtractProcessor.cpp b/src/HTMLLinkExtractProcessor.cpp
index d982923..ff22562 100644
--- a/src/HTMLLinkExtractProcessor.cpp
+++ b/src/HTMLLinkExtractProcessor.cpp
@@ -1,11 +1,14 @@
#include "HTMLLinkExtractProcessor.hpp"
+#include "Logger.hpp"
#include <string>
+#include <cstring>
using namespace std;
+using namespace streamhtmlparser;
HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier )
- : m_frontier( frontier )
+ : m_frontier( frontier ), m_parser( )
{
}
@@ -15,10 +18,32 @@ HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( )
void HTMLLinkExtractProcessor::process( RewindInputStream *s )
{
- string line;
-
+ string link;
+ char buf[1] = {0};
+ bool in_link = false;
+
while( s->good( ) && !s->eof( ) ) {
- getline( *s, line );
- cout << line << endl;
+ buf[0] = s->get( );
+ m_parser.Parse( buf, 1 );
+
+ if( m_parser.state( ) == HtmlParser::STATE_VALUE && m_parser.tag( ) != NULL && m_parser.attribute( ) != NULL && m_parser.value( ) != NULL ) {
+ if( strcmp( m_parser.tag( ), "a" ) == 0 && strcmp( m_parser.attribute( ), "href" ) == 0 ) {
+ link = m_parser.value( );
+ in_link = true;
+ }
+ } else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) {
+ string absLink( s->getBaseUrl( ).str( ) );
+ absLink.append( link );
+ m_frontier->addUrl( absLink );
+ link.clear( );
+ in_link = false;
+ } else if( m_parser.state( ) == HtmlParser::STATE_ERROR ) {
+ // TODO: proper error handling
+ cerr << endl << "ERROR at " << endl;
+ m_parser.Reset( );
+ return;
+ }
}
+
+ m_parser.Reset( );
}
diff --git a/src/HTMLLinkExtractProcessor.hpp b/src/HTMLLinkExtractProcessor.hpp
index e1fa05c..2777521 100644
--- a/src/HTMLLinkExtractProcessor.hpp
+++ b/src/HTMLLinkExtractProcessor.hpp
@@ -4,6 +4,8 @@
#include "Processor.hpp"
#include "Frontier.hpp"
+#include "htmlparser_cpp.h"
+
class HTMLLinkExtractProcessor : public Processor {
public:
HTMLLinkExtractProcessor( Frontier *frontier );
@@ -12,6 +14,7 @@ class HTMLLinkExtractProcessor : public Processor {
protected:
Frontier *m_frontier;
+ streamhtmlparser::HtmlParser m_parser;
};
#endif
diff --git a/src/LibFetchRewindInputStream.cpp b/src/LibFetchRewindInputStream.cpp
index 79d1b24..4e837c8 100644
--- a/src/LibFetchRewindInputStream.cpp
+++ b/src/LibFetchRewindInputStream.cpp
@@ -60,7 +60,7 @@ streambuf::int_type libfetch_buffer::underflow( )
}
LibFetchRewindInputStream::LibFetchRewindInputStream( const URL &url )
- : m_io( 0 ), m_buf( 0 )
+ : RewindInputStream( url ), m_io( 0 ), m_buf( 0 )
{
m_io = fetchGetURL( url.str( ).c_str( ), "" );
if( m_io == NULL ) {
diff --git a/src/MemoryFrontier.hpp b/src/MemoryFrontier.hpp
index d9d7647..46f8367 100644
--- a/src/MemoryFrontier.hpp
+++ b/src/MemoryFrontier.hpp
@@ -2,6 +2,8 @@
#define __MEMORY_FRONTIER_H
#include "Frontier.hpp"
+#include "Logger.hpp"
+
#include <queue>
class MemoryFrontier : public Frontier {
@@ -19,6 +21,7 @@ class MemoryFrontier : public Frontier {
}
void addUrl( const URL &url ) {
+ LOG( logINFO ) << "Adding to frontier " << url;
m_urls.push( url );
}
diff --git a/src/RewindInputStream.hpp b/src/RewindInputStream.hpp
index 4a66ebd..9daafe4 100644
--- a/src/RewindInputStream.hpp
+++ b/src/RewindInputStream.hpp
@@ -1,9 +1,24 @@
#ifndef __REWIND_INPUT_STREAM_H
#define __REWIND_INPUT_STREAM_H
+#include "URL.hpp"
+
#include <iostream>
class RewindInputStream : public std::istream {
+ public:
+ RewindInputStream( const URL &url )
+ : m_baseUrl( url )
+ {
+ }
+
+ const URL getBaseUrl( ) const
+ {
+ return m_baseUrl;
+ }
+
+ private:
+ URL m_baseUrl;
};
#endif
diff --git a/src/URL.hpp b/src/URL.hpp
index 49a0a6e..b7c72c8 100644
--- a/src/URL.hpp
+++ b/src/URL.hpp
@@ -43,14 +43,14 @@ class URL {
}
template< typename CharT, typename TraitsT > friend
- basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, URL& u );
+ basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u );
protected:
string m_url;
};
template< typename CharT, typename TraitsT >
-inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, URL& u ) {
+inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u ) {
s << u.m_url;
return s;
}
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index 0dca3f2..e96a855 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -2,7 +2,6 @@
#include "MemoryFrontier.hpp"
#include "MD5Deduper.hpp"
#include "HTMLLinkExtractProcessor.hpp"
-#include "Logger.hpp"
int main( void )
{
diff --git a/streamhtmlparser/GNUmakefile b/streamhtmlparser/GNUmakefile
index 100d8b2..ea5380d 100644
--- a/streamhtmlparser/GNUmakefile
+++ b/streamhtmlparser/GNUmakefile
@@ -9,7 +9,7 @@ INCLUDE_CFLAGS =
INCLUDE_LDFLAGS = \
INCLUDE_DIRS = \
- -Iinclude
+ -I.
INCLUDE_LIBS =
diff --git a/streamhtmlparser/include/htmlparser.h b/streamhtmlparser/htmlparser.h
index 58db4a5..58db4a5 100644
--- a/streamhtmlparser/include/htmlparser.h
+++ b/streamhtmlparser/htmlparser.h
diff --git a/streamhtmlparser/include/htmlparser_cpp.h b/streamhtmlparser/htmlparser_cpp.h
index 3802233..3802233 100644
--- a/streamhtmlparser/include/htmlparser_cpp.h
+++ b/streamhtmlparser/htmlparser_cpp.h
diff --git a/streamhtmlparser/include/jsparser.h b/streamhtmlparser/jsparser.h
index 4077aa4..4077aa4 100644
--- a/streamhtmlparser/include/jsparser.h
+++ b/streamhtmlparser/jsparser.h
diff --git a/streamhtmlparser/include/statemachine.h b/streamhtmlparser/statemachine.h
index a05ffe7..a05ffe7 100644
--- a/streamhtmlparser/include/statemachine.h
+++ b/streamhtmlparser/statemachine.h
diff --git a/tests/streamhtmlparser/GNUmakefile b/tests/streamhtmlparser/GNUmakefile
index de0c9a9..ad99173 100644
--- a/tests/streamhtmlparser/GNUmakefile
+++ b/tests/streamhtmlparser/GNUmakefile
@@ -3,7 +3,7 @@ TOPDIR = ../..
SUBDIRS =
INCLUDE_DIRS = \
- -I$(TOPDIR)/streamhtmlparser/include
+ -I$(TOPDIR)/streamhtmlparser
INCLUDE_LDFLAGS =