diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-07-14 17:16:21 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-07-14 17:16:21 +0200 |
commit | 54cce110784d33d658b5f78286a98bee244a9eeb (patch) | |
tree | 9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /src | |
parent | fcb682cb1955d362390665330fdf476cab7dc10b (diff) | |
download | crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2 |
added streamhtmlparser
Diffstat (limited to 'src')
-rw-r--r-- | src/GNUmakefile | 3 | ||||
-rw-r--r-- | src/HTMLLinkExtractProcessor.cpp | 24 | ||||
-rw-r--r-- | src/HTMLLinkExtractProcessor.hpp | 17 | ||||
-rw-r--r-- | src/Processor.hpp | 3 | ||||
-rw-r--r-- | src/crawlingwolf.cpp | 7 |
5 files changed, 52 insertions, 2 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile index d9c8234..b7d8651 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -30,7 +30,8 @@ CPP_OBJS = \ LibFetchFetcher.o \ LibFetchRewindInputStream.o \ Frontier.o \ - Deduper.o + Deduper.o \ + HTMLLinkExtractProcessor.o CPP_BINS = \ crawlingwolf$(EXE) diff --git a/src/HTMLLinkExtractProcessor.cpp b/src/HTMLLinkExtractProcessor.cpp new file mode 100644 index 0000000..d982923 --- /dev/null +++ b/src/HTMLLinkExtractProcessor.cpp @@ -0,0 +1,24 @@ +#include "HTMLLinkExtractProcessor.hpp" + +#include <string> + +using namespace std; + +HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier ) + : m_frontier( frontier ) +{ +} + +HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( ) +{ +} + +void HTMLLinkExtractProcessor::process( RewindInputStream *s ) +{ + string line; + + while( s->good( ) && !s->eof( ) ) { + getline( *s, line ); + cout << line << endl; + } +} diff --git a/src/HTMLLinkExtractProcessor.hpp b/src/HTMLLinkExtractProcessor.hpp new file mode 100644 index 0000000..e1fa05c --- /dev/null +++ b/src/HTMLLinkExtractProcessor.hpp @@ -0,0 +1,17 @@ +#ifndef __HTML_LINK_EXTRACT_PROCESSOR_H +#define __HTML_LINK_EXTRACT_PROCESSOR_H + +#include "Processor.hpp" +#include "Frontier.hpp" + +class HTMLLinkExtractProcessor : public Processor { + public: + HTMLLinkExtractProcessor( Frontier *frontier ); + virtual ~HTMLLinkExtractProcessor( ); + virtual void process( RewindInputStream *s ); + + protected: + Frontier *m_frontier; +}; + +#endif diff --git a/src/Processor.hpp b/src/Processor.hpp index 6cbf667..b796e65 100644 --- a/src/Processor.hpp +++ b/src/Processor.hpp @@ -5,7 +5,8 @@ class Processor { public: - virtual process( RewindInputStream &s ) = 0; + virtual ~Processor( ) {}; + virtual void process( RewindInputStream *s ) = 0; }; #endif diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index dbf5c55..0dca3f2 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -1,6 +1,7 @@ #include "LibFetchFetcher.hpp" #include "MemoryFrontier.hpp" #include "MD5Deduper.hpp" +#include "HTMLLinkExtractProcessor.hpp" #include "Logger.hpp" int main( void ) @@ -8,6 +9,7 @@ int main( void ) Frontier *frontier = new MemoryFrontier( ); Fetcher *fetcher = new LibFetchFetcher( ); Deduper *deduper = new MD5Deduper( ); + Processor *processor = new HTMLLinkExtractProcessor( frontier ); LOG( logNOTICE ) << "Crawler started.."; @@ -17,14 +19,19 @@ int main( void ) while( ( url = frontier->getNextUrl( ) ) != URL::Null ) { LOG( logINFO ) << "Got URL " << url; RewindInputStream *s = fetcher->fetch( url ); + if( deduper->contentSeen( url, s ) ) { LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen"; delete s; continue; } + + processor->process( s ); + delete s; } + delete processor; delete deduper; delete fetcher; delete frontier; |