author     Andreas Baumann <abaumann@yahoo.com>   2012-07-14 17:16:21 +0200
committer  Andreas Baumann <abaumann@yahoo.com>   2012-07-14 17:16:21 +0200
commit     54cce110784d33d658b5f78286a98bee244a9eeb
tree       9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /src
parent     fcb682cb1955d362390665330fdf476cab7dc10b
added streamhtmlparser
Diffstat (limited to 'src')
-rw-r--r--  src/GNUmakefile                    3
-rw-r--r--  src/HTMLLinkExtractProcessor.cpp  24
-rw-r--r--  src/HTMLLinkExtractProcessor.hpp  17
-rw-r--r--  src/Processor.hpp                  3
-rw-r--r--  src/crawlingwolf.cpp               7
5 files changed, 52 insertions, 2 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile
index d9c8234..b7d8651 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -30,7 +30,8 @@ CPP_OBJS = \
LibFetchFetcher.o \
LibFetchRewindInputStream.o \
Frontier.o \
- Deduper.o
+ Deduper.o \
+ HTMLLinkExtractProcessor.o
CPP_BINS = \
crawlingwolf$(EXE)
diff --git a/src/HTMLLinkExtractProcessor.cpp b/src/HTMLLinkExtractProcessor.cpp
new file mode 100644
index 0000000..d982923
--- /dev/null
+++ b/src/HTMLLinkExtractProcessor.cpp
@@ -0,0 +1,24 @@
+#include "HTMLLinkExtractProcessor.hpp"
+
+#include <string>
+
+using namespace std;
+
+HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier )
+ : m_frontier( frontier )
+{
+}
+
+HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( )
+{
+}
+
+void HTMLLinkExtractProcessor::process( RewindInputStream *s )
+{
+ string line;
+
+ while( s->good( ) && !s->eof( ) ) {
+ getline( *s, line );
+ cout << line << endl;
+ }
+}
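
The commit message mentions streamhtmlparser, but the new process( ) above is still a stub: it echoes the fetched stream line by line to std::cout and never touches the injected Frontier. Below is a minimal sketch of what the link extraction could grow into, using a naive href="..." scan rather than the streamhtmlparser API (which this diff does not show), and including <iostream> for std::cout. Frontier::addUrl( ) and a URL( std::string ) constructor are assumptions, not interfaces confirmed by this repository.

// Sketch only: naive href extraction, not the streamhtmlparser-based version.
// Frontier::addUrl( ) and URL( std::string ) are assumed and may not match
// the real interfaces in Frontier.hpp.

#include "HTMLLinkExtractProcessor.hpp"

#include <iostream>
#include <string>

void HTMLLinkExtractProcessor::process( RewindInputStream *s )
{
        std::string line;

        while( s->good( ) && !s->eof( ) ) {
                getline( *s, line );

                // scan the line for href="..." attributes
                std::string::size_type pos = 0;
                while( ( pos = line.find( "href=\"", pos ) ) != std::string::npos ) {
                        pos += 6;
                        std::string::size_type end = line.find( '"', pos );
                        if( end == std::string::npos ) break;
                        std::string link = line.substr( pos, end - pos );
                        std::cout << "found link " << link << std::endl;
                        // m_frontier->addUrl( URL( link ) );   // assumed frontier method
                        pos = end + 1;
                }
        }
}

A line-by-line scan like this misses links that span lines or use unquoted attributes, which is presumably why a stateful HTML parser is being pulled into the tree.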
diff --git a/src/HTMLLinkExtractProcessor.hpp b/src/HTMLLinkExtractProcessor.hpp
new file mode 100644
index 0000000..e1fa05c
--- /dev/null
+++ b/src/HTMLLinkExtractProcessor.hpp
@@ -0,0 +1,17 @@
+#ifndef __HTML_LINK_EXTRACT_PROCESSOR_H
+#define __HTML_LINK_EXTRACT_PROCESSOR_H
+
+#include "Processor.hpp"
+#include "Frontier.hpp"
+
+class HTMLLinkExtractProcessor : public Processor {
+ public:
+ HTMLLinkExtractProcessor( Frontier *frontier );
+ virtual ~HTMLLinkExtractProcessor( );
+ virtual void process( RewindInputStream *s );
+
+ protected:
+ Frontier *m_frontier;
+};
+
+#endif
diff --git a/src/Processor.hpp b/src/Processor.hpp
index 6cbf667..b796e65 100644
--- a/src/Processor.hpp
+++ b/src/Processor.hpp
@@ -5,7 +5,8 @@
class Processor {
public:
- virtual process( RewindInputStream &s ) = 0;
+ virtual ~Processor( ) {};
+ virtual void process( RewindInputStream *s ) = 0;
};
#endif
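
The Processor change does two things: the pure virtual process( ) gains its missing void return type and takes the RewindInputStream pointer the implementations actually receive, and the base class gains a virtual destructor. The destructor matters because crawlingwolf.cpp below deletes the processor through a Processor pointer. A minimal illustration follows; destroy_example( ) is a made-up helper, not code from the repository.

#include "HTMLLinkExtractProcessor.hpp"

// Illustration only: deleting through the base pointer is exactly what
// main( ) does with 'delete processor;' further down.
static void destroy_example( Frontier *frontier )
{
        Processor *p = new HTMLLinkExtractProcessor( frontier );
        delete p;   // runs ~HTMLLinkExtractProcessor( ) only because ~Processor( ) is virtual
}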
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index dbf5c55..0dca3f2 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -1,6 +1,7 @@
#include "LibFetchFetcher.hpp"
#include "MemoryFrontier.hpp"
#include "MD5Deduper.hpp"
+#include "HTMLLinkExtractProcessor.hpp"
#include "Logger.hpp"
int main( void )
@@ -8,6 +9,7 @@ int main( void )
Frontier *frontier = new MemoryFrontier( );
Fetcher *fetcher = new LibFetchFetcher( );
Deduper *deduper = new MD5Deduper( );
+ Processor *processor = new HTMLLinkExtractProcessor( frontier );
LOG( logNOTICE ) << "Crawler started..";
@@ -17,14 +19,19 @@ int main( void )
while( ( url = frontier->getNextUrl( ) ) != URL::Null ) {
LOG( logINFO ) << "Got URL " << url;
RewindInputStream *s = fetcher->fetch( url );
+
if( deduper->contentSeen( url, s ) ) {
LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen";
delete s;
continue;
}
+
+ processor->process( s );
+
delete s;
}
+ delete processor;
delete deduper;
delete fetcher;
delete frontier;
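
MD5Deduper::contentSeen( ) presumably has to read the stream in order to hash it, so by the time process( ) runs the stream has already been consumed once; being able to replay it is presumably what the RewindInputStream abstraction is for. Below is a sketch of the per-URL handling with an explicit reset; handle( ) is a made-up helper that mirrors the loop body in main( ), and RewindInputStream::rewind( ) is an assumed method that this diff does not confirm.

#include "LibFetchFetcher.hpp"
#include "MD5Deduper.hpp"
#include "HTMLLinkExtractProcessor.hpp"

// Sketch only: mirrors the crawl loop body; rewind( ) is an assumed API.
static void handle( const URL &url, Fetcher *fetcher, Deduper *deduper, Processor *processor )
{
        RewindInputStream *s = fetcher->fetch( url );

        if( !deduper->contentSeen( url, s ) ) {
                s->rewind( );             // assumed: reset the stream the deduper just consumed
                processor->process( s );  // the link extractor now sees the whole document
        }

        delete s;
}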