summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- src/DomainURLFilter.cpp | 7
-rw-r--r-- src/GNUmakefile | 3
-rw-r--r-- src/HTMLLinkExtractProcessor.cpp | 22
-rw-r--r-- src/HTMLLinkExtractProcessor.hpp | 4
-rw-r--r-- src/MemoryFrontier.hpp | 2
-rw-r--r-- src/MemoryURLSeen.cpp | 22
-rw-r--r-- src/MemoryURLSeen.hpp | 18
-rw-r--r-- src/ProtocolURLFilter.cpp | 7
-rw-r--r-- src/URLSeen.hpp | 4
-rw-r--r-- src/crawlingwolf.cpp | 5
10 files changed, 75 insertions, 19 deletions
diff --git a/src/DomainURLFilter.cpp b/src/DomainURLFilter.cpp
index 40d7804..7234f34 100644
--- a/src/DomainURLFilter.cpp
+++ b/src/DomainURLFilter.cpp
@@ -11,8 +11,11 @@ bool DomainURLFilter::filter( const URL &url )
string domain = url.domain( );
bool res = ( m_domains.find( domain ) != m_domains.end( ) );
- LOG( logNOTICE ) << "Checking for domain '"
- << domain << "' in '" << url << "': " << res;
+ LOG( logDEBUG ) << "Checking for domain '" << domain << "' in '" << url << "'";
+
+ LOG( logINFO ) << ( res ? "Including " : "Excluding " )
+ << "'" << url << "' "
+ << "for domain '" << domain << "'";
return res;
}
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 7ef583b..823c7be 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -36,7 +36,8 @@ CPP_OBJS = \
HTMLLinkExtractProcessor.o \
ProtocolURLFilter.o \
DomainURLFilter.o \
- ChainURLFilter.o
+ ChainURLFilter.o \
+ MemoryURLSeen.o
CPP_BINS = \
crawlingwolf$(EXE)
diff --git a/src/HTMLLinkExtractProcessor.cpp b/src/HTMLLinkExtractProcessor.cpp
index b71e88a..e9bb011 100644
--- a/src/HTMLLinkExtractProcessor.cpp
+++ b/src/HTMLLinkExtractProcessor.cpp
@@ -7,8 +7,8 @@
using namespace std;
using namespace streamhtmlparser;
-HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter )
- : m_frontier( frontier ), m_filter( filter ), m_parser( )
+HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter, URLSeen *urlSeen )
+ : m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen ), m_parser( )
{
}
@@ -45,18 +45,22 @@ void HTMLLinkExtractProcessor::process( RewindInputStream *s )
in_link = true;
}
} else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) {
+ string absoluteLink;
+
if( link.substr( 0, 7 ) == "http://" ||
link.substr( 0, 8 ) == "https://" ) {
- if( m_filter->filter( link ) ) {
- m_frontier->addUrl( link );
- }
+ absoluteLink = link;
} else {
- string absLink( s->getBaseUrl( ).str( ) );
- absLink.append( link );
- if( m_filter->filter( absLink ) ) {
- m_frontier->addUrl( absLink );
+ absoluteLink = s->getBaseUrl( ).str( );
+ absoluteLink.append( link );
+ }
+
+ if( m_filter->filter( absoluteLink ) ) {
+ if( !m_urlSeen->seen( absoluteLink ) ) {
+ m_frontier->addUrl( absoluteLink );
}
}
+
link.clear( );
in_link = false;
} else if( m_parser.state( ) == HtmlParser::STATE_ERROR ) {
diff --git a/src/HTMLLinkExtractProcessor.hpp b/src/HTMLLinkExtractProcessor.hpp
index 9d2c579..eb229ef 100644
--- a/src/HTMLLinkExtractProcessor.hpp
+++ b/src/HTMLLinkExtractProcessor.hpp
@@ -4,18 +4,20 @@
#include "Processor.hpp"
#include "Frontier.hpp"
#include "URLFilter.hpp"
+#include "URLSeen.hpp"
#include "htmlparser_cpp.h"
class HTMLLinkExtractProcessor : public Processor {
public:
- HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter );
+ HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter, URLSeen *urlSeen );
virtual ~HTMLLinkExtractProcessor( );
virtual void process( RewindInputStream *s );
protected:
Frontier *m_frontier;
URLFilter *m_filter;
+ URLSeen *m_urlSeen;
streamhtmlparser::HtmlParser m_parser;
};
diff --git a/src/MemoryFrontier.hpp b/src/MemoryFrontier.hpp
index 46f8367..9e0139d 100644
--- a/src/MemoryFrontier.hpp
+++ b/src/MemoryFrontier.hpp
@@ -21,7 +21,7 @@ class MemoryFrontier : public Frontier {
}
void addUrl( const URL &url ) {
- LOG( logINFO ) << "Adding to frontier " << url;
+ LOG( logDEBUG ) << "Adding to frontier " << url;
m_urls.push( url );
}
diff --git a/src/MemoryURLSeen.cpp b/src/MemoryURLSeen.cpp
new file mode 100644
index 0000000..3d14aa2
--- /dev/null
+++ b/src/MemoryURLSeen.cpp
@@ -0,0 +1,22 @@
+#include "MemoryURLSeen.hpp"
+#include "Logger.hpp"
+
+MemoryURLSeen::MemoryURLSeen( )
+{
+}
+
+bool MemoryURLSeen::seen( const URL url )
+{
+ bool hasSeen = false;
+
+ if( m_urls.find( url ) != m_urls.end( ) ) {
+ hasSeen = true;
+ }
+
+ LOG( logDEBUG ) << "Checking if URL '" << url << "' has been seen before"
+ << ( hasSeen ? ", already seen" : ", no, is new" );
+
+ m_urls.insert( url );
+
+ return hasSeen;
+}
diff --git a/src/MemoryURLSeen.hpp b/src/MemoryURLSeen.hpp
new file mode 100644
index 0000000..6e6ccbd
--- /dev/null
+++ b/src/MemoryURLSeen.hpp
@@ -0,0 +1,18 @@
+#ifndef __MEMORY_URLSEEN_H
+#define __MEMORY_URLSEEN_H
+
+#include "URLSeen.hpp"
+
+#include <set>
+
+class MemoryURLSeen : public URLSeen {
+ public:
+ MemoryURLSeen( );
+
+ virtual bool seen( const URL url );
+
+ protected:
+ set<URL> m_urls;
+};
+
+#endif
diff --git a/src/ProtocolURLFilter.cpp b/src/ProtocolURLFilter.cpp
index 2e7ed2b..2566a8c 100644
--- a/src/ProtocolURLFilter.cpp
+++ b/src/ProtocolURLFilter.cpp
@@ -11,8 +11,11 @@ bool ProtocolURLFilter::filter( const URL &url )
string protocol = url.protocol( );
bool res = ( m_protocols.find( protocol ) != m_protocols.end( ) );
- LOG( logNOTICE ) << "Checking for protocol '"
- << protocol << "' in '" << url << "': " << res;
+ LOG( logDEBUG ) << "Checking for protocol '" << protocol << "' in '" << url << "'";
+ LOG( logINFO ) << ( res ? "Including " : "Excluding " )
+ << "'" << url << "' "
+ << "for protocol '" << protocol << "'";
+
return res;
}
diff --git a/src/URLSeen.hpp b/src/URLSeen.hpp
index b07cac8..742c863 100644
--- a/src/URLSeen.hpp
+++ b/src/URLSeen.hpp
@@ -2,11 +2,11 @@
#define __URLSEEN_H
#include "URL.hpp"
-#include "RewindInputStream.hpp"
class URLSeen {
public:
- virtual bool seen( const URL &url, RewindInputStream &s ) = 0;
+ virtual ~URLSeen( ) { };
+ virtual bool seen( const URL url ) = 0;
};
#endif
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index 2f4e067..ee002f4 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -5,6 +5,7 @@
#include "ChainURLFilter.hpp"
#include "ProtocolURLFilter.hpp"
#include "DomainURLFilter.hpp"
+#include "MemoryURLSeen.hpp"
#include <set>
@@ -15,6 +16,7 @@ int main( void )
Frontier *frontier = new MemoryFrontier( );
Fetcher *fetcher = new LibFetchFetcher( );
Deduper *deduper = new MD5Deduper( );
+ URLSeen *urlSeen = new MemoryURLSeen( );
set<string> protocols;
protocols.insert( "http" );
@@ -27,7 +29,7 @@ int main( void )
ChainURLFilter filters( &protocolFilter, &domainFilter );
- Processor *processor = new HTMLLinkExtractProcessor( frontier, &filters );
+ Processor *processor = new HTMLLinkExtractProcessor( frontier, &filters, urlSeen );
LOG( logNOTICE ) << "Crawler started..";
@@ -50,6 +52,7 @@ int main( void )
}
delete processor;
+ delete urlSeen;
delete deduper;
delete fetcher;
delete frontier;