diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-07-18 16:17:57 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-07-18 16:17:57 +0200 |
commit | 0fe31b20938b880db13cbb0a0b7e416d7ca15138 (patch) | |
tree | 1309d1564d4848b192fafa4b9c410879b55286de /src | |
parent | f49c6fe1f7f6cbb3fc7f9ea17cbf47e00d07aa4f (diff) | |
download | crawler-0fe31b20938b880db13cbb0a0b7e416d7ca15138.tar.gz crawler-0fe31b20938b880db13cbb0a0b7e416d7ca15138.tar.bz2 |
added URLSeen component
Diffstat (limited to 'src')
-rw-r--r-- | src/DomainURLFilter.cpp | 7 | ||||
-rw-r--r-- | src/GNUmakefile | 3 | ||||
-rw-r--r-- | src/HTMLLinkExtractProcessor.cpp | 22 | ||||
-rw-r--r-- | src/HTMLLinkExtractProcessor.hpp | 4 | ||||
-rw-r--r-- | src/MemoryFrontier.hpp | 2 | ||||
-rw-r--r-- | src/MemoryURLSeen.cpp | 22 | ||||
-rw-r--r-- | src/MemoryURLSeen.hpp | 18 | ||||
-rw-r--r-- | src/ProtocolURLFilter.cpp | 7 | ||||
-rw-r--r-- | src/URLSeen.hpp | 4 | ||||
-rw-r--r-- | src/crawlingwolf.cpp | 5 |
10 files changed, 75 insertions, 19 deletions
diff --git a/src/DomainURLFilter.cpp b/src/DomainURLFilter.cpp index 40d7804..7234f34 100644 --- a/src/DomainURLFilter.cpp +++ b/src/DomainURLFilter.cpp @@ -11,8 +11,11 @@ bool DomainURLFilter::filter( const URL &url ) string domain = url.domain( ); bool res = ( m_domains.find( domain ) != m_domains.end( ) ); - LOG( logNOTICE ) << "Checking for domain '" - << domain << "' in '" << url << "': " << res; + LOG( logDEBUG ) << "Checking for domain '" << domain << "' in '" << url << "'"; + + LOG( logINFO ) << ( res ? "Including " : "Excluding " ) + << "'" << url << "' " + << "for domain '" << domain << "'"; return res; } diff --git a/src/GNUmakefile b/src/GNUmakefile index 7ef583b..823c7be 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -36,7 +36,8 @@ CPP_OBJS = \ HTMLLinkExtractProcessor.o \ ProtocolURLFilter.o \ DomainURLFilter.o \ - ChainURLFilter.o + ChainURLFilter.o \ + MemoryURLSeen.o CPP_BINS = \ crawlingwolf$(EXE) diff --git a/src/HTMLLinkExtractProcessor.cpp b/src/HTMLLinkExtractProcessor.cpp index b71e88a..e9bb011 100644 --- a/src/HTMLLinkExtractProcessor.cpp +++ b/src/HTMLLinkExtractProcessor.cpp @@ -7,8 +7,8 @@ using namespace std; using namespace streamhtmlparser; -HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter ) - : m_frontier( frontier ), m_filter( filter ), m_parser( ) +HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ) + : m_frontier( frontier ), m_filter( filter ), m_urlSeen( urlSeen ), m_parser( ) { } @@ -45,18 +45,22 @@ void HTMLLinkExtractProcessor::process( RewindInputStream *s ) in_link = true; } } else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) { + string absoluteLink; + if( link.substr( 0, 7 ) == "http://" || link.substr( 0, 8 ) == "https://" ) { - if( m_filter->filter( link ) ) { - m_frontier->addUrl( link ); - } + absoluteLink = link; } else { - string absLink( s->getBaseUrl( ).str( ) ); - absLink.append( link ); - if( m_filter->filter( absLink ) ) { - m_frontier->addUrl( absLink ); + absoluteLink = s->getBaseUrl( ).str( ); + absoluteLink.append( link ); + } + + if( m_filter->filter( absoluteLink ) ) { + if( !m_urlSeen->seen( absoluteLink ) ) { + m_frontier->addUrl( absoluteLink ); } } + link.clear( ); in_link = false; } else if( m_parser.state( ) == HtmlParser::STATE_ERROR ) { diff --git a/src/HTMLLinkExtractProcessor.hpp b/src/HTMLLinkExtractProcessor.hpp index 9d2c579..eb229ef 100644 --- a/src/HTMLLinkExtractProcessor.hpp +++ b/src/HTMLLinkExtractProcessor.hpp @@ -4,18 +4,20 @@ #include "Processor.hpp" #include "Frontier.hpp" #include "URLFilter.hpp" +#include "URLSeen.hpp" #include "htmlparser_cpp.h" class HTMLLinkExtractProcessor : public Processor { public: - HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter ); + HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter, URLSeen *urlSeen ); virtual ~HTMLLinkExtractProcessor( ); virtual void process( RewindInputStream *s ); protected: Frontier *m_frontier; URLFilter *m_filter; + URLSeen *m_urlSeen; streamhtmlparser::HtmlParser m_parser; }; diff --git a/src/MemoryFrontier.hpp b/src/MemoryFrontier.hpp index 46f8367..9e0139d 100644 --- a/src/MemoryFrontier.hpp +++ b/src/MemoryFrontier.hpp @@ -21,7 +21,7 @@ class MemoryFrontier : public Frontier { } void addUrl( const URL &url ) { - LOG( logINFO ) << "Adding to frontier " << url; + LOG( logDEBUG ) << "Adding to frontier " << url; m_urls.push( url ); } diff --git a/src/MemoryURLSeen.cpp b/src/MemoryURLSeen.cpp new file mode 100644 index 0000000..3d14aa2 --- /dev/null +++ b/src/MemoryURLSeen.cpp @@ -0,0 +1,22 @@ +#include "MemoryURLSeen.hpp" +#include "Logger.hpp" + +MemoryURLSeen::MemoryURLSeen( ) +{ +} + +bool MemoryURLSeen::seen( const URL url ) +{ + bool hasSeen = false; + + if( m_urls.find( url ) != m_urls.end( ) ) { + hasSeen = true; + } + + LOG( logDEBUG ) << "Checking if URL '" << url << "' has been seen before" + << ( hasSeen ? ", already seen" : ", no, is new" ); + + m_urls.insert( url ); + + return hasSeen; +} diff --git a/src/MemoryURLSeen.hpp b/src/MemoryURLSeen.hpp new file mode 100644 index 0000000..6e6ccbd --- /dev/null +++ b/src/MemoryURLSeen.hpp @@ -0,0 +1,18 @@ +#ifndef __MEMORY_URLSEEN_H +#define __MEMORY_URLSEEN_H + +#include "URLSeen.hpp" + +#include <set> + +class MemoryURLSeen : public URLSeen { + public: + MemoryURLSeen( ); + + virtual bool seen( const URL url ); + + protected: + set<URL> m_urls; +}; + +#endif diff --git a/src/ProtocolURLFilter.cpp b/src/ProtocolURLFilter.cpp index 2e7ed2b..2566a8c 100644 --- a/src/ProtocolURLFilter.cpp +++ b/src/ProtocolURLFilter.cpp @@ -11,8 +11,11 @@ bool ProtocolURLFilter::filter( const URL &url ) string protocol = url.protocol( ); bool res = ( m_protocols.find( protocol ) != m_protocols.end( ) ); - LOG( logNOTICE ) << "Checking for protocol '" - << protocol << "' in '" << url << "': " << res; + LOG( logDEBUG ) << "Checking for protocol '" << protocol << "' in '" << url << "'"; + LOG( logINFO ) << ( res ? "Including " : "Excluding " ) + << "'" << url << "' " + << "for protocol '" << protocol << "'"; + return res; } diff --git a/src/URLSeen.hpp b/src/URLSeen.hpp index b07cac8..742c863 100644 --- a/src/URLSeen.hpp +++ b/src/URLSeen.hpp @@ -2,11 +2,11 @@ #define __URLSEEN_H #include "URL.hpp" -#include "RewindInputStream.hpp" class URLSeen { public: - virtual bool seen( const URL &url, RewindInputStream &s ) = 0; + virtual ~URLSeen( ) { }; + virtual bool seen( const URL url ) = 0; }; #endif diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index 2f4e067..ee002f4 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -5,6 +5,7 @@ #include "ChainURLFilter.hpp" #include "ProtocolURLFilter.hpp" #include "DomainURLFilter.hpp" +#include "MemoryURLSeen.hpp" #include <set> @@ -15,6 +16,7 @@ int main( void ) Frontier *frontier = new MemoryFrontier( ); Fetcher *fetcher = new LibFetchFetcher( ); Deduper *deduper = new MD5Deduper( ); + URLSeen *urlSeen = new MemoryURLSeen( ); set<string> protocols; protocols.insert( "http" ); @@ -27,7 +29,7 @@ int main( void ) ChainURLFilter filters( &protocolFilter, &domainFilter ); - Processor *processor = new HTMLLinkExtractProcessor( frontier, &filters ); + Processor *processor = new HTMLLinkExtractProcessor( frontier, &filters, urlSeen ); LOG( logNOTICE ) << "Crawler started.."; @@ -50,6 +52,7 @@ int main( void ) } delete processor; + delete urlSeen; delete deduper; delete fetcher; delete frontier; |