summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-07-15 19:51:01 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-07-15 19:51:01 +0200
commit69f32e239eafcfdc2392a8644c6de6ca6dbe83c1 (patch)
tree1f1776b2317df13f3a02069ca8b2ebbfb2ab2f80
parentbcb4f8f59bc1c522d2c677d8cffb4adbff45aa26 (diff)
downloadcrawler-69f32e239eafcfdc2392a8644c6de6ca6dbe83c1.tar.gz
crawler-69f32e239eafcfdc2392a8644c6de6ca6dbe83c1.tar.bz2
started to add URL filters
-rw-r--r--makefiles/gmake/help.mk8
-rw-r--r--src/ChainURLFilter.cpp38
-rw-r--r--src/ChainURLFilter.hpp22
-rw-r--r--src/DomainURLFilter.cpp11
-rw-r--r--src/DomainURLFilter.hpp19
-rw-r--r--src/GNUmakefile5
-rw-r--r--src/HTMLLinkExtractProcessor.cpp30
-rw-r--r--src/HTMLLinkExtractProcessor.hpp4
-rw-r--r--src/ProtocolURLFilter.cpp11
-rw-r--r--src/ProtocolURLFilter.hpp19
-rw-r--r--src/RewindInputStream.hpp5
-rw-r--r--src/URL.hpp20
-rw-r--r--src/URLFilter.hpp5
-rw-r--r--src/crawlingwolf.cpp23
14 files changed, 208 insertions, 12 deletions
diff --git a/makefiles/gmake/help.mk b/makefiles/gmake/help.mk
index 385d99f..2904c39 100644
--- a/makefiles/gmake/help.mk
+++ b/makefiles/gmake/help.mk
@@ -36,10 +36,16 @@ WITH_LOCAL_STREAMHTMLPARSER=1 use Google stream HTML 4 parser
WITH_LIBXML2=1 build the libxml2 parser
+scripting support:
+
+WITH_LUA=1 use Lua for configuration and scripting
+
Some more obscure options:
ENABLE_NLS=0 Don't build gettext NLS support (default is on)
Example:
make WITH_SSL=1 WITH_SQLITE3=1 WITH_PGSQL=1 \
- WITH_LOCAL_LIBFETCH=1 WITH_LIBXML2=1
+ WITH_LOCAL_LIBFETCH=1 WITH_LIBXML2=1 \
+ WITH_LUA=1
+
diff --git a/src/ChainURLFilter.cpp b/src/ChainURLFilter.cpp
new file mode 100644
index 0000000..e367c14
--- /dev/null
+++ b/src/ChainURLFilter.cpp
@@ -0,0 +1,38 @@
+#include "ChainURLFilter.hpp"
+
+ChainURLFilter::ChainURLFilter( )
+ : m_filters( )
+{
+}
+
+ChainURLFilter::ChainURLFilter( URLFilter *f1 )
+ : m_filters( )
+{
+ m_filters.push_back( f1 );
+}
+
+ChainURLFilter::ChainURLFilter( URLFilter *f1, URLFilter *f2 )
+ : m_filters( )
+{
+ m_filters.push_back( f1 );
+ m_filters.push_back( f2 );
+}
+
+ChainURLFilter::ChainURLFilter( URLFilter *f1, URLFilter *f2, URLFilter *f3 )
+ : m_filters( )
+{
+ m_filters.push_back( f1 );
+ m_filters.push_back( f2 );
+ m_filters.push_back( f3 );
+}
+
+bool ChainURLFilter::filter( const URL &url )
+{
+ std::list<URLFilter *>::const_iterator it;
+
+ for( it = m_filters.begin( ); it != m_filters.end( ); it++ ) {
+ if( !( (*it)->filter( url ) ) ) return false;
+ }
+
+ return true;
+}
diff --git a/src/ChainURLFilter.hpp b/src/ChainURLFilter.hpp
new file mode 100644
index 0000000..d6b2580
--- /dev/null
+++ b/src/ChainURLFilter.hpp
@@ -0,0 +1,22 @@
+#ifndef __CHAIN_URLFILTER_H
+#define __CHAIN_URLFILTER_H
+
+#include "URLFilter.hpp"
+
+#include <list>
+
+class ChainURLFilter : public URLFilter
+{
+ public:
+ ChainURLFilter( );
+ ChainURLFilter( URLFilter *f1 );
+ ChainURLFilter( URLFilter *f1, URLFilter *f2 );
+ ChainURLFilter( URLFilter *f1, URLFilter *f2, URLFilter *f3 );
+
+ virtual bool filter( const URL &url );
+
+ protected:
+ std::list<URLFilter *> m_filters;
+};
+
+#endif
diff --git a/src/DomainURLFilter.cpp b/src/DomainURLFilter.cpp
new file mode 100644
index 0000000..5f42de2
--- /dev/null
+++ b/src/DomainURLFilter.cpp
@@ -0,0 +1,11 @@
+#include "DomainURLFilter.hpp"
+
+DomainURLFilter::DomainURLFilter( const std::set<std::string> domains )
+ : m_domains( domains )
+{
+}
+
+bool DomainURLFilter::filter( const URL &url )
+{
+ return( m_domains.find( url.domain( ) ) != m_domains.end( ) );
+}
diff --git a/src/DomainURLFilter.hpp b/src/DomainURLFilter.hpp
new file mode 100644
index 0000000..76dbc73
--- /dev/null
+++ b/src/DomainURLFilter.hpp
@@ -0,0 +1,19 @@
+#ifndef __DOMAIN_URLFILTER_H
+#define __DOMAIN_URLFILTER_H
+
+#include "URLFilter.hpp"
+
+#include <set>
+
+class DomainURLFilter : public URLFilter
+{
+ public:
+ DomainURLFilter( const std::set<std::string> domains );
+
+ virtual bool filter( const URL &url );
+
+ protected:
+ std::set<std::string> m_domains;
+};
+
+#endif
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 8108a0c..7ef583b 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -33,7 +33,10 @@ CPP_OBJS = \
LibFetchRewindInputStream.o \
Frontier.o \
Deduper.o \
- HTMLLinkExtractProcessor.o
+ HTMLLinkExtractProcessor.o \
+ ProtocolURLFilter.o \
+ DomainURLFilter.o \
+ ChainURLFilter.o
CPP_BINS = \
crawlingwolf$(EXE)
diff --git a/src/HTMLLinkExtractProcessor.cpp b/src/HTMLLinkExtractProcessor.cpp
index e956017..17a7b20 100644
--- a/src/HTMLLinkExtractProcessor.cpp
+++ b/src/HTMLLinkExtractProcessor.cpp
@@ -7,8 +7,8 @@
using namespace std;
using namespace streamhtmlparser;
-HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier )
- : m_frontier( frontier ), m_parser( )
+HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter )
+ : m_frontier( frontier ), m_filter( filter ), m_parser( )
{
}
@@ -27,17 +27,35 @@ void HTMLLinkExtractProcessor::process( RewindInputStream *s )
m_parser.Parse( buf, 1 );
if( m_parser.state( ) == HtmlParser::STATE_VALUE && m_parser.tag( ) != NULL && m_parser.attribute( ) != NULL && m_parser.value( ) != NULL ) {
- if( strcmp( m_parser.tag( ), "a" ) == 0 && strcmp( m_parser.attribute( ), "href" ) == 0 ) {
+ if( strcmp( m_parser.tag( ), "base" ) == 0 &&
+ strcmp( m_parser.attribute( ), "href" ) == 0 ) {
+ s->setBaseUrl( string( m_parser.value( ) ) );
+ }
+ if( ( ( strcmp( m_parser.tag( ), "a" ) == 0 ||
+ strcmp( m_parser.tag( ), "area" ) == 0 ||
+ strcmp( m_parser.tag( ), "link" ) == 0 ) &&
+ strcmp( m_parser.attribute( ), "href" ) == 0 ) ||
+ ( ( strcmp( m_parser.tag( ), "img" ) == 0 ||
+ strcmp( m_parser.tag( ), "frame" ) == 0 ||
+ strcmp( m_parser.tag( ), "iframe" ) == 0 ||
+ strcmp( m_parser.tag( ), "embed" ) == 0 ) &&
+ strcmp( m_parser.attribute( ), "src" ) == 0 )
+ ) {
link = m_parser.value( );
in_link = true;
}
} else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) {
- if( link.substr( 0, 7 ) == "http://" ) {
- m_frontier->addUrl( link );
+ if( link.substr( 0, 7 ) == "http://" ||
+ link.substr( 0, 8 ) == "https://" ) {
+ if( m_filter->filter( link ) ) {
+ m_frontier->addUrl( link );
+ }
} else {
string absLink( s->getBaseUrl( ).str( ) );
absLink.append( link );
- m_frontier->addUrl( absLink );
+ if( m_filter->filter( absLink ) ) {
+ m_frontier->addUrl( absLink );
+ }
}
link.clear( );
in_link = false;
diff --git a/src/HTMLLinkExtractProcessor.hpp b/src/HTMLLinkExtractProcessor.hpp
index 2777521..9d2c579 100644
--- a/src/HTMLLinkExtractProcessor.hpp
+++ b/src/HTMLLinkExtractProcessor.hpp
@@ -3,17 +3,19 @@
#include "Processor.hpp"
#include "Frontier.hpp"
+#include "URLFilter.hpp"
#include "htmlparser_cpp.h"
class HTMLLinkExtractProcessor : public Processor {
public:
- HTMLLinkExtractProcessor( Frontier *frontier );
+ HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter );
virtual ~HTMLLinkExtractProcessor( );
virtual void process( RewindInputStream *s );
protected:
Frontier *m_frontier;
+ URLFilter *m_filter;
streamhtmlparser::HtmlParser m_parser;
};
diff --git a/src/ProtocolURLFilter.cpp b/src/ProtocolURLFilter.cpp
new file mode 100644
index 0000000..57e042d
--- /dev/null
+++ b/src/ProtocolURLFilter.cpp
@@ -0,0 +1,11 @@
+#include "ProtocolURLFilter.hpp"
+
+ProtocolURLFilter::ProtocolURLFilter( const std::set<std::string> protocols )
+ : m_protocols( protocols )
+{
+}
+
+bool ProtocolURLFilter::filter( const URL &url )
+{
+ return( m_protocols.find( url.protocol( ) ) != m_protocols.end( ) );
+}
diff --git a/src/ProtocolURLFilter.hpp b/src/ProtocolURLFilter.hpp
new file mode 100644
index 0000000..cd05ff9
--- /dev/null
+++ b/src/ProtocolURLFilter.hpp
@@ -0,0 +1,19 @@
+#ifndef __PROTOCOL_URLFILTER_H
+#define __PROTOCOL_URLFILTER_H
+
+#include "URLFilter.hpp"
+
+#include <set>
+
+class ProtocolURLFilter : public URLFilter
+{
+ public:
+ ProtocolURLFilter( const std::set<std::string> protocols );
+
+ virtual bool filter( const URL &url );
+
+ protected:
+ std::set<std::string> m_protocols;
+};
+
+#endif
diff --git a/src/RewindInputStream.hpp b/src/RewindInputStream.hpp
index 9daafe4..92f2961 100644
--- a/src/RewindInputStream.hpp
+++ b/src/RewindInputStream.hpp
@@ -17,6 +17,11 @@ class RewindInputStream : public std::istream {
return m_baseUrl;
}
+ void setBaseUrl( const URL &url )
+ {
+ m_baseUrl = url;
+ }
+
private:
URL m_baseUrl;
};
diff --git a/src/URL.hpp b/src/URL.hpp
index 4031988..9813956 100644
--- a/src/URL.hpp
+++ b/src/URL.hpp
@@ -32,6 +32,26 @@ class URL {
return m_url;
}
+ const string protocol( ) const
+ {
+ return "http";
+ }
+
+ const string domain( ) const
+ {
+ return "www.andreasbaumann.cc";
+ }
+
+ unsigned short port( ) const
+ {
+ return 80;
+ }
+
+ const string path( ) const
+ {
+ return "/";
+ }
+
static URL Null;
bool operator!=( const URL &other ) const {
diff --git a/src/URLFilter.hpp b/src/URLFilter.hpp
index 83cddea..c48307e 100644
--- a/src/URLFilter.hpp
+++ b/src/URLFilter.hpp
@@ -3,8 +3,11 @@
#include "URL.hpp"
-class URLFilter {
+class URLFilter
+{
public:
+ virtual ~URLFilter( ) { };
+
virtual bool filter( const URL &url ) = 0;
};
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index e96a855..2f4e067 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -2,18 +2,37 @@
#include "MemoryFrontier.hpp"
#include "MD5Deduper.hpp"
#include "HTMLLinkExtractProcessor.hpp"
+#include "ChainURLFilter.hpp"
+#include "ProtocolURLFilter.hpp"
+#include "DomainURLFilter.hpp"
+
+#include <set>
+
+using namespace std;
int main( void )
{
Frontier *frontier = new MemoryFrontier( );
Fetcher *fetcher = new LibFetchFetcher( );
Deduper *deduper = new MD5Deduper( );
- Processor *processor = new HTMLLinkExtractProcessor( frontier );
+
+ set<string> protocols;
+ protocols.insert( "http" );
+ protocols.insert( "https" );
+ ProtocolURLFilter protocolFilter( protocols );
+
+ set<string> domains;
+ domains.insert( "www.andreasbaumann.cc" );
+ DomainURLFilter domainFilter( domains );
+
+ ChainURLFilter filters( &protocolFilter, &domainFilter );
+
+ Processor *processor = new HTMLLinkExtractProcessor( frontier, &filters );
LOG( logNOTICE ) << "Crawler started..";
frontier->addUrl( URL( "http://www.andreasbaumann.cc" ) );
-
+
URL url;
while( ( url = frontier->getNextUrl( ) ) != URL::Null ) {
LOG( logINFO ) << "Got URL " << url;