summary refs log tree commit diff
path: root/src/crawlingwolf.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/crawlingwolf.cpp')
-rw-r--r-- src/crawlingwolf.cpp 23
1 files changed, 21 insertions, 2 deletions
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index e96a855..2f4e067 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -2,18 +2,37 @@
#include "MemoryFrontier.hpp"
#include "MD5Deduper.hpp"
#include "HTMLLinkExtractProcessor.hpp"
+#include "ChainURLFilter.hpp"
+#include "ProtocolURLFilter.hpp"
+#include "DomainURLFilter.hpp"
+
+#include <set>
+
+using namespace std;
int main( void )
{
Frontier *frontier = new MemoryFrontier( );
Fetcher *fetcher = new LibFetchFetcher( );
Deduper *deduper = new MD5Deduper( );
- Processor *processor = new HTMLLinkExtractProcessor( frontier );
+
+ set<string> protocols;
+ protocols.insert( "http" );
+ protocols.insert( "https" );
+ ProtocolURLFilter protocolFilter( protocols );
+
+ set<string> domains;
+ domains.insert( "www.andreasbaumann.cc" );
+ DomainURLFilter domainFilter( domains );
+
+ ChainURLFilter filters( &protocolFilter, &domainFilter );
+
+ Processor *processor = new HTMLLinkExtractProcessor( frontier, &filters );
LOG( logNOTICE ) << "Crawler started..";
frontier->addUrl( URL( "http://www.andreasbaumann.cc" ) );
-
+
URL url;
while( ( url = frontier->getNextUrl( ) ) != URL::Null ) {
LOG( logINFO ) << "Got URL " << url;