path: root/src/crawlingwolf.cpp
#include "LibFetchFetcher.hpp"
#include "MemoryFrontier.hpp"
#include "MD5Deduper.hpp"
#include "HTMLLinkExtractProcessor.hpp"
#include "ChainURLFilter.hpp"
#include "ProtocolURLFilter.hpp"
#include "HostURLFilter.hpp"
#include "MemoryURLSeen.hpp"
#include "URLNormalizer.hpp"
#include "ModuleLoader.hpp"
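// NOTE: the FILELog / LOG macros and the log levels used below are assumed to be
// provided, directly or transitively, by one of the project headers included above.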

#include <set>
#include <string>
#include <vector>

using namespace std;

int main( void )
{
	FILELog::reportingLevel( ) = logINFO;
	
	// core crawler components: frontier queue, page fetcher, content deduper, seen-URL store
	Frontier *frontier = new MemoryFrontier( );
	Fetcher *fetcher = new LibFetchFetcher( );
	Deduper *deduper = new MD5Deduper( );
	URLSeen *urlSeen = new MemoryURLSeen( );

	// accept only URLs with the http or https protocol
	set<string> protocols;
	protocols.insert( "http" );
	protocols.insert( "https" );
	ProtocolURLFilter protocolFilter( protocols );

	// restrict the crawl to a single host
	set<string> hosts;
	hosts.insert( "www.andreasbaumann.cc" );
	HostURLFilter hostFilter( hosts );

	// chain both filters so a URL must pass all of them
	ChainURLFilter filters( &protocolFilter, &hostFilter );

	// load the URL normalizer implementations as dynamically loadable modules
	vector<string> modules;
	modules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
	modules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
	ModuleLoader<URLNormalizer> urlNormalizers( modules );
	// alternative: URLNormalizer *normalizer = urlNormalizers.create( "simple" );
	URLNormalizer *normalizer = urlNormalizers.create( "google" );
	
	// extract links from fetched HTML, filter them and feed unseen URLs back into the frontier
	Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen );
	
	LOG( logNOTICE ) << "Crawler started..";
	
	// seed the frontier with the start URL
	frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
	
	// main crawl loop: fetch each URL, skip duplicate content, otherwise process the page
	URL url;
	while( ( url = frontier->getNextUrl( ) ) != URL::Null ) {
		LOG( logINFO ) << "Got URL " << url;
		RewindInputStream *s = fetcher->fetch( url );
		
		if( deduper->contentSeen( url, s ) ) {
			LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen";
			delete s;
			continue;
		}
		
		processor->process( s );
		
		delete s;
	}
	
	// tear down components; the normalizer must be destroyed through its module loader
	delete processor;
	urlNormalizers.destroy( "google", normalizer );
	delete urlSeen;
	delete deduper;
	delete fetcher;
	delete frontier;

	LOG( logNOTICE ) << "Crawler stopped..";
		
	return 0;
}