#include "LibFetchFetcher.hpp"
#include "MemoryFrontier.hpp"
#include "MD5Deduper.hpp"
#include "HTMLLinkExtractProcessor.hpp"
#include "ChainURLFilter.hpp"
#include "ProtocolURLFilter.hpp"
#include "HostURLFilter.hpp"
#include "MemoryURLSeen.hpp"
#include "URLNormalizer.hpp"
#include "ModuleLoader.hpp"

#include <memory>
#include <set>
#include <string>
#include <vector>
using namespace std;
int main( void )
{
FILELog::reportingLevel( ) = logINFO;
Frontier *frontier = new MemoryFrontier( );
Fetcher *fetcher = new LibFetchFetcher( );
Deduper *deduper = new MD5Deduper( );
URLSeen *urlSeen = new MemoryURLSeen( );
set<string> protocols;
protocols.insert( "http" );
protocols.insert( "https" );
ProtocolURLFilter protocolFilter( protocols );
set<string> hosts;
hosts.insert( "www.andreasbaumann.cc" );
HostURLFilter hostFilter( hosts );
ChainURLFilter filters( &protocolFilter, &hostFilter );
vector<string> modules;
modules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
modules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
ModuleLoader<URLNormalizer> urlNormalizers( modules );
//URLNormalizer *normalizer = urlNormalizers.create( "simple" );
URLNormalizer *normalizer = urlNormalizers.create( "google" );
Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen );
LOG( logNOTICE ) << "Crawler started..";
frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) );
URL url;
while( ( url = frontier->getNextUrl( ) ) != URL::Null ) {
LOG( logINFO ) << "Got URL " << url;
RewindInputStream *s = fetcher->fetch( url );
if( deduper->contentSeen( url, s ) ) {
LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen";
delete s;
continue;
}
processor->process( s );
delete s;
}
delete processor;
urlNormalizers.destroy( "google", normalizer );
delete urlSeen;
delete deduper;
delete fetcher;
delete frontier;
LOG( logNOTICE ) << "Crawler stopped..";
return 0;
}