blob: 0dca3f2d25f4624cdd540fdd74b8132603271cbc (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
#include "LibFetchFetcher.hpp"
#include "MemoryFrontier.hpp"
#include "MD5Deduper.hpp"
#include "HTMLLinkExtractProcessor.hpp"
#include "Logger.hpp"
int main( void )
{
Frontier *frontier = new MemoryFrontier( );
Fetcher *fetcher = new LibFetchFetcher( );
Deduper *deduper = new MD5Deduper( );
Processor *processor = new HTMLLinkExtractProcessor( frontier );
LOG( logNOTICE ) << "Crawler started..";
frontier->addUrl( URL( "http://www.andreasbaumann.cc" ) );
URL url;
while( ( url = frontier->getNextUrl( ) ) != URL::Null ) {
LOG( logINFO ) << "Got URL " << url;
RewindInputStream *s = fetcher->fetch( url );
if( deduper->contentSeen( url, s ) ) {
LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen";
delete s;
continue;
}
processor->process( s );
delete s;
}
delete processor;
delete deduper;
delete fetcher;
delete frontier;
LOG( logNOTICE ) << "Crawler stopped..";
return 0;
}
|