summaryrefslogtreecommitdiff
path: root/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.cpp
blob: cd02bcb75958d8ed8229cbb2b70a0ce1de7c1966 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include "HTMLLinkExtractProcessor.hpp"
#include "Logger.hpp"

#include <string>
#include <cstring>

using namespace std;
using namespace streamhtmlparser;

/**
 * Creates a link extractor wired to its collaborators.
 *
 * The four pointers are stored as given (no null checks are performed here);
 * the HTML parser is default-constructed and the base URL starts out as
 * URL::Null until process( ) sets it.
 *
 * @param normalizer used to parse and absolutize URLs found in documents
 * @param frontier   receives newly discovered URLs
 * @param filter     decides which URLs are passed on
 * @param urlSeen    consulted to skip URLs that were already seen
 */
HTMLLinkExtractProcessor::HTMLLinkExtractProcessor(
	URLNormalizer *normalizer,
	Frontier *frontier,
	URLFilter *filter,
	URLSeen *urlSeen )
	: m_normalizer( normalizer ),
	  m_frontier( frontier ),
	  m_filter( filter ),
	  m_urlSeen( urlSeen ),
	  m_parser( ),
	  m_baseUrl( URL::Null )
{
}

/**
 * Destructor. The body is intentionally empty: no explicit cleanup is
 * performed here.
 */
HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( ) { }

/**
 * Scans an HTML document for outgoing links and offers them to the frontier.
 *
 * The stream is fed to the streaming HTML parser one byte at a time. While the
 * parser is inside an attribute value, the value of a link-carrying attribute
 * is remembered:
 *   - href of <a>, <area>, <link>
 *   - src  of <img>, <frame>, <iframe>, <embed>
 * As soon as the parser reports STATE_TAG again, the remembered link is
 * normalized against the current base URL, passed through the URL filter and
 * the seen-check, and added to the frontier if it is accepted and not yet seen.
 *
 * The base URL is initialised from s->getBaseUrl( ) and is replaced when a
 * <base href="..."> attribute value is encountered.
 *
 * If the parser enters STATE_ERROR a message is written to cerr, the parser
 * is reset and the remainder of the document is skipped. The parser is also
 * reset when the end of the stream is reached.
 *
 * @param s document to scan; must not be null (it is dereferenced
 *          unconditionally)
 */
void HTMLLinkExtractProcessor::process( RewindInputStream *s )
{
	string link;
	char buf[1] = {0};
	bool in_link = false;
	
	m_baseUrl = s->getBaseUrl( );
	
	while( s->good( ) && !s->eof( ) ) {
		buf[0] = s->get( );
		
		// The stream state only reflects end-of-input after a read has been
		// attempted, so it has to be re-checked here. Without this the failed
		// final read would push the EOF sentinel (narrowed to a char) into
		// the parser as if it were document content.
		// NOTE(review): assumes RewindInputStream follows std::istream
		// semantics (eof/fail are raised by the read that hits the end).
		if( !s->good( ) || s->eof( ) ) {
			break;
		}
		
		m_parser.Parse( buf, 1 );
		
		if( m_parser.state( ) == HtmlParser::STATE_VALUE && m_parser.tag( ) != NULL && m_parser.attribute( ) != NULL && m_parser.value( ) != NULL ) {
			const char *tag = m_parser.tag( );
			const char *attribute = m_parser.attribute( );
			
			if( 	strcmp( tag, "base" ) == 0 &&
				strcmp( attribute, "href" ) == 0 ) {
				// re-evaluated on every byte of the value; the last
				// assignment sees the complete attribute value
				m_baseUrl = m_normalizer->parseUrl( m_parser.value( ) );
			}
			
			const bool is_href_link =
				(	strcmp( tag, "a" ) == 0 ||
					strcmp( tag, "area" ) == 0 ||
					strcmp( tag, "link" ) == 0 ) &&
				strcmp( attribute, "href" ) == 0;
			const bool is_src_link =
				(	strcmp( tag, "img" ) == 0 ||
					strcmp( tag, "frame" ) == 0 ||
					strcmp( tag, "iframe" ) == 0 ||
					strcmp( tag, "embed" ) == 0 ) &&
				strcmp( attribute, "src" ) == 0;
			
			if( is_href_link || is_src_link ) {
				// keep the value seen so far; it is complete once the
				// parser leaves STATE_VALUE
				link = m_parser.value( );
				in_link = true;
			}
		} else if( in_link && m_parser.state( ) == HtmlParser::STATE_TAG ) {
			// the attribute value has ended: submit the collected link
			URL absoluteLink = m_normalizer->normalize( m_baseUrl, link );
			if( m_filter->filter( absoluteLink ) ) {
				if( !m_urlSeen->seen( absoluteLink ) ) {
					m_frontier->addUrl( absoluteLink );
				}
			}
			
			link.clear( );
			in_link = false;
		} else if( m_parser.state( ) == HtmlParser::STATE_ERROR ) {
			// TODO: proper error handling
			cerr << endl << "ERROR at " << endl;
			m_parser.Reset( );
			return;
		}
	}

	m_parser.Reset( );
}

// Module registration for this processor under the name
// "htmllinkextract_processor"; the trailing pointer types mirror the
// constructor's four parameters.
// NOTE(review): the meaning of the two 0 arguments and the exact effect of
// the macro are defined where REGISTER_MODULE_4 is declared - confirm there.
REGISTER_MODULE_4( "htmllinkextract_processor", 0, 0, Processor, HTMLLinkExtractProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )