summaryrefslogtreecommitdiff
path: root/src/HTMLLinkExtractProcessor.cpp
blob: b71e88ac65b3e929d951e49d34f98e1e3c41ba81 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include "HTMLLinkExtractProcessor.hpp"
#include "Logger.hpp"

#include <string>
#include <cstring>

using namespace std;
using namespace streamhtmlparser;

/**
 * Builds a link extractor around the given collaborators.
 *
 * @param frontier  stored in m_frontier; process( ) passes accepted URLs to
 *                  its addUrl( ).
 * @param filter    stored in m_filter; process( ) asks its filter( ) whether
 *                  a URL should be added.
 *
 * The HTML parser member is value-initialised.
 */
HTMLLinkExtractProcessor::HTMLLinkExtractProcessor( Frontier *frontier, URLFilter *filter )
	: m_frontier( frontier ),
	  m_filter( filter ),
	  m_parser( )
{
}

/**
 * Destructor. The body performs no explicit cleanup; members are torn down
 * by their own destructors.
 */
HTMLLinkExtractProcessor::~HTMLLinkExtractProcessor( )
{
	// intentionally empty
}

/**
 * Tells whether a tag/attribute pair is one whose value is collected as a
 * link: href on a, area, link; src on img, frame, iframe, embed.
 * Comparison is exact (strcmp). Both arguments must be non-NULL.
 */
static bool isLinkBearingAttribute( const char *tag, const char *attribute )
{
	if( strcmp( attribute, "href" ) == 0 ) {
		return	strcmp( tag, "a" ) == 0 ||
			strcmp( tag, "area" ) == 0 ||
			strcmp( tag, "link" ) == 0;
	}
	if( strcmp( attribute, "src" ) == 0 ) {
		return	strcmp( tag, "img" ) == 0 ||
			strcmp( tag, "frame" ) == 0 ||
			strcmp( tag, "iframe" ) == 0 ||
			strcmp( tag, "embed" ) == 0;
	}
	return false;
}

/**
 * Reads the stream one character at a time, feeds each character to the
 * HTML parser and hands every discovered link that passes the URL filter
 * to the frontier.
 *
 * - A <base href="..."> value is forwarded to s->setBaseUrl( ).
 * - Values of the attributes accepted by isLinkBearingAttribute( ) are
 *   remembered while the parser is in STATE_VALUE and emitted once the
 *   parser is back in STATE_TAG.
 * - Links starting with "http://" or "https://" are used as they are; any
 *   other link is appended to s->getBaseUrl( ).str( ).
 * - On STATE_ERROR a message is written to cerr, the parser is reset and
 *   processing stops.
 *
 * The parser is reset before returning in every case.
 *
 * @param s  input stream to scan; also carries the base URL.
 */
void HTMLLinkExtractProcessor::process( RewindInputStream *s )
{
	string pending;			// most recent value seen for a link attribute
	bool havePending = false;	// true while 'pending' awaits emission
	char ch[1] = {0};

	// NOTE(review): the stream state is tested before get( ), not after; if
	// get( ) can fail at end of input, its result is still parsed - confirm
	// against RewindInputStream.
	while( s->good( ) && !s->eof( ) ) {
		ch[0] = s->get( );
		m_parser.Parse( ch, 1 );

		if( m_parser.state( ) == HtmlParser::STATE_VALUE ) {
			const char *tag = m_parser.tag( );
			const char *attribute = m_parser.attribute( );
			const char *value = m_parser.value( );

			if( tag != NULL && attribute != NULL && value != NULL ) {
				// Runs for every character of the value, so both the base URL
				// and the pending link are overwritten with the value read so far.
				if( strcmp( tag, "base" ) == 0 && strcmp( attribute, "href" ) == 0 ) {
					s->setBaseUrl( string( value ) );
				}
				if( isLinkBearingAttribute( tag, attribute ) ) {
					pending = value;
					havePending = true;
				}
			}
			continue;
		}

		if( m_parser.state( ) == HtmlParser::STATE_ERROR ) {
			// TODO: proper error handling
			cerr << endl << "ERROR at " << endl;
			m_parser.Reset( );
			return;
		}

		if( !havePending || m_parser.state( ) != HtmlParser::STATE_TAG ) {
			continue;
		}

		// The attribute value is complete: build the URL to offer.
		const bool isAbsolute =	pending.compare( 0, 7, "http://" ) == 0 ||
					pending.compare( 0, 8, "https://" ) == 0;
		string target;
		if( isAbsolute ) {
			target = pending;
		} else {
			// NOTE(review): plain concatenation of base URL and link; no
			// handling of "/", "../", "//host" or other schemes - confirm
			// this is intended.
			target = s->getBaseUrl( ).str( );
			target.append( pending );
		}

		if( m_filter->filter( target ) ) {
			m_frontier->addUrl( target );
		}

		pending.clear( );
		havePending = false;
	}

	// NOTE(review): a link still pending here (input ended before the parser
	// returned to STATE_TAG) is dropped.
	m_parser.Reset( );
}