summaryrefslogtreecommitdiff
path: root/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp
blob: ea049806b8fc0d551875bb96f0cfcda078178ad7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include "GoogleURLNormalizer.hpp"

//#include <string>

#include "url_util.h"
#include "url_canon_stdstring.h"
#include "url_parse.h"

using namespace std;
using namespace url_util;
using namespace url_canon;
using namespace url_parse;

// Constructs the normalizer; the only work done is a call to Initialize( ).
// NOTE(review): Initialize is not declared in this file. It presumably
// resolves to url_util::Initialize( ) through the using-directive, unless
// the class declares a member of that name - confirm against
// GoogleURLNormalizer.hpp and url_util.h.
GoogleURLNormalizer::GoogleURLNormalizer( )
{
	Initialize( );
}

// Destroys the normalizer; the only work done is a call to Shutdown( ),
// the counterpart of the Initialize( ) call made by the constructor.
// NOTE(review): Shutdown is not declared in this file. It presumably
// resolves to url_util::Shutdown( ) through the using-directive - confirm.
// If that state is process-global, destroying one instance while another
// is still alive may need a second look.
GoogleURLNormalizer::~GoogleURLNormalizer( )
{
	Shutdown( );
}

// Returns the part of `s` described by `comp` (offset comp.begin, length
// comp.len). A component whose length is zero or negative yields an empty
// string.
static string componentString( const string &s, const Component &comp )
{
	if( comp.len > 0 ) {
		return s.substr( comp.begin, comp.len );
	}
	return string( );
}

// Canonicalizes the URL string `s` and builds a URL object from the
// scheme, host, port, path and query components of the canonical form.
// Returns URL::Null when Canonicalize( ) reports failure.
URL GoogleURLNormalizer::parseUrl( const string s )
{
	// Buffer receiving the canonical spec; reserve some slack up front.
	string spec;
	spec.reserve( s.size( ) + 32 );
	StdStringCanonOutput sink( &spec );
	Parsed parts;

	if( !Canonicalize( s.data( ), static_cast<int>( s.length( ) ),
			NULL, &sink, &parts ) ) {
		return URL::Null;
	}
	// Finish writing into `spec` before any component is read from it.
	sink.Complete( );

	const string scheme = componentString( spec, parts.scheme );

	// A negative port length means no port component is present in the
	// canonical spec; fall back to the default port of the scheme.
	unsigned short port;
	if( parts.port.len < 0 ) {
		port = URL::defaultPort( scheme );
	} else {
		port = static_cast<unsigned short>(
			atoi( componentString( spec, parts.port ).c_str( ) ) );
	}

	// The last constructor argument is always passed as an empty string.
	return URL(	scheme,
			componentString( spec, parts.host ),
			port,
			componentString( spec, parts.path ),
			componentString( spec, parts.query ),
			"" );
}

// Resolves the reference `s` against the base `url` and builds a URL
// object from the scheme, host, port, path and query components of the
// canonical result.
// Returns URL::Null when either the canonicalization of the base or the
// resolution of `s` against it reports failure.
URL GoogleURLNormalizer::normalize( const URL url, const string s )
{
	// Step 1: canonicalize the base. urlParsed receives the component
	// offsets of the canonical output (urlCanonical), not of urlstr.
	const string urlstr = url.str( );
	string urlCanonical;
	urlCanonical.reserve( urlstr.size( ) + 32 );
	StdStringCanonOutput urlOutput( &urlCanonical );
	Parsed urlParsed;
	bool success = Canonicalize(
		urlstr.data( ), static_cast<int>( urlstr.length( ) ),
		NULL, &urlOutput, &urlParsed );
	if( !success ) {
		return URL::Null;
	}
	// Finish writing into urlCanonical before it is read below.
	urlOutput.Complete( );

	// Step 2: resolve `s` relative to the base. The base spec handed to
	// ResolveRelative( ) must be the string urlParsed refers to, i.e. the
	// canonical spec; passing the raw urlstr would pair the offsets with
	// the wrong buffer whenever canonicalization changed the string.
	string canonical;
	canonical.reserve( urlCanonical.size( ) + s.size( ) + 32 );
	StdStringCanonOutput output( &canonical );
	Parsed parsed;
	success = ResolveRelative(
		urlCanonical.data( ), static_cast<int>( urlCanonical.length( ) ), urlParsed,
		s.data( ), static_cast<int>( s.length( ) ),
		NULL, &output, &parsed );
	if( !success ) {
		return URL::Null;
	}
	output.Complete( );

	// Step 3: take the explicit port if the resolved URL has one, otherwise
	// the default port of its scheme.
	unsigned short port;
	if( parsed.port.len >= 0 ) {
		port = (unsigned short)atoi(
			componentString( canonical, parsed.port ).c_str( ) );
	} else {
		port = URL::defaultPort(
			componentString( canonical, parsed.scheme ) );
	}

	// The last constructor argument is always passed as an empty string.
	return URL(	componentString( canonical, parsed.scheme ),
			componentString( canonical, parsed.host ),
			port,
			componentString( canonical, parsed.path ),
			componentString( canonical, parsed.query ),
			"" );
}

// Registers GoogleURLNormalizer under the name "google_urlnormalizer".
// NOTE(review): REGISTER_MODULE is defined outside this file. The remaining
// arguments are presumably two numeric parameters, the interface type
// (URLNormalizer) and the implementing class - confirm against the macro.
REGISTER_MODULE( "google_urlnormalizer", 0, 0, URLNormalizer, GoogleURLNormalizer )