1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
|
#include "GoogleURLNormalizer.hpp"
//#include <string>
#include "url_util.h"
#include "url_canon_stdstring.h"
#include "url_parse.h"
#ifdef WITH_LUA
#include "tolua.h"
#include "GoogleURLNormalizerLua.hpp"
#include "LuaVM.hpp"
#endif
using namespace std;
using namespace url_util;
using namespace url_canon;
using namespace url_parse;
/**
 * Constructs the normalizer.
 *
 * All setup work is delegated to Initialize( ), which is declared outside
 * this file.
 */
GoogleURLNormalizer::GoogleURLNormalizer( )
{
	Initialize( );
}
/**
 * Destroys the normalizer.
 *
 * Teardown is delegated to Shutdown( ), which is declared outside this file.
 */
GoogleURLNormalizer::~GoogleURLNormalizer( )
{
	Shutdown( );
}
/**
 * Copies the text of one parsed component out of the string it refers to.
 *
 * @param s     string that the component's offsets index into
 * @param comp  component whose `begin` / `len` fields select the substring
 * @return the selected substring, or an empty string when `comp.len` is
 *         zero or negative
 */
static string componentString( const string &s, const Component &comp )
{
	// Only a strictly positive length selects any text.
	if( comp.len > 0 ) {
		return s.substr( comp.begin, comp.len );
	}
	return string( );
}
/**
 * Canonicalizes a URL string and converts the result into a URL object.
 *
 * @param s  the URL text to canonicalize
 * @return a URL built from the canonical scheme, host, port, path and query,
 *         or URL::Null when Canonicalize( ) reports failure
 *
 * The port is taken from the canonical text when a port component is
 * present (len >= 0); otherwise URL::defaultPort( ) is asked for the port
 * belonging to the scheme. The last constructor argument is always the empty
 * string (presumably the fragment - confirm against URL's constructor).
 */
URL GoogleURLNormalizer::parseUrl( const string s )
{
	// Canonical text is written into `spec`; reserve a little headroom.
	string spec;
	spec.reserve( s.size( ) + 32 );
	StdStringCanonOutput sink( &spec );
	Parsed parts;

	if( !Canonicalize( s.data( ), static_cast<int>( s.length( ) ),
		NULL, &sink, &parts ) ) {
		return URL::Null;
	}

	// Finalize the output object before `spec` is read.
	sink.Complete( );

	// The scheme is needed both for the default port and for the result.
	const string scheme = componentString( spec, parts.scheme );

	unsigned short port;
	if( parts.port.len < 0 ) {
		// No port component: fall back to the scheme's default.
		port = URL::defaultPort( scheme );
	} else {
		const string portText = componentString( spec, parts.port );
		port = static_cast<unsigned short>( atoi( portText.c_str( ) ) );
	}

	return URL( scheme,
		componentString( spec, parts.host ),
		port,
		componentString( spec, parts.path ),
		componentString( spec, parts.query ),
		"" );
}
/**
 * Resolves a (possibly relative) URL string against a base URL and returns
 * the canonical result.
 *
 * @param url  base URL; its string form (url.str( )) is canonicalized first
 * @param s    the URL text to resolve against the base
 * @return a URL built from the canonical scheme, host, port, path and query
 *         of the resolved URL, or URL::Null when either the base cannot be
 *         canonicalized or the resolution fails
 *
 * The port is taken from the resolved text when a port component is present
 * (len >= 0); otherwise URL::defaultPort( ) is asked for the port belonging
 * to the scheme. The last constructor argument is always the empty string
 * (presumably the fragment - confirm against URL's constructor).
 */
URL GoogleURLNormalizer::normalize( const URL url, const string s )
{
	// Step 1: canonicalize the base. `urlParsed` receives component offsets
	// that describe the canonical text in `urlCanonical`.
	const string urlstr = url.str( );
	string urlCanonical;
	urlCanonical.reserve( urlstr.size( ) + 32 );
	StdStringCanonOutput urlOutput( &urlCanonical );
	Parsed urlParsed;
	bool success = Canonicalize(
		urlstr.data( ), static_cast<int>( urlstr.length( ) ),
		NULL, &urlOutput, &urlParsed );
	if( !success ) {
		return URL::Null;
	}
	// Finalize the output object before `urlCanonical` is read.
	urlOutput.Complete( );

	// Step 2: resolve `s` against the base. The base spec handed to
	// ResolveRelative must be the very string `urlParsed` indexes into,
	// i.e. the canonical text. Passing the raw `urlstr` here would pair it
	// with offsets computed for a different string whenever url.str( ) is
	// not already in canonical form.
	string canonical;
	canonical.reserve( urlCanonical.size( ) + s.size( ) + 32 );
	StdStringCanonOutput output( &canonical );
	Parsed parsed;
	success = ResolveRelative(
		urlCanonical.data( ), static_cast<int>( urlCanonical.length( ) ), urlParsed,
		s.data( ), static_cast<int>( s.length( ) ),
		NULL, &output, &parsed );
	if( !success ) {
		return URL::Null;
	}
	// Finalize the output object before `canonical` is read.
	output.Complete( );

	// Step 3: split the resolved URL into the pieces URL expects.
	const string scheme = componentString( canonical, parsed.scheme );

	unsigned short port;
	if( parsed.port.len >= 0 ) {
		port = static_cast<unsigned short>( atoi(
			componentString( canonical, parsed.port ).c_str( ) ) );
	} else {
		// No port component: fall back to the scheme's default.
		port = URL::defaultPort( scheme );
	}

	return URL( scheme,
		componentString( canonical, parsed.host ),
		port,
		componentString( canonical, parsed.path ),
		componentString( canonical, parsed.query ),
		"" );
}
/**
 * Module initialization hook.
 *
 * When compiled with WITH_LUA, `user_data` is treated as a LuaVM pointer and
 * the tolua bindings for GoogleURLNormalizer are opened on the handle it
 * returns. Without WITH_LUA the function does nothing.
 *
 * @param user_data  opaque pointer supplied by the caller; cast to LuaVM*
 *                   in the Lua build
 */
static void initModule( void *user_data )
{
#ifdef WITH_LUA
	tolua_GoogleURLNormalizer_open(
		static_cast<LuaVM *>( user_data )->handle( ) );
#endif
}
/**
 * Module teardown hook. Intentionally empty: the argument is ignored and no
 * work is performed.
 */
static void destroyModule( void * /* user_data */ )
{
	// nothing to release
}
// Module registration: hands the name "google_urlnormalizer", the initModule /
// destroyModule hooks and the type names URLNormalizer / GoogleURLNormalizer
// to the REGISTER_MODULE macro.
// NOTE(review): the macro is defined outside this file; presumably it exposes
// GoogleURLNormalizer as a loadable URLNormalizer implementation - confirm.
REGISTER_MODULE( "google_urlnormalizer", &initModule, &destroyModule, URLNormalizer, GoogleURLNormalizer )
|