1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
|
#include <string>
#include <algorithm>
#include "SimpleURLNormalizer.hpp"
#ifdef WITH_LUA
#include "tolua.h"
#include "SimpleURLNormalizerLua.hpp"
#include "LuaVM.hpp"
#endif
using namespace std;
// Constructor. The body is empty: nothing is initialised here.
SimpleURLNormalizer::SimpleURLNormalizer( )
{
}
// Destructor. The body is empty: nothing is released here.
SimpleURLNormalizer::~SimpleURLNormalizer( )
{
}
// Parses an absolute URL of the form
//
//   protocol "://" host [ ":" port ] [ path-and-rest ]
//
// and returns it as an URL object.
//
// Returns URL::Null when 's':
//   - is empty,
//   - contains no ':' or the first ':' is not the start of "://",
//   - has a port which is not purely decimal or is larger than 65535.
//
// Details:
//   - the authority (host and port) ends at the first '/', '?' or '#'
//     following "://", or at the end of the string,
//   - the host ends at the first ':' inside the authority; user
//     information ("user:pass@host") and bracketed IPv6 literals are
//     not understood by this simple parser,
//   - a missing or empty port ("http://host:/") selects
//     URL::defaultPort( protocol ),
//   - everything after the authority is stored in the path, query and
//     fragment are NOT split off yet (see the TODOs below); if that
//     rest does not start with '/', a '/' is prepended, so the path is
//     never empty.
URL SimpleURLNormalizer::parseUrl( const string s )
{
	if( s.empty( ) ) {
		return URL::Null;
	}

	// protocol: everything up to the first ':' which must be the
	// beginning of "://"
	const size_t protocolEnd = s.find( ':' );
	if( protocolEnd == string::npos ) {
		// no protocol separator ':', not really legal
		return URL::Null;
	}
	if( s.compare( protocolEnd, 3, "://" ) != 0 ) {
		// no protocol, not really legal
		return URL::Null;
	}
	const string protocol = s.substr( 0, protocolEnd );

	// authority: host and optional port, terminated by the start of
	// the path, the query or the fragment
	const size_t hostStart = protocolEnd + 3;
	size_t authorityEnd = s.find_first_of( "/?#", hostStart );
	if( authorityEnd == string::npos ) {
		authorityEnd = s.size( );
	}

	// host
	size_t hostEnd = s.find( ':', hostStart );
	if( hostEnd == string::npos || hostEnd > authorityEnd ) {
		// a ':' behind the authority belongs to path/query/fragment
		hostEnd = authorityEnd;
	}
	const string host = s.substr( hostStart, hostEnd - hostStart );

	// port
	unsigned short port = URL::defaultPort( protocol );
	if( hostEnd < authorityEnd ) {
		// s[hostEnd] is the ':' introducing the port
		const size_t portStart = hostEnd + 1;
		unsigned long value = 0;
		for( size_t i = portStart; i < authorityEnd; ++i ) {
			const char c = s[i];
			if( c < '0' || c > '9' ) {
				// not a number, refuse instead of guessing
				return URL::Null;
			}
			value = value * 10 + static_cast<unsigned long>( c - '0' );
			if( value > 65535UL ) {
				// does not fit into a TCP/UDP port
				return URL::Null;
			}
		}
		if( portStart < authorityEnd ) {
			port = static_cast<unsigned short>( value );
		}
		// else: "host:" with an empty port keeps the default port
	}

	// path
	string path;
	if( authorityEnd < s.size( ) && s[authorityEnd] == '/' ) {
		path = s.substr( authorityEnd );
	} else {
		// add a leading slash if the path is empty (a query or a
		// fragment directly following the host is kept behind it)
		path = "/";
		path.append( s, authorityEnd, string::npos );
	}

	// TODO: fragment
	string fragment;

	// TODO: query
	string query;

	return URL( protocol, host, port, path, query, fragment );
}
/*
 * protocol:
 *
 * Notes taken from a Stack Overflow question ("I would like to ask if
 * there's any Java package or library that have the standard URL
 * normalization?").
 *
 * 5 Components of URL Representation
 *
 *   http://www[dot]example[dot]com:8040/folder/exist?name=sky#head
 *
 *   scheme:    http
 *   authority: www.example.com:8040
 *   path:      /folder/exist
 *   query:     ?name=sky
 *   fragment:  #head
 *
 * The 3 types of standard URL normalization (only the syntax-based one
 * is reproduced here)
 *
 * Syntax-Based Normalization
 *
 *   Case normalization - convert all letters in the scheme and
 *   authority components to lower case
 *
 *   Percent-encoded normalization - decode any percent-encoded octet
 *   that corresponds to an unreserved character, such as %2D for hyphen
 *   and %5F for underscore
 *
 *   Path segment normalization - remove dot-segments from the path
 *   component, such as '.' and '..'
 *
 * domains:
 *
 * https://github.com/john-kurkowski/tldextract
 * http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url
 * http://stackoverflow.com/questions/2616011/easy-way-to-parse-a-url-in-c-cross-platform
 *
 */
// Resolves the link 's' found in a document with the address 'url'.
//
//   - If 's' can be parsed by parseUrl() it is an absolute URL and is
//     returned as parsed, 'url' is ignored.
//   - If 's' starts with '/', it replaces the path of 'url'; protocol,
//     host and port are taken from 'url'.
//   - Otherwise 's' is relative: it is appended to the directory of the
//     path of 'url' (everything before the last '/') and the result is
//     passed through normalizePath().
//
// A base path without any directory part ("/index.html", or a path
// containing no '/' at all) resolves against the root, so the resulting
// path always starts with '/'.
//
// Query and fragment of the returned URL are always empty strings; a
// query or fragment contained in 's' stays part of the path.
URL SimpleURLNormalizer::normalize( const URL url, const string s )
{
	// See if the URL is parseable, if so it is an absolute URL
	URL absUrl = parseUrl( s );
	if( absUrl != URL::Null ) {
		return absUrl;
	}

	// the new path starts with a slash, so it's absolute, ignore
	// the old path
	if( !s.empty( ) && s[0] == '/' ) {
		return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" );
	}

	// relative links have path, query and fragment only, try to
	// append them cleverly
	const string basePath = url.path( );

	// the directory of the base path is everything in front of the
	// last slash; rfind returns string::npos (and not 0) if there is
	// no slash at all, in this case the directory is empty
	string path;
	const size_t lastSlash = basePath.rfind( '/' );
	if( lastSlash != string::npos ) {
		path.assign( basePath, 0, lastSlash );
	}
	path.append( "/" );

	// append the one from s
	path.append( s );

	// normalize sequences of "/", "." and ".."
	normalizePath( path );

	return URL( url.protocol( ), url.host( ), url.port( ), path, "", "" );
}
// Removes the dot segments "." and ".." from 'path' in place, following
// the remove_dot_segments algorithm of RFC 3986, section 5.2.4:
//
//   "/a/b/c/./../../g"  ->  "/a/g"
//   "mid/content=5/../6" ->  "mid/6"
//
// Only complete segments are touched, names merely containing dots
// ("foo./bar", "..hidden") are left alone. A ".." which would climb
// above the first segment is dropped. Empty segments ("//") are kept
// as they are.
//
// As callers currently leave query and fragment inside the path,
// everything from the first '?' or '#' on is copied verbatim.
void SimpleURLNormalizer::normalizePath( string &path )
{
	// split off query/fragment, they must not be rewritten
	const size_t suffixPos = path.find_first_of( "?#" );
	const string suffix = ( suffixPos == string::npos ) ? string( ) : path.substr( suffixPos );
	const string input = path.substr( 0, suffixPos );
	const size_t len = input.size( );

	string output;
	output.reserve( len );

	// 'pos' marks the start of the not yet consumed part of the input
	size_t pos = 0;
	while( pos < len ) {
		if( input.compare( pos, 3, "../" ) == 0 ) {
			// leading "../", nothing to go up to
			pos += 3;
		} else if( input.compare( pos, 2, "./" ) == 0 ) {
			// leading "./"
			pos += 2;
		} else if( input.compare( pos, 3, "/./" ) == 0 ) {
			// "/./" -> "/", the second slash starts the next segment
			pos += 2;
		} else if( input.compare( pos, string::npos, "/." ) == 0 ) {
			// trailing "/." -> "/"
			output += '/';
			pos = len;
		} else if( input.compare( pos, 4, "/../" ) == 0
			|| input.compare( pos, string::npos, "/.." ) == 0 ) {
			// "/../" or trailing "/..": drop the last segment
			// already written, including its leading slash
			const size_t lastSlash = output.rfind( '/' );
			output.erase( lastSlash == string::npos ? 0 : lastSlash );
			if( pos + 3 < len ) {
				// keep the slash terminating "/../"
				pos += 3;
			} else {
				output += '/';
				pos = len;
			}
		} else if( input.compare( pos, string::npos, "." ) == 0
			|| input.compare( pos, string::npos, ".." ) == 0 ) {
			// the rest is a lone "." or ".."
			pos = len;
		} else {
			// ordinary segment: copy it together with its leading
			// slash (if any) up to the next slash
			size_t next = input.find( '/', pos + 1 );
			if( next == string::npos ) {
				next = len;
			}
			output.append( input, pos, next - pos );
			pos = next;
		}
	}

	path = output + suffix;
}
// Initialisation hook handed to REGISTER_MODULE. When built with
// WITH_LUA, 'user_data' is treated as a pointer to a LuaVM and the
// tolua binding of SimpleURLNormalizer is opened on the handle of
// that VM. Without WITH_LUA the function does nothing.
static void initModule( void *user_data )
{
#ifdef WITH_LUA
	tolua_SimpleURLNormalizer_open( static_cast<LuaVM *>( user_data )->handle( ) );
#else
	// nothing to register, the argument is intentionally ignored
	(void)user_data;
#endif
}
// Tear-down hook handed to REGISTER_MODULE. It does nothing and
// ignores its argument.
static void destroyModule( void *user_data )
{
	(void)user_data;
}
// Registers the module under the name "simple_urlnormalizer" with the
// init and destroy hooks defined above. REGISTER_MODULE is defined
// outside this file; presumably URLNormalizer is the base type and
// SimpleURLNormalizer the implementation being registered -- confirm
// against the macro definition.
REGISTER_MODULE( "simple_urlnormalizer", &initModule, &destroyModule, URLNormalizer, SimpleURLNormalizer )
|