summaryrefslogtreecommitdiff
path: root/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
blob: d855781c54efe32927fcfc30f9277fe4bde9c6f5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#include <string>
#include <algorithm>

#include "SimpleURLNormalizer.hpp"

#include "Unused.hpp"

#ifdef WITH_LUA
#include "tolua.h"
#include "SimpleURLNormalizerLua.hpp"
#include "LuaVM.hpp"
#endif

using namespace std;

// Constructor: the body is empty, no work is done here.
SimpleURLNormalizer::SimpleURLNormalizer( )
{
}

// Destructor: the body is empty, no explicit cleanup is performed here.
SimpleURLNormalizer::~SimpleURLNormalizer( )
{
}

// Returns true if s[0..len) is a syntactically valid URL scheme:
// a letter followed by letters, digits, '+', '-' or '.' (RFC 3986, 3.1).
static bool isValidScheme( const string &s, size_t len )
{
	if( len == 0 ) {
		return false;
	}
	for( size_t i = 0; i < len; i++ ) {
		const char c = s[i];
		const bool isAlpha = ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' );
		const bool isOther = ( c >= '0' && c <= '9' ) || c == '+' || c == '-' || c == '.';
		if( !isAlpha && ( i == 0 || !isOther ) ) {
			return false;
		}
	}
	return true;
}

// Converts a non-empty string of decimal digits into a port number.
// Returns false if a non-digit is found or the value exceeds 65535.
static bool parsePort( const string &text, unsigned short &port )
{
	unsigned long value = 0;
	for( size_t i = 0; i < text.length( ); i++ ) {
		const char c = text[i];
		if( c < '0' || c > '9' ) {
			return false;
		}
		value = value * 10 + static_cast<unsigned long>( c - '0' );
		if( value > 65535UL ) {
			return false;
		}
	}
	port = static_cast<unsigned short>( value );
	return true;
}

/**
 * Parses an absolute URL of the form
 *
 *   protocol://host[:port][/path...]
 *
 * into its components.
 *
 * @param s the string to parse
 * @return the parsed URL, or URL::Null if 's' is empty, has no valid
 *         protocol followed by "://", or carries an invalid port.
 *
 * The port defaults to URL::defaultPort( protocol ) when it is missing or
 * empty. Query and fragment are not split off yet (see TODOs below); they
 * remain part of the path. The path always starts with a '/'.
 */
URL SimpleURLNormalizer::parseUrl( const string s )
{
	if( s.empty( ) ) {
		return URL::Null;
	}

	// protocol: everything up to the first ':' must be a valid scheme,
	// otherwise a ':' somewhere inside a relative link (for instance in
	// "/redirect?to=http://other/") would be mistaken for the separator
	const size_t protocolEnd = s.find( ':' );
	if( protocolEnd == string::npos || !isValidScheme( s, protocolEnd ) ) {
		// no protocol separator ':', not really legal
		return URL::Null;
	}
	if( s.compare( protocolEnd, 3, "://" ) != 0 ) {
		// no protocol, not really legal
		return URL::Null;
	}
	const string protocol = s.substr( 0, protocolEnd );

	// host: the authority ends at the first '/', '?' or '#' (or at the end
	// of the string)
	const size_t hostStart = protocolEnd + 3;
	size_t authorityEnd = s.find_first_of( "/?#", hostStart );
	if( authorityEnd == string::npos ) {
		authorityEnd = s.length( );
	}
	size_t hostEnd = s.find( ':', hostStart );
	if( hostEnd == string::npos || hostEnd > authorityEnd ) {
		// no port separator inside the authority
		hostEnd = authorityEnd;
	}
	const string host = s.substr( hostStart, hostEnd - hostStart );

	// port: an empty port ("host:/") falls back to the protocol default
	unsigned short port = URL::defaultPort( protocol );
	if( hostEnd < authorityEnd ) {
		const string portStr = s.substr( hostEnd + 1, authorityEnd - hostEnd - 1 );
		if( !portStr.empty( ) && !parsePort( portStr, port ) ) {
			// port is not a number in the range 0..65535
			return URL::Null;
		}
	}

	// path
	string path;
	if( authorityEnd == s.length( ) || s[authorityEnd] != '/' ) {
		// add leading slash if the path is empty (a query or fragment
		// may follow the authority directly)
		path = "/";
	}
	path.append( s, authorityEnd, string::npos );

	// TODO: fragment
	string fragment;

	// TODO: query
	string query;

	return URL( protocol, host, port, path, query, fragment );
}

/*
 * Background notes on URL normalization, collected from a Stack Overflow
 * question ("I would like to ask if there's any Java package or library
 * that have the standard URL normalization?").
 *
 * protocol:
 *
 * The 5 components of a URL representation:
 *
 *   http://www.example.com:8040/folder/exist?name=sky#head
 *   scheme:    http
 *   authority: www.example.com:8040
 *   path:      /folder/exist
 *   query:     ?name=sky
 *   fragment:  #head
 *
 * The 3 types of standard URL normalization (only the first one is
 * detailed here):
 *
 * Syntax-based normalization:
 * - Case normalization: convert all letters in the scheme and authority
 *   components to lower case.
 * - Percent-encoding normalization: decode any percent-encoded octet that
 *   corresponds to an unreserved character, such as %2D for hyphen and
 *   %5F for underscore.
 * - Path segment normalization: remove dot-segments from the path
 *   component, such as '.' and '..'.
 *
 * domains:
 * - https://github.com/john-kurkowski/tldextract
 * - http://stackoverflow.com/questions/569137/how-to-get-domain-name-from-url
 * - http://stackoverflow.com/questions/2616011/easy-way-to-parse-a-url-in-c-cross-platform
 */

/**
 * Resolves the link 's' found on the page 'url' into an absolute URL.
 *
 * @param url the base URL the link is relative to
 * @param s   the link, either an absolute URL, an absolute path (starting
 *            with '/') or a path relative to the directory of the base path
 * @return 's' parsed as a URL if it is absolute, otherwise a URL with the
 *         protocol, host and port of 'url' and the resolved path. Query
 *         and fragment of the result are always passed as empty strings.
 */
URL SimpleURLNormalizer::normalize( const URL url, const string s )
{
	// See if the URL is parseable, if so it is an absolute URL
	URL absUrl = parseUrl( s );
	if( absUrl != URL::Null ) {
		return absUrl;
	}

	// the new path starts with a slash, so it's absolute, ignore
	// the old path
	if( !s.empty( ) && s[0] == '/' ) {
		return URL( url.protocol( ), url.host( ), url.port( ), s, "", "" );
	}

	// relative links have path, query and fragment only, try to
	// append them cleverly
	string path = url.path( );

	// find out the directory of the base path: everything before the
	// last slash. rfind signals "not found" with string::npos (not 0),
	// position 0 simply means the base is a file in the root directory.
	const size_t lastSlash = path.rfind( '/' );
	if( lastSlash == string::npos ) {
		// no directory at all, just a file, so the new path is
		// resolved against the root
		path.clear( );
	} else {
		path.erase( lastSlash );
	}
	path.append( "/" );

	// append the one from s
	path.append( s );

	// normalize sequences of "/", "." and ".."
	normalizePath( path );

	return URL( url.protocol( ), url.host( ), url.port( ), path, "", "" );
}

// Removes the last segment of 'out' together with its leading '/'.
// If 'out' contains no '/', it is emptied.
static void removeLastSegment( string &out )
{
	const size_t slash = out.rfind( '/' );
	if( slash == string::npos ) {
		out.clear( );
	} else {
		out.erase( slash );
	}
}

/**
 * Removes the dot segments "." and ".." from a path in place, following
 * the remove_dot_segments algorithm of RFC 3986, section 5.2.4, e.g.
 *
 *   /a/./b     -> /a/b
 *   /a/b/../c  -> /a/c
 *   /../a      -> /a
 *
 * Only complete segments are touched, so names which merely contain or
 * end in a dot (like "/a./b" or "/.hidden") are preserved. Everything
 * from the first '?' or '#' on (query and fragment, which currently
 * travel as part of the path) is left unchanged.
 *
 * @param path the path to normalize, modified in place
 */
void SimpleURLNormalizer::normalizePath( string &path )
{
	// split off query/fragment, dots in there are not path segments
	string suffix;
	const size_t suffixPos = path.find_first_of( "?#" );
	if( suffixPos != string::npos ) {
		suffix = path.substr( suffixPos );
		path.erase( suffixPos );
	}

	string out;
	size_t pos = 0;
	while( pos < path.length( ) ) {
		if( path.compare( pos, 3, "../" ) == 0 ) {
			// leading "../" of a relative path: drop it
			pos += 3;
		} else if( path.compare( pos, 2, "./" ) == 0 ) {
			// leading "./" of a relative path: drop it
			pos += 2;
		} else if( path.compare( pos, 3, "/./" ) == 0 ) {
			// "/./" collapses to "/", continue at the second slash
			pos += 2;
		} else if( path.compare( pos, string::npos, "/." ) == 0 ) {
			// trailing "/." becomes "/"
			out.append( "/" );
			break;
		} else if( path.compare( pos, 4, "/../" ) == 0 ) {
			// "/../" steps one directory up, continue at the second slash
			removeLastSegment( out );
			pos += 3;
		} else if( path.compare( pos, string::npos, "/.." ) == 0 ) {
			// trailing "/.." steps one directory up and ends in "/"
			removeLastSegment( out );
			out.append( "/" );
			break;
		} else if( path.compare( pos, string::npos, "." ) == 0 ||
			path.compare( pos, string::npos, ".." ) == 0 ) {
			// the rest is only "." or "..": nothing left to copy
			break;
		} else {
			// ordinary segment: copy it including its leading slash
			// (if any) up to, but not including, the next slash
			const size_t next = path.find( '/', pos + 1 );
			if( next == string::npos ) {
				out.append( path, pos, string::npos );
				break;
			}
			out.append( path, pos, next - pos );
			pos = next;
		}
	}

	path = out + suffix;
}

// Module initialisation hook. When built with WITH_LUA, user_data is
// treated as a pointer to a LuaVM and the tolua binding of the
// SimpleURLNormalizer is opened on the handle of that VM. Without
// WITH_LUA the function does nothing.
static void initModule( CRAWLER_UNUSED( void *user_data ) )
{
#ifdef WITH_LUA
	tolua_SimpleURLNormalizer_open( static_cast<LuaVM *>( user_data )->handle( ) );
#endif
}

// Module destruction hook: the user data argument is ignored and the body
// is empty, so nothing is torn down here.
static void destroyModule( void * /* user_data */ )
{
}

// Hands the module name "simple_urlnormalizer", the initModule and
// destroyModule hooks and the names URLNormalizer / SimpleURLNormalizer to
// the REGISTER_MODULE macro. The macro is defined elsewhere; presumably it
// registers SimpleURLNormalizer as a URLNormalizer implementation with the
// module loader - confirm against the macro definition.
REGISTER_MODULE( "simple_urlnormalizer", &initModule, &destroyModule, URLNormalizer, SimpleURLNormalizer )