summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-07-29 11:39:56 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-07-29 11:39:56 +0200
commitae148a31b891c760eb08a6f9a2c279f6d7dd6ec2 (patch)
tree2319b9055a49554ea98455da1afcf69784216147
parentff98cc74fd06ef167eb2404ce93051d92dd75caf (diff)
downloadcrawler-ae148a31b891c760eb08a6f9a2c279f6d7dd6ec2.tar.gz
crawler-ae148a31b891c760eb08a6f9a2c279f6d7dd6ec2.tar.bz2
started to add simple parseUrl implementation
-rw-r--r--src/SimpleURLNormalizer.cpp67
-rw-r--r--src/URL.hpp28
-rw-r--r--src/URLNormalizer.hpp6
-rw-r--r--tests/url/GNUmakefile9
-rwxr-xr-xtests/url/exec_test8
-rw-r--r--tests/url/test1.MUST7
-rw-r--r--tests/url/test1.cpp10
-rw-r--r--tests/url/test100.MUST0
-rw-r--r--tests/url/test2.MUST4
-rw-r--r--tests/url/test3.MUST6
-rw-r--r--tests/url/test4.MUST6
-rw-r--r--tests/url/test5.MUST6
-rw-r--r--tests/url/test6.MUST6
13 files changed, 135 insertions, 28 deletions
diff --git a/src/SimpleURLNormalizer.cpp b/src/SimpleURLNormalizer.cpp
index 7457425..a74d9e4 100644
--- a/src/SimpleURLNormalizer.cpp
+++ b/src/SimpleURLNormalizer.cpp
@@ -1,19 +1,72 @@
+#include <string>
+#include <algorithm>
+
#include "SimpleURLNormalizer.hpp"
+using namespace std;
+
SimpleURLNormalizer::SimpleURLNormalizer( )
{
}
+/**
+ * Parses an absolute URL of the form protocol://host[:port][/path].
+ *
+ * Returns URL::Null when s is empty, when it contains no ':' at all or when
+ * the first ':' is not the start of "://". Without an explicit port the
+ * value of URL::defaultPort( protocol ) is used; an empty path becomes "/".
+ * Fragment and query are not parsed yet (see the TODOs below), so the
+ * fragment handed to the URL is always empty.
+ */
URL SimpleURLNormalizer::parseUrl( const string s )
{
- (void)s;
+    if( s.empty( ) ) {
+        return URL::Null;
+    }
+
+    // protocol
+    string::const_iterator protocolStart = s.begin( );
+    string::const_iterator protocolEnd = find( protocolStart, s.end( ), ':' );
+    if( protocolEnd == s.end( ) ) {
+        // no protocol separator ':', not really legal; return before the
+        // past-the-end iterator would be dereferenced
+        return URL::Null;
+    }
+    // everything from the first ':' onwards has to start with "://"
+    const string rest( protocolEnd, s.end( ) );
+    if( rest.length( ) < 3 || rest.substr( 0, 3 ) != "://" ) {
+        // no protocol, not really legal
+        return URL::Null;
+    }
+    const string protocol( protocolStart, protocolEnd );
+    protocolEnd += 3;
+
+    // host: ends at the first ':' before the path, or at the path itself
+    string::const_iterator hostStart = protocolEnd;
+    string::const_iterator pathStart = find( hostStart, s.end( ), '/' );
+    string::const_iterator hostEnd = find( hostStart, pathStart, ':' );
+    const string host( hostStart, hostEnd );
+
+    // port: only present when the host was terminated by ':'
+    unsigned short port = URL::defaultPort( protocol );
+    if( hostEnd != s.end( ) && *hostEnd == ':' ) {
+        ++hostEnd;
+        const string portStr( hostEnd, pathStart );
+        port = (unsigned short)atoi( portStr.c_str( ) );
+    }
+
+    // path
+    string path;
+    if( pathStart != s.end( ) ) {
+        path = string( pathStart, s.end( ) );
+    } else {
+        // add trailing slash if path is empty
+        path = "/";
+    }
+
+    // TODO: fragment
+    string fragment;
+
+    // TODO: query
+
+    return URL( protocol, host, port, path, fragment );
+}
+
+
/*
* protocol:
- * return m_url.substr( 0, m_url.find( ':' ) );
- * domain:
- * size_t found = m_url.find( "://" );
- return m_url.substr( found+3, m_url.find( '/', found+3 ) - found-3 );
-
+*
down vote
favorite
2
@@ -44,8 +97,6 @@ Protocol-Based Normalization
Only appropriate when the results of accessing the resources are equivalent
For example, example.com/data is directed to example.com/data/ by origin server
*/
- return URL::Null;
-}
URL SimpleURLNormalizer::normalize( const URL url, const string s )
{
diff --git a/src/URL.hpp b/src/URL.hpp
index 4cb7f1b..58e1d0a 100644
--- a/src/URL.hpp
+++ b/src/URL.hpp
@@ -26,6 +26,11 @@ class URL {
{
}
+ URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _fragment ) // builds a URL from already separated components
+ : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_fragment( _fragment ) // each argument is copied into the member of the same name
+ {
+ }
+
URL& operator=( const URL& u ) {
if( this != &u ) {
this->m_protocol = u.m_protocol;
@@ -49,6 +54,7 @@ class URL {
const string domain( ) const // for now simply returns the host, see TODO below
{
+ // TODO: implement using heuristics and top level domain lists
return m_host;
}
@@ -85,6 +91,15 @@ class URL {
m_fragment != other.m_fragment );
}
+ bool operator==( const URL &other ) const // true only when protocol, host, port, path and fragment are all equal
+ {
+ return( m_protocol == other.m_protocol &&
+ m_host == other.m_host &&
+ m_port == other.m_port &&
+ m_path == other.m_path &&
+ m_fragment == other.m_fragment );
+ }
+
bool operator<( const URL &other ) const
{
return( m_protocol < other.m_protocol &&
@@ -97,7 +112,7 @@ class URL {
template< typename CharT, typename TraitsT > friend
basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u );
- unsigned short defaultPort( const std::string p ) const
+ static unsigned short defaultPort( const std::string p )
{
if( p == "http" ) return 80;
else if( p == "https" ) return 443;
@@ -108,11 +123,18 @@ class URL {
template< typename CharT, typename TraitsT > // stream output of a URL as protocol://host[:port]path[#fragment]
inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u ) {
+ if( u.protocol( ).empty( ) ) { // a URL without protocol writes nothing to the stream
+ return s;
+ }
+
s << u.protocol( ) << "://" << u.host( );
- if( u.port( ) != u.defaultPort( u.protocol( ) ) ) {
+ if( u.port( ) != URL::defaultPort( u.protocol( ) ) ) { // the port is written only when it differs from the protocol's default
s << ":" << u.port( );
}
- s << u.path( ) << "#" << u.fragment( );
+ s << u.path( );
+ if( !u.fragment( ).empty( ) ) { // '#' is written only together with a non-empty fragment
+ s << "#" << u.fragment( );
+ }
return s;
}
diff --git a/src/URLNormalizer.hpp b/src/URLNormalizer.hpp
index 87ff945..af1781a 100644
--- a/src/URLNormalizer.hpp
+++ b/src/URLNormalizer.hpp
@@ -1,15 +1,17 @@
#ifndef __URLNORMALIZER_H
#define __URLNORMALIZER_H
+#include <string>
+
#include "URL.hpp"
class URLNormalizer { // abstract interface: every operation below is pure virtual
public:
virtual ~URLNormalizer( ) { }; // virtual so implementations can be deleted through a URLNormalizer pointer
- virtual URL parseUrl( const string s ) = 0;
+ virtual URL parseUrl( const std::string s ) = 0; // builds a URL from the string s
- virtual URL normalize( const URL url, const string s ) = 0;
+ virtual URL normalize( const URL url, const std::string s ) = 0; // NOTE(review): presumably resolves s against the base url - confirm with the implementation
};
#endif
diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile
index 81d7a0e..be50549 100644
--- a/tests/url/GNUmakefile
+++ b/tests/url/GNUmakefile
@@ -27,5 +27,10 @@ local_clean:
local_distclean:
local_test:
- @-./exec_test test1 "output normal URL" http://www.andreasbaumann.cc/index.html
- @-./exec_test test2 "normalize a relative URL" http://www.andreasbaumann.cc/index.html /software.html
+ @-./exec_test test1 test1 "parse illegal protocol" www.andreasbaumann.cc
+ @-./exec_test test1 test2 "parse normal start URL without slash" http://www.andreasbaumann.cc
+ @-./exec_test test1 test3 "parse normal start URL with slash" http://www.andreasbaumann.cc/
+ @-./exec_test test1 test4 "parse normal URL" http://www.andreasbaumann.cc/index.html
+ @-./exec_test test1 test5 "parse normal URL with default port" http://www.andreasbaumann.cc:80/index.html
+ @-./exec_test test1 test6 "parse normal URL with non-standard port" http://www.andreasbaumann.cc:8080/index.html
+ @-./exec_test test2 test100 "normalize a relative URL" http://www.andreasbaumann.cc/index.html /software.html
diff --git a/tests/url/exec_test b/tests/url/exec_test
index 8628c2d..92b656f 100755
--- a/tests/url/exec_test
+++ b/tests/url/exec_test
@@ -2,9 +2,11 @@
BINARY=$1
shift
+ID=$1
+shift
TITLE=$1
shift
-printf "$BINARY: $TITLE .. "
-./$BINARY $* >$BINARY.RES 2>&1
-diff $BINARY.MUST $BINARY.RES > $BINARY.DIFF && printf "OK\n" || printf "ERROR\n"
+# usage: exec_test BINARY ID TITLE [ARGS...]
+# Runs ./BINARY with ARGS, stores stdout and stderr in ID.RES and compares
+# the result with the expected output in ID.MUST (differences go to ID.DIFF).
+
+# pass the texts as arguments, not as format string, so that a '%' or a
+# backslash in a title is printed literally
+printf '%s: %s .. ' "$ID" "$TITLE"
+# "$@" hands every remaining argument to the binary unchanged
+"./$BINARY" "$@" >"$ID.RES" 2>&1
+diff "$ID.MUST" "$ID.RES" >"$ID.DIFF" && printf "OK\n" || printf "ERROR\n"
diff --git a/tests/url/test1.MUST b/tests/url/test1.MUST
index 38a3a27..1b6af48 100644
--- a/tests/url/test1.MUST
+++ b/tests/url/test1.MUST
@@ -1,6 +1 @@
-protocol: http
-port: 80
-domain: www.andreasbaumann.cc
-path: /index.html
-fragment:
-
+Illegal URL!
diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp
index 677bf10..ac6a086 100644
--- a/tests/url/test1.cpp
+++ b/tests/url/test1.cpp
@@ -18,13 +18,19 @@ int main( int argc, char *argv[] )
URLNormalizer *normalizer = new SimpleURLNormalizer( );
URL url = normalizer->parseUrl( urlstring );
delete normalizer;
+
+ if( url == URL::Null ) {
+ cerr << "Illegal URL!" << endl;
+ return 1;
+ }
cout << "protocol: " << url.protocol( ) << endl
<< "port: " << url.port( ) << endl
<< "domain: " << url.domain( ) << endl
<< "path: " << url.path( ) << endl
- << "fragment: " << url.fragment( ) << endl
- << endl;
+ << "fragment: " << url.fragment( ) << endl;
+
+ cout << "URL: " << url << endl;
return 0;
}
diff --git a/tests/url/test100.MUST b/tests/url/test100.MUST
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/url/test100.MUST
diff --git a/tests/url/test2.MUST b/tests/url/test2.MUST
index 4718213..3172868 100644
--- a/tests/url/test2.MUST
+++ b/tests/url/test2.MUST
@@ -1,6 +1,6 @@
protocol: http
port: 80
domain: www.andreasbaumann.cc
-path: /software.html
+path: /
fragment:
-
+URL: http://www.andreasbaumann.cc/
diff --git a/tests/url/test3.MUST b/tests/url/test3.MUST
new file mode 100644
index 0000000..3172868
--- /dev/null
+++ b/tests/url/test3.MUST
@@ -0,0 +1,6 @@
+protocol: http
+port: 80
+domain: www.andreasbaumann.cc
+path: /
+fragment:
+URL: http://www.andreasbaumann.cc/
diff --git a/tests/url/test4.MUST b/tests/url/test4.MUST
new file mode 100644
index 0000000..f7d1220
--- /dev/null
+++ b/tests/url/test4.MUST
@@ -0,0 +1,6 @@
+protocol: http
+port: 80
+domain: www.andreasbaumann.cc
+path: /index.html
+fragment:
+URL: http://www.andreasbaumann.cc/index.html
diff --git a/tests/url/test5.MUST b/tests/url/test5.MUST
new file mode 100644
index 0000000..f7d1220
--- /dev/null
+++ b/tests/url/test5.MUST
@@ -0,0 +1,6 @@
+protocol: http
+port: 80
+domain: www.andreasbaumann.cc
+path: /index.html
+fragment:
+URL: http://www.andreasbaumann.cc/index.html
diff --git a/tests/url/test6.MUST b/tests/url/test6.MUST
new file mode 100644
index 0000000..5ed0d82
--- /dev/null
+++ b/tests/url/test6.MUST
@@ -0,0 +1,6 @@
+protocol: http
+port: 8080
+domain: www.andreasbaumann.cc
+path: /index.html
+fragment:
+URL: http://www.andreasbaumann.cc:8080/index.html