diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-07-29 11:39:56 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-07-29 11:39:56 +0200 |
commit | ae148a31b891c760eb08a6f9a2c279f6d7dd6ec2 (patch) | |
tree | 2319b9055a49554ea98455da1afcf69784216147 | |
parent | ff98cc74fd06ef167eb2404ce93051d92dd75caf (diff) | |
download | crawler-ae148a31b891c760eb08a6f9a2c279f6d7dd6ec2.tar.gz crawler-ae148a31b891c760eb08a6f9a2c279f6d7dd6ec2.tar.bz2 |
started to add simple parseUrl implementation
-rw-r--r-- | src/SimpleURLNormalizer.cpp | 67 | ||||
-rw-r--r-- | src/URL.hpp | 28 | ||||
-rw-r--r-- | src/URLNormalizer.hpp | 6 | ||||
-rw-r--r-- | tests/url/GNUmakefile | 9 | ||||
-rwxr-xr-x | tests/url/exec_test | 8 | ||||
-rw-r--r-- | tests/url/test1.MUST | 7 | ||||
-rw-r--r-- | tests/url/test1.cpp | 10 | ||||
-rw-r--r-- | tests/url/test100.MUST | 0 | ||||
-rw-r--r-- | tests/url/test2.MUST | 4 | ||||
-rw-r--r-- | tests/url/test3.MUST | 6 | ||||
-rw-r--r-- | tests/url/test4.MUST | 6 | ||||
-rw-r--r-- | tests/url/test5.MUST | 6 | ||||
-rw-r--r-- | tests/url/test6.MUST | 6 |
13 files changed, 135 insertions, 28 deletions
diff --git a/src/SimpleURLNormalizer.cpp b/src/SimpleURLNormalizer.cpp index 7457425..a74d9e4 100644 --- a/src/SimpleURLNormalizer.cpp +++ b/src/SimpleURLNormalizer.cpp @@ -1,19 +1,72 @@ +#include <string> +#include <algorithm> + #include "SimpleURLNormalizer.hpp" +using namespace std; + SimpleURLNormalizer::SimpleURLNormalizer( ) { } URL SimpleURLNormalizer::parseUrl( const string s ) { - (void)s; + if( s.empty( ) ) { + return URL::Null; + } + + // protocol + string protocol; + string::const_iterator protocolStart = s.begin( ); + string::const_iterator protocolEnd = find( protocolStart, s.end( ), ':' ); + if( protocolStart == s.end( ) ) { + // no protocol separator ':', not really legal + return URL::Null; + } + protocol = &*protocolEnd; + if( protocol.length( ) < 3 || protocol.substr( 0, 3 ) != "://" ) { + // no protocol, not really legal + return URL::Null; + } + protocol = string( protocolStart, protocolEnd ); + protocolEnd += 3; + + // host + string host; + string::const_iterator hostStart = protocolEnd; + string::const_iterator pathStart = find( hostStart, s.end( ), '/' ); + string::const_iterator hostEnd = find( protocolEnd, pathStart, ':' ); + host = string( hostStart, hostEnd ); + + // port + unsigned short port = URL::defaultPort( protocol ); + if( hostEnd != s.end( ) && *hostEnd == ':' ) { + hostEnd++; + string::const_iterator portEnd = pathStart; + string portStr = string( hostEnd, portEnd ); + port = (unsigned short)atoi( portStr.c_str( ) ); + } + + // path + string path; + if( pathStart != s.end( ) ) { + path = string( pathStart, s.end( ) ); + } else { + // add trailing slash if path is empty + path = "/"; + } + + // TODO: fragment + string fragment; + + // TODO: query + + return URL( protocol, host, port, path, fragment ); +} + /* * protocol: - * return m_url.substr( 0, m_url.find( ':' ) ); - * domain: - * size_t found = m_url.find( "://" ); - return m_url.substr( found+3, m_url.find( '/', found+3 ) - found-3 ); - +* down vote favorite 2 @@ -44,8 +97,6 @@ Protocol-Based Normalization Only appropriate when the results of accessing the resources are equivalent For example, example.com/data is directed to example.com/data/ by origin server */ - return URL::Null; -} URL SimpleURLNormalizer::normalize( const URL url, const string s ) { diff --git a/src/URL.hpp b/src/URL.hpp index 4cb7f1b..58e1d0a 100644 --- a/src/URL.hpp +++ b/src/URL.hpp @@ -26,6 +26,11 @@ class URL { { } + URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _fragment ) + : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_fragment( _fragment ) + { + } + URL& operator=( const URL& u ) { if( this != &u ) { this->m_protocol = u.m_protocol; @@ -49,6 +54,7 @@ class URL { const string domain( ) const { + // TODO: implement using heuristics and top level domain lists return m_host; } @@ -85,6 +91,15 @@ class URL { m_fragment != other.m_fragment ); } + bool operator==( const URL &other ) const + { + return( m_protocol == other.m_protocol && + m_host == other.m_host && + m_port == other.m_port && + m_path == other.m_path && + m_fragment == other.m_fragment ); + } + bool operator<( const URL &other ) const { return( m_protocol < other.m_protocol && @@ -97,7 +112,7 @@ class URL { template< typename CharT, typename TraitsT > friend basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u ); - unsigned short defaultPort( const std::string p ) const + static unsigned short defaultPort( const std::string p ) { if( p == "http" ) return 80; else if( p == "https" ) return 443; @@ -108,11 +123,18 @@ class URL { template< typename CharT, typename TraitsT > inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u ) { + if( u.protocol( ).empty( ) ) { + return s; + } + s << u.protocol( ) << "://" << u.host( ); - if( u.port( ) != u.defaultPort( u.protocol( ) ) ) { + if( u.port( ) != URL::defaultPort( u.protocol( ) ) ) { s << ":" << u.port( ); } - s << u.path( ) << "#" << u.fragment( ); + s << u.path( ); + if( !u.fragment( ).empty( ) ) { + s << "#" << u.fragment( ); + } return s; } diff --git a/src/URLNormalizer.hpp b/src/URLNormalizer.hpp index 87ff945..af1781a 100644 --- a/src/URLNormalizer.hpp +++ b/src/URLNormalizer.hpp @@ -1,15 +1,17 @@ #ifndef __URLNORMALIZER_H #define __URLNORMALIZER_H +#include <string> + #include "URL.hpp" class URLNormalizer { public: virtual ~URLNormalizer( ) { }; - virtual URL parseUrl( const string s ) = 0; + virtual URL parseUrl( const std::string s ) = 0; - virtual URL normalize( const URL url, const string s ) = 0; + virtual URL normalize( const URL url, const std::string s ) = 0; }; #endif diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile index 81d7a0e..be50549 100644 --- a/tests/url/GNUmakefile +++ b/tests/url/GNUmakefile @@ -27,5 +27,10 @@ local_clean: local_distclean: local_test: - @-./exec_test test1 "output normal URL" http://www.andreasbaumann.cc/index.html - @-./exec_test test2 "normalize a relative URL" http://www.andreasbaumann.cc/index.html /software.html + @-./exec_test test1 test1 "parse illegal protocol" www.andreasbaumann.cc + @-./exec_test test1 test2 "parse normal start URL without slash" http://www.andreasbaumann.cc + @-./exec_test test1 test3 "parse normal start URL with slash" http://www.andreasbaumann.cc/ + @-./exec_test test1 test4 "parse normal URL" http://www.andreasbaumann.cc/index.html + @-./exec_test test1 test5 "parse normal URL with default port" http://www.andreasbaumann.cc:80/index.html + @-./exec_test test1 test6 "parse normal URL with non-standard port" http://www.andreasbaumann.cc:8080/index.html + @-./exec_test test2 test100 "normalize a relative URL" http://www.andreasbaumann.cc/index.html /software.html diff --git a/tests/url/exec_test b/tests/url/exec_test index 8628c2d..92b656f 100755 --- a/tests/url/exec_test +++ b/tests/url/exec_test @@ -2,9 +2,11 @@ BINARY=$1 shift +ID=$1 +shift TITLE=$1 shift -printf "$BINARY: $TITLE .. " -./$BINARY $* >$BINARY.RES 2>&1 -diff $BINARY.MUST $BINARY.RES > $BINARY.DIFF && printf "OK\n" || printf "ERROR\n" +printf "$ID: $TITLE .. " +./$BINARY $* >$ID.RES 2>&1 +diff $ID.MUST $ID.RES > $ID.DIFF && printf "OK\n" || printf "ERROR\n" diff --git a/tests/url/test1.MUST b/tests/url/test1.MUST index 38a3a27..1b6af48 100644 --- a/tests/url/test1.MUST +++ b/tests/url/test1.MUST @@ -1,6 +1 @@ -protocol: http -port: 80 -domain: www.andreasbaumann.cc -path: /index.html -fragment: - +Illegal URL! diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp index 677bf10..ac6a086 100644 --- a/tests/url/test1.cpp +++ b/tests/url/test1.cpp @@ -18,13 +18,19 @@ int main( int argc, char *argv[] ) URLNormalizer *normalizer = new SimpleURLNormalizer( ); URL url = normalizer->parseUrl( urlstring ); delete normalizer; + + if( url == URL::Null ) { + cerr << "Illegal URL!" << endl; + return 1; + } cout << "protocol: " << url.protocol( ) << endl << "port: " << url.port( ) << endl << "domain: " << url.domain( ) << endl << "path: " << url.path( ) << endl - << "fragment: " << url.fragment( ) << endl - << endl; + << "fragment: " << url.fragment( ) << endl; + + cout << "URL: " << url << endl; return 0; } diff --git a/tests/url/test100.MUST b/tests/url/test100.MUST new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tests/url/test100.MUST diff --git a/tests/url/test2.MUST b/tests/url/test2.MUST index 4718213..3172868 100644 --- a/tests/url/test2.MUST +++ b/tests/url/test2.MUST @@ -1,6 +1,6 @@ protocol: http port: 80 domain: www.andreasbaumann.cc -path: /software.html +path: / fragment: - +URL: http://www.andreasbaumann.cc/ diff --git a/tests/url/test3.MUST b/tests/url/test3.MUST new file mode 100644 index 0000000..3172868 --- /dev/null +++ b/tests/url/test3.MUST @@ -0,0 +1,6 @@ +protocol: http +port: 80 +domain: www.andreasbaumann.cc +path: / +fragment: +URL: http://www.andreasbaumann.cc/ diff --git a/tests/url/test4.MUST b/tests/url/test4.MUST new file mode 100644 index 0000000..f7d1220 --- /dev/null +++ b/tests/url/test4.MUST @@ -0,0 +1,6 @@ +protocol: http +port: 80 +domain: www.andreasbaumann.cc +path: /index.html +fragment: +URL: http://www.andreasbaumann.cc/index.html diff --git a/tests/url/test5.MUST b/tests/url/test5.MUST new file mode 100644 index 0000000..f7d1220 --- /dev/null +++ b/tests/url/test5.MUST @@ -0,0 +1,6 @@ +protocol: http +port: 80 +domain: www.andreasbaumann.cc +path: /index.html +fragment: +URL: http://www.andreasbaumann.cc/index.html diff --git a/tests/url/test6.MUST b/tests/url/test6.MUST new file mode 100644 index 0000000..5ed0d82 --- /dev/null +++ b/tests/url/test6.MUST @@ -0,0 +1,6 @@ +protocol: http +port: 8080 +domain: www.andreasbaumann.cc +path: /index.html +fragment: +URL: http://www.andreasbaumann.cc:8080/index.html |