diff options
-rw-r--r-- | src/GoogleURLNormalizer.cpp | 47 | ||||
-rw-r--r-- | src/crawlingwolf.cpp | 4 | ||||
-rw-r--r-- | tests/googleurl/GNUmakefile | 17 | ||||
-rwxr-xr-x | tests/googleurl/exec_test | 12 | ||||
-rw-r--r-- | tests/googleurl/test1.MUST | 1 | ||||
-rw-r--r-- | tests/googleurl/test100.MUST | 7 | ||||
-rw-r--r-- | tests/googleurl/test101.MUST | 7 | ||||
-rw-r--r-- | tests/googleurl/test102.MUST | 7 | ||||
-rw-r--r-- | tests/googleurl/test103.MUST | 7 | ||||
-rw-r--r-- | tests/googleurl/test104.MUST | 7 | ||||
-rw-r--r-- | tests/googleurl/test105.MUST | 7 | ||||
-rw-r--r-- | tests/googleurl/test2.MUST | 7 | ||||
-rw-r--r-- | tests/googleurl/test2.cpp | 37 | ||||
-rw-r--r-- | tests/googleurl/test3.MUST | 7 | ||||
-rw-r--r-- | tests/googleurl/test4.MUST | 7 | ||||
-rw-r--r-- | tests/googleurl/test5.MUST | 7 | ||||
-rw-r--r-- | tests/googleurl/test6.MUST | 7 |
17 files changed, 186 insertions, 9 deletions
diff --git a/src/GoogleURLNormalizer.cpp b/src/GoogleURLNormalizer.cpp index c49a831..46f1581 100644 --- a/src/GoogleURLNormalizer.cpp +++ b/src/GoogleURLNormalizer.cpp @@ -32,10 +32,6 @@ string GoogleURLNormalizer::componentString( const string &s, const Component &c URL GoogleURLNormalizer::parseUrl( const string s ) { - if( s.empty( ) ) { - return URL::Null; - } - string canonical; canonical.reserve( s.size( ) + 32 ); StdStringCanonOutput output( &canonical ); @@ -66,7 +62,44 @@ URL GoogleURLNormalizer::parseUrl( const string s ) URL GoogleURLNormalizer::normalize( const URL url, const string s ) { - (void)url; - (void)s; - return URL::Null; + string urlstr = url.str( ); + string urlCanonical; + urlCanonical.reserve( urlstr.size( ) + 32 ); + StdStringCanonOutput urlOutput( &urlCanonical ); + Parsed urlParsed; + bool success = Canonicalize( + urlstr.data( ), static_cast<int>( urlstr.length( ) ), + NULL, &urlOutput, &urlParsed ); + if( !success ) { + return URL::Null; + } + urlOutput.Complete( ); + + string canonical; + canonical.reserve( urlstr.size( ) + s.size( ) + 32 ); + StdStringCanonOutput output( &canonical ); + Parsed parsed; + success = ResolveRelative( + urlstr.data( ), static_cast<int>( urlstr.length( ) ), urlParsed, + s.data( ), static_cast<int>( s.length( ) ), + NULL, &output, &parsed ); + if( !success ) { + return URL::Null; + } + output.Complete( ); + + unsigned short port; + if( parsed.port.len >= 0 ) { + port = (unsigned short)atoi( + componentString( canonical, parsed.port ).c_str( ) ); + } else { + port = URL::defaultPort( + componentString( canonical, parsed.scheme ) ); + } + + return URL( componentString( canonical, parsed.scheme ), + componentString( canonical, parsed.host ), + port, + componentString( canonical, parsed.path ), + "", "" ); } diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index 058bba0..080423d 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -7,6 +7,7 @@ #include "HostURLFilter.hpp" #include "MemoryURLSeen.hpp" #include "SimpleURLNormalizer.hpp" +#include "GoogleURLNormalizer.hpp" #include <set> @@ -32,7 +33,8 @@ int main( void ) ChainURLFilter filters( &protocolFilter, &hostFilter ); - URLNormalizer *normalizer = new SimpleURLNormalizer( ); + //URLNormalizer *normalizer = new SimpleURLNormalizer( ); + URLNormalizer *normalizer = new GoogleURLNormalizer( ); Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen ); diff --git a/tests/googleurl/GNUmakefile b/tests/googleurl/GNUmakefile index cd7ba6a..3f685d8 100644 --- a/tests/googleurl/GNUmakefile +++ b/tests/googleurl/GNUmakefile @@ -15,7 +15,8 @@ INCLUDE_LIBS = \ -licui18n -licuuc TEST_CPP_BINS = \ - test1$(EXE) + test1$(EXE) \ + test2$(EXE) OBJS = @@ -24,7 +25,21 @@ OBJS = local_all: local_clean: + -@rm -f *.db *.db-journal 2>/dev/null + -@rm -f *.RES *.DIFF local_distclean: local_test: + @-./exec_test test1 test1 "parse illegal protocol" www.andreasbaumann.cc + @-./exec_test test1 test2 "parse normal start URL without slash" http://www.andreasbaumann.cc + @-./exec_test test1 test3 "parse normal start URL with slash" http://www.andreasbaumann.cc/ + @-./exec_test test1 test4 "parse normal URL" http://www.andreasbaumann.cc/index.html + @-./exec_test test1 test5 "parse normal URL with default port" http://www.andreasbaumann.cc:80/index.html + @-./exec_test test1 test6 "parse normal URL with non-standard port" http://www.andreasbaumann.cc:8080/index.html + @-./exec_test test2 test100 "normalize a relative URL" http://www.andreasbaumann.cc/index.html /software.html + @-./exec_test test2 test101 "absolute URL in HTML content" http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html + @-./exec_test test2 test102 "path normalization, relative path" http://www.andreasbaumann.cc/adir/index.html bdir/page.html + @-./exec_test test2 test103 "path normalization, absolute path" http://www.andreasbaumann.cc/adir/index.html /bdir/page.html + @-./exec_test test2 test104 "path normalization, current dir" http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html + @-./exec_test test2 test105 "path normalization, previous dir" http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html diff --git a/tests/googleurl/exec_test b/tests/googleurl/exec_test new file mode 100755 index 0000000..92b656f --- /dev/null +++ b/tests/googleurl/exec_test @@ -0,0 +1,12 @@ +#!/bin/sh + +BINARY=$1 +shift +ID=$1 +shift +TITLE=$1 +shift + +printf "$ID: $TITLE .. " +./$BINARY $* >$ID.RES 2>&1 +diff $ID.MUST $ID.RES > $ID.DIFF && printf "OK\n" || printf "ERROR\n" diff --git a/tests/googleurl/test1.MUST b/tests/googleurl/test1.MUST new file mode 100644 index 0000000..1b6af48 --- /dev/null +++ b/tests/googleurl/test1.MUST @@ -0,0 +1 @@ +Illegal URL! diff --git a/tests/googleurl/test100.MUST b/tests/googleurl/test100.MUST new file mode 100644 index 0000000..40fb968 --- /dev/null +++ b/tests/googleurl/test100.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.andreasbaumann.cc +port: 80 +path: /software.html +query: +fragment: +URL: http://www.andreasbaumann.cc/software.html diff --git a/tests/googleurl/test101.MUST b/tests/googleurl/test101.MUST new file mode 100644 index 0000000..b4c5eca --- /dev/null +++ b/tests/googleurl/test101.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.yahoo.com +port: 80 +path: /page.html +query: +fragment: +URL: http://www.yahoo.com/page.html diff --git a/tests/googleurl/test102.MUST b/tests/googleurl/test102.MUST new file mode 100644 index 0000000..7482d26 --- /dev/null +++ b/tests/googleurl/test102.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.andreasbaumann.cc +port: 80 +path: /adir/bdir/page.html +query: +fragment: +URL: http://www.andreasbaumann.cc/adir/bdir/page.html diff --git a/tests/googleurl/test103.MUST b/tests/googleurl/test103.MUST new file mode 100644 index 0000000..085a06c --- /dev/null +++ b/tests/googleurl/test103.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.andreasbaumann.cc +port: 80 +path: /bdir/page.html +query: +fragment: +URL: http://www.andreasbaumann.cc/bdir/page.html diff --git a/tests/googleurl/test104.MUST b/tests/googleurl/test104.MUST new file mode 100644 index 0000000..7482d26 --- /dev/null +++ b/tests/googleurl/test104.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.andreasbaumann.cc +port: 80 +path: /adir/bdir/page.html +query: +fragment: +URL: http://www.andreasbaumann.cc/adir/bdir/page.html diff --git a/tests/googleurl/test105.MUST b/tests/googleurl/test105.MUST new file mode 100644 index 0000000..085a06c --- /dev/null +++ b/tests/googleurl/test105.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.andreasbaumann.cc +port: 80 +path: /bdir/page.html +query: +fragment: +URL: http://www.andreasbaumann.cc/bdir/page.html diff --git a/tests/googleurl/test2.MUST b/tests/googleurl/test2.MUST new file mode 100644 index 0000000..92158a6 --- /dev/null +++ b/tests/googleurl/test2.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.andreasbaumann.cc +port: 80 +path: / +query: +fragment: +URL: http://www.andreasbaumann.cc/ diff --git a/tests/googleurl/test2.cpp b/tests/googleurl/test2.cpp new file mode 100644 index 0000000..8dee689 --- /dev/null +++ b/tests/googleurl/test2.cpp @@ -0,0 +1,37 @@ +#include "URL.hpp" +#include "GoogleURLNormalizer.hpp" + +#include <iostream> +#include <string> + +using namespace std; + +int main( int argc, char *argv[] ) +{ + if( argc != 3 ) { + cerr << "usage: test2 <base url> <partial url>\n" << endl; + return 1; + } + + char *baseUrlString = argv[1]; + char *partialUrlString = argv[2]; + + URLNormalizer *normalizer = new GoogleURLNormalizer( ); + + URL baseUrl = normalizer->parseUrl( baseUrlString ); + + URL url = normalizer->normalize( baseUrl, partialUrlString ); + + cout << "protocol: " << url.protocol( ) << endl + << "host: " << url.host( ) << endl + << "port: " << url.port( ) << endl + << "path: " << url.path( ) << endl + << "query: " << url.query( ) << endl + << "fragment: " << url.fragment( ) << endl; + + cout << "URL: " << url << endl; + + delete normalizer; + + return 0; +} diff --git a/tests/googleurl/test3.MUST b/tests/googleurl/test3.MUST new file mode 100644 index 0000000..92158a6 --- /dev/null +++ b/tests/googleurl/test3.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.andreasbaumann.cc +port: 80 +path: / +query: +fragment: +URL: http://www.andreasbaumann.cc/ diff --git a/tests/googleurl/test4.MUST b/tests/googleurl/test4.MUST new file mode 100644 index 0000000..0649e10 --- /dev/null +++ b/tests/googleurl/test4.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.andreasbaumann.cc +port: 80 +path: /index.html +query: +fragment: +URL: http://www.andreasbaumann.cc/index.html diff --git a/tests/googleurl/test5.MUST b/tests/googleurl/test5.MUST new file mode 100644 index 0000000..0649e10 --- /dev/null +++ b/tests/googleurl/test5.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.andreasbaumann.cc +port: 80 +path: /index.html +query: +fragment: +URL: http://www.andreasbaumann.cc/index.html diff --git a/tests/googleurl/test6.MUST b/tests/googleurl/test6.MUST new file mode 100644 index 0000000..de9b556 --- /dev/null +++ b/tests/googleurl/test6.MUST @@ -0,0 +1,7 @@ +protocol: http +host: www.andreasbaumann.cc +port: 8080 +path: /index.html +query: +fragment: +URL: http://www.andreasbaumann.cc:8080/index.html |