summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/GoogleURLNormalizer.cpp47
-rw-r--r--src/crawlingwolf.cpp4
-rw-r--r--tests/googleurl/GNUmakefile17
-rwxr-xr-xtests/googleurl/exec_test12
-rw-r--r--tests/googleurl/test1.MUST1
-rw-r--r--tests/googleurl/test100.MUST7
-rw-r--r--tests/googleurl/test101.MUST7
-rw-r--r--tests/googleurl/test102.MUST7
-rw-r--r--tests/googleurl/test103.MUST7
-rw-r--r--tests/googleurl/test104.MUST7
-rw-r--r--tests/googleurl/test105.MUST7
-rw-r--r--tests/googleurl/test2.MUST7
-rw-r--r--tests/googleurl/test2.cpp37
-rw-r--r--tests/googleurl/test3.MUST7
-rw-r--r--tests/googleurl/test4.MUST7
-rw-r--r--tests/googleurl/test5.MUST7
-rw-r--r--tests/googleurl/test6.MUST7
17 files changed, 186 insertions, 9 deletions
diff --git a/src/GoogleURLNormalizer.cpp b/src/GoogleURLNormalizer.cpp
index c49a831..46f1581 100644
--- a/src/GoogleURLNormalizer.cpp
+++ b/src/GoogleURLNormalizer.cpp
@@ -32,10 +32,6 @@ string GoogleURLNormalizer::componentString( const string &s, const Component &c
URL GoogleURLNormalizer::parseUrl( const string s )
{
- if( s.empty( ) ) {
- return URL::Null;
- }
-
string canonical;
canonical.reserve( s.size( ) + 32 );
StdStringCanonOutput output( &canonical );
@@ -66,7 +62,44 @@ URL GoogleURLNormalizer::parseUrl( const string s )
URL GoogleURLNormalizer::normalize( const URL url, const string s )
{
- (void)url;
- (void)s;
- return URL::Null;
+ string urlstr = url.str( );
+ string urlCanonical;
+ urlCanonical.reserve( urlstr.size( ) + 32 );
+ StdStringCanonOutput urlOutput( &urlCanonical );
+ Parsed urlParsed;
+ bool success = Canonicalize(
+ urlstr.data( ), static_cast<int>( urlstr.length( ) ),
+ NULL, &urlOutput, &urlParsed );
+ if( !success ) {
+ return URL::Null;
+ }
+ urlOutput.Complete( );
+
+ string canonical;
+ canonical.reserve( urlstr.size( ) + s.size( ) + 32 );
+ StdStringCanonOutput output( &canonical );
+ Parsed parsed;
+ success = ResolveRelative(
+ urlstr.data( ), static_cast<int>( urlstr.length( ) ), urlParsed,
+ s.data( ), static_cast<int>( s.length( ) ),
+ NULL, &output, &parsed );
+ if( !success ) {
+ return URL::Null;
+ }
+ output.Complete( );
+
+ unsigned short port;
+ if( parsed.port.len >= 0 ) {
+ port = (unsigned short)atoi(
+ componentString( canonical, parsed.port ).c_str( ) );
+ } else {
+ port = URL::defaultPort(
+ componentString( canonical, parsed.scheme ) );
+ }
+
+ return URL( componentString( canonical, parsed.scheme ),
+ componentString( canonical, parsed.host ),
+ port,
+ componentString( canonical, parsed.path ),
+ "", "" );
}
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index 058bba0..080423d 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -7,6 +7,7 @@
#include "HostURLFilter.hpp"
#include "MemoryURLSeen.hpp"
#include "SimpleURLNormalizer.hpp"
+#include "GoogleURLNormalizer.hpp"
#include <set>
@@ -32,7 +33,8 @@ int main( void )
ChainURLFilter filters( &protocolFilter, &hostFilter );
- URLNormalizer *normalizer = new SimpleURLNormalizer( );
+ //URLNormalizer *normalizer = new SimpleURLNormalizer( );
+ URLNormalizer *normalizer = new GoogleURLNormalizer( );
Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen );
diff --git a/tests/googleurl/GNUmakefile b/tests/googleurl/GNUmakefile
index cd7ba6a..3f685d8 100644
--- a/tests/googleurl/GNUmakefile
+++ b/tests/googleurl/GNUmakefile
@@ -15,7 +15,8 @@ INCLUDE_LIBS = \
-licui18n -licuuc
TEST_CPP_BINS = \
- test1$(EXE)
+ test1$(EXE) \
+ test2$(EXE)
OBJS =
@@ -24,7 +25,21 @@ OBJS =
local_all:
local_clean:
+ -@rm -f *.db *.db-journal 2>/dev/null
+ -@rm -f *.RES *.DIFF
local_distclean:
local_test:
+ @-./exec_test test1 test1 "parse illegal protocol" www.andreasbaumann.cc
+ @-./exec_test test1 test2 "parse normal start URL without slash" http://www.andreasbaumann.cc
+ @-./exec_test test1 test3 "parse normal start URL with slash" http://www.andreasbaumann.cc/
+ @-./exec_test test1 test4 "parse normal URL" http://www.andreasbaumann.cc/index.html
+ @-./exec_test test1 test5 "parse normal URL with default port" http://www.andreasbaumann.cc:80/index.html
+ @-./exec_test test1 test6 "parse normal URL with non-standard port" http://www.andreasbaumann.cc:8080/index.html
+ @-./exec_test test2 test100 "normalize a relative URL" http://www.andreasbaumann.cc/index.html /software.html
+ @-./exec_test test2 test101 "absolute URL in HTML content" http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html
+ @-./exec_test test2 test102 "path normalization, relative path" http://www.andreasbaumann.cc/adir/index.html bdir/page.html
+ @-./exec_test test2 test103 "path normalization, absolute path" http://www.andreasbaumann.cc/adir/index.html /bdir/page.html
+ @-./exec_test test2 test104 "path normalization, current dir" http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html
+ @-./exec_test test2 test105 "path normalization, previous dir" http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html
diff --git a/tests/googleurl/exec_test b/tests/googleurl/exec_test
new file mode 100755
index 0000000..92b656f
--- /dev/null
+++ b/tests/googleurl/exec_test
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+BINARY=$1
+shift
+ID=$1
+shift
+TITLE=$1
+shift
+
+printf "$ID: $TITLE .. "
+./$BINARY $* >$ID.RES 2>&1
+diff $ID.MUST $ID.RES > $ID.DIFF && printf "OK\n" || printf "ERROR\n"
diff --git a/tests/googleurl/test1.MUST b/tests/googleurl/test1.MUST
new file mode 100644
index 0000000..1b6af48
--- /dev/null
+++ b/tests/googleurl/test1.MUST
@@ -0,0 +1 @@
+Illegal URL!
diff --git a/tests/googleurl/test100.MUST b/tests/googleurl/test100.MUST
new file mode 100644
index 0000000..40fb968
--- /dev/null
+++ b/tests/googleurl/test100.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /software.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/software.html
diff --git a/tests/googleurl/test101.MUST b/tests/googleurl/test101.MUST
new file mode 100644
index 0000000..b4c5eca
--- /dev/null
+++ b/tests/googleurl/test101.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.yahoo.com
+port: 80
+path: /page.html
+query:
+fragment:
+URL: http://www.yahoo.com/page.html
diff --git a/tests/googleurl/test102.MUST b/tests/googleurl/test102.MUST
new file mode 100644
index 0000000..7482d26
--- /dev/null
+++ b/tests/googleurl/test102.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /adir/bdir/page.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/adir/bdir/page.html
diff --git a/tests/googleurl/test103.MUST b/tests/googleurl/test103.MUST
new file mode 100644
index 0000000..085a06c
--- /dev/null
+++ b/tests/googleurl/test103.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /bdir/page.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/bdir/page.html
diff --git a/tests/googleurl/test104.MUST b/tests/googleurl/test104.MUST
new file mode 100644
index 0000000..7482d26
--- /dev/null
+++ b/tests/googleurl/test104.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /adir/bdir/page.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/adir/bdir/page.html
diff --git a/tests/googleurl/test105.MUST b/tests/googleurl/test105.MUST
new file mode 100644
index 0000000..085a06c
--- /dev/null
+++ b/tests/googleurl/test105.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /bdir/page.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/bdir/page.html
diff --git a/tests/googleurl/test2.MUST b/tests/googleurl/test2.MUST
new file mode 100644
index 0000000..92158a6
--- /dev/null
+++ b/tests/googleurl/test2.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/
diff --git a/tests/googleurl/test2.cpp b/tests/googleurl/test2.cpp
new file mode 100644
index 0000000..8dee689
--- /dev/null
+++ b/tests/googleurl/test2.cpp
@@ -0,0 +1,37 @@
+#include "URL.hpp"
+#include "GoogleURLNormalizer.hpp"
+
+#include <iostream>
+#include <string>
+
+using namespace std;
+
+int main( int argc, char *argv[] )
+{
+ if( argc != 3 ) {
+ cerr << "usage: test2 <base url> <partial url>\n" << endl;
+ return 1;
+ }
+
+ char *baseUrlString = argv[1];
+ char *partialUrlString = argv[2];
+
+ URLNormalizer *normalizer = new GoogleURLNormalizer( );
+
+ URL baseUrl = normalizer->parseUrl( baseUrlString );
+
+ URL url = normalizer->normalize( baseUrl, partialUrlString );
+
+ cout << "protocol: " << url.protocol( ) << endl
+ << "host: " << url.host( ) << endl
+ << "port: " << url.port( ) << endl
+ << "path: " << url.path( ) << endl
+ << "query: " << url.query( ) << endl
+ << "fragment: " << url.fragment( ) << endl;
+
+ cout << "URL: " << url << endl;
+
+ delete normalizer;
+
+ return 0;
+}
diff --git a/tests/googleurl/test3.MUST b/tests/googleurl/test3.MUST
new file mode 100644
index 0000000..92158a6
--- /dev/null
+++ b/tests/googleurl/test3.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/
diff --git a/tests/googleurl/test4.MUST b/tests/googleurl/test4.MUST
new file mode 100644
index 0000000..0649e10
--- /dev/null
+++ b/tests/googleurl/test4.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /index.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/index.html
diff --git a/tests/googleurl/test5.MUST b/tests/googleurl/test5.MUST
new file mode 100644
index 0000000..0649e10
--- /dev/null
+++ b/tests/googleurl/test5.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /index.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/index.html
diff --git a/tests/googleurl/test6.MUST b/tests/googleurl/test6.MUST
new file mode 100644
index 0000000..de9b556
--- /dev/null
+++ b/tests/googleurl/test6.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 8080
+path: /index.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc:8080/index.html