summaryrefslogtreecommitdiff
path: root/tests/url
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-08-04 22:48:27 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-08-04 22:48:27 +0200
commitd78ee0aaa1f4bb4aebfee974d15a3ca65bce2467 (patch)
treedd2fbb81cd50314939912432dabacfcaf50a5ff4 /tests/url
parentf48058b91dc4eb326e7e2bd732044ed7b26f70f8 (diff)
downloadcrawler-d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467.tar.gz
crawler-d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467.tar.bz2
unified URL normalization tests
Diffstat (limited to 'tests/url')
-rw-r--r--tests/url/GNUmakefile46
-rwxr-xr-xtests/url/exec_test12
-rw-r--r--tests/url/test1.MUST1
-rw-r--r--tests/url/test1.cpp48
-rw-r--r--tests/url/test100.MUST7
-rw-r--r--tests/url/test101.MUST7
-rw-r--r--tests/url/test102.MUST7
-rw-r--r--tests/url/test103.MUST7
-rw-r--r--tests/url/test104.MUST7
-rw-r--r--tests/url/test105.MUST7
-rw-r--r--tests/url/test2.MUST7
-rw-r--r--tests/url/test2.cpp47
-rw-r--r--tests/url/test3.MUST7
-rw-r--r--tests/url/test4.MUST7
-rw-r--r--tests/url/test5.MUST7
-rw-r--r--tests/url/test6.MUST7
16 files changed, 231 insertions, 0 deletions
diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile
new file mode 100644
index 0000000..38645c9
--- /dev/null
+++ b/tests/url/GNUmakefile
@@ -0,0 +1,46 @@
+# Makefile for the URL tests: names the test binaries (test1, test2) and defines the local_test run.
+TOPDIR = ../..
+SUBDIRS =
+
+INCLUDE_DIRS = \
+ -I$(TOPDIR)/src
+
+INCLUDE_LDFLAGS =
+# project static libraries plus ICU; NOTE(review): presumably consumed by sub.mk when linking - confirm
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a \
+ $(TOPDIR)/googleurl/libgoogleurl.a \
+ -licui18n -licuuc
+
+TEST_CPP_BINS = \
+ test1$(EXE) \
+ test2$(EXE)
+
+OBJS =
+# '-include' (with the leading dash) does not fail if the file is missing
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+# removes database files and the *.RES / *.DIFF files written by exec_test
+local_clean:
+ -@rm -f *.db *.db-journal 2>/dev/null
+ -@rm -f *.RES *.DIFF
+
+local_distclean:
+# runs each test case once per URL normalizer ('simple', 'google') through exec_test
+local_test:
+ @-for METHOD in simple google; do \
+ echo "Using URL normalizer '$$METHOD'.." ; \
+ ./exec_test test1 test1 "parse illegal protocol" $$METHOD www.andreasbaumann.cc ; \
+ ./exec_test test1 test2 "parse normal start URL without slash" $$METHOD http://www.andreasbaumann.cc ; \
+ ./exec_test test1 test3 "parse normal start URL with slash" $$METHOD http://www.andreasbaumann.cc/ ; \
+ ./exec_test test1 test4 "parse normal URL" $$METHOD http://www.andreasbaumann.cc/index.html ; \
+ ./exec_test test1 test5 "parse normal URL with default port" $$METHOD http://www.andreasbaumann.cc:80/index.html ; \
+ ./exec_test test1 test6 "parse normal URL with non-standard port" $$METHOD http://www.andreasbaumann.cc:8080/index.html ; \
+ ./exec_test test2 test100 "normalize a relative URL" $$METHOD http://www.andreasbaumann.cc/index.html /software.html ; \
+ ./exec_test test2 test101 "absolute URL in HTML content" $$METHOD http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html ; \
+ ./exec_test test2 test102 "path normalization, relative path" $$METHOD http://www.andreasbaumann.cc/adir/index.html bdir/page.html ; \
+ ./exec_test test2 test103 "path normalization, absolute path" $$METHOD http://www.andreasbaumann.cc/adir/index.html /bdir/page.html ; \
+ ./exec_test test2 test104 "path normalization, current dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html ; \
+ ./exec_test test2 test105 "path normalization, previous dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html ; \
+ done
diff --git a/tests/url/exec_test b/tests/url/exec_test
new file mode 100755
index 0000000..92b656f
--- /dev/null
+++ b/tests/url/exec_test
@@ -0,0 +1,12 @@
+#!/bin/sh
+# usage: exec_test <binary> <id> <title> [args...]
+# Runs ./<binary> with the args, stores its output in <id>.RES and diffs that against <id>.MUST.
+BINARY=$1
+ID=$2
+TITLE=$3
+shift 3
+
+# fixed format string, so a '%' or '\' inside the title is printed literally
+printf '%s: %s .. ' "$ID" "$TITLE"
+"./$BINARY" "$@" >"$ID.RES" 2>&1
+diff "$ID.MUST" "$ID.RES" > "$ID.DIFF" && printf "OK\n" || printf "ERROR\n"
diff --git a/tests/url/test1.MUST b/tests/url/test1.MUST
new file mode 100644
index 0000000..1b6af48
--- /dev/null
+++ b/tests/url/test1.MUST
@@ -0,0 +1 @@
+Illegal URL!
diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp
new file mode 100644
index 0000000..5fd3e90
--- /dev/null
+++ b/tests/url/test1.cpp
@@ -0,0 +1,48 @@
+#include "URL.hpp"
+#include "SimpleURLNormalizer.hpp"
+#include "GoogleURLNormalizer.hpp"
+
+#include <iostream>
+#include <string>
+#include <cstring>
+
+using namespace std;
+
+// Parses <url> with the normalizer chosen by <method> ("simple" or "google") and
+// prints its parts; returns 1 on bad usage, unknown method or illegal URL, else 0.
+int main( int argc, char *argv[] )
+{
+	if( argc != 3 ) {
+		cerr << "usage: test1 <method> <url>\n" << endl;
+		return 1;
+	}
+
+	char *method = argv[1];
+	char *urlstring = argv[2];
+
+	URLNormalizer *normalizer;
+	if( strcmp( method, "simple" ) == 0 ) {
+		normalizer = new SimpleURLNormalizer( );
+	} else if( strcmp( method, "google" ) == 0 ) {
+		normalizer = new GoogleURLNormalizer( );
+	} else {
+		cerr << "illegal method '" << method << "'" << endl;
+		return 1; // 'normalizer' is uninitialized here, so it must not be used
+	}
+
+	URL url = normalizer->parseUrl( urlstring );
+	delete normalizer;
+	if( url == URL::Null ) {
+		cerr << "Illegal URL!" << endl;
+		return 1;
+	}
+
+	cout << "protocol: " << url.protocol( ) << endl
+	     << "host: " << url.host( ) << endl
+	     << "port: " << url.port( ) << endl
+	     << "path: " << url.path( ) << endl
+	     << "query: " << url.query( ) << endl
+	     << "fragment: " << url.fragment( ) << endl;
+	cout << "URL: " << url << endl;
+	return 0;
+}
diff --git a/tests/url/test100.MUST b/tests/url/test100.MUST
new file mode 100644
index 0000000..40fb968
--- /dev/null
+++ b/tests/url/test100.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /software.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/software.html
diff --git a/tests/url/test101.MUST b/tests/url/test101.MUST
new file mode 100644
index 0000000..b4c5eca
--- /dev/null
+++ b/tests/url/test101.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.yahoo.com
+port: 80
+path: /page.html
+query:
+fragment:
+URL: http://www.yahoo.com/page.html
diff --git a/tests/url/test102.MUST b/tests/url/test102.MUST
new file mode 100644
index 0000000..7482d26
--- /dev/null
+++ b/tests/url/test102.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /adir/bdir/page.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/adir/bdir/page.html
diff --git a/tests/url/test103.MUST b/tests/url/test103.MUST
new file mode 100644
index 0000000..085a06c
--- /dev/null
+++ b/tests/url/test103.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /bdir/page.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/bdir/page.html
diff --git a/tests/url/test104.MUST b/tests/url/test104.MUST
new file mode 100644
index 0000000..7482d26
--- /dev/null
+++ b/tests/url/test104.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /adir/bdir/page.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/adir/bdir/page.html
diff --git a/tests/url/test105.MUST b/tests/url/test105.MUST
new file mode 100644
index 0000000..085a06c
--- /dev/null
+++ b/tests/url/test105.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /bdir/page.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/bdir/page.html
diff --git a/tests/url/test2.MUST b/tests/url/test2.MUST
new file mode 100644
index 0000000..92158a6
--- /dev/null
+++ b/tests/url/test2.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/
diff --git a/tests/url/test2.cpp b/tests/url/test2.cpp
new file mode 100644
index 0000000..fb660a3
--- /dev/null
+++ b/tests/url/test2.cpp
@@ -0,0 +1,47 @@
+#include "URL.hpp"
+#include "SimpleURLNormalizer.hpp"
+#include "GoogleURLNormalizer.hpp"
+
+#include <iostream>
+#include <string>
+#include <cstring>
+
+using namespace std;
+
+// Resolves <partial url> against <base url> with the normalizer chosen by <method>
+// ("simple" or "google") and prints the result; returns 1 on bad usage or method.
+int main( int argc, char *argv[] )
+{
+	if( argc != 4 ) {
+		cerr << "usage: test2 <method> <base url> <partial url>\n" << endl;
+		return 1;
+	}
+
+	char *method = argv[1];
+	char *baseUrlString = argv[2];
+	char *partialUrlString = argv[3];
+
+	URLNormalizer *normalizer;
+	if( strcmp( method, "simple" ) == 0 ) {
+		normalizer = new SimpleURLNormalizer( );
+	} else if( strcmp( method, "google" ) == 0 ) {
+		normalizer = new GoogleURLNormalizer( );
+	} else {
+		cerr << "illegal method '" << method << "'" << endl;
+		return 1; // 'normalizer' is uninitialized here, so it must not be used
+	}
+
+	// NOTE(review): unlike test1, the base URL is not compared against URL::Null before use
+	URL baseUrl = normalizer->parseUrl( baseUrlString );
+	URL url = normalizer->normalize( baseUrl, partialUrlString );
+
+	cout << "protocol: " << url.protocol( ) << endl
+	     << "host: " << url.host( ) << endl
+	     << "port: " << url.port( ) << endl
+	     << "path: " << url.path( ) << endl
+	     << "query: " << url.query( ) << endl
+	     << "fragment: " << url.fragment( ) << endl;
+	cout << "URL: " << url << endl;
+	delete normalizer;
+	return 0;
+}
diff --git a/tests/url/test3.MUST b/tests/url/test3.MUST
new file mode 100644
index 0000000..92158a6
--- /dev/null
+++ b/tests/url/test3.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/
diff --git a/tests/url/test4.MUST b/tests/url/test4.MUST
new file mode 100644
index 0000000..0649e10
--- /dev/null
+++ b/tests/url/test4.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /index.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/index.html
diff --git a/tests/url/test5.MUST b/tests/url/test5.MUST
new file mode 100644
index 0000000..0649e10
--- /dev/null
+++ b/tests/url/test5.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /index.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc/index.html
diff --git a/tests/url/test6.MUST b/tests/url/test6.MUST
new file mode 100644
index 0000000..de9b556
--- /dev/null
+++ b/tests/url/test6.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 8080
+path: /index.html
+query:
+fragment:
+URL: http://www.andreasbaumann.cc:8080/index.html