summary refs log tree commit diff
diff options
context:
space:
mode:
author Andreas Baumann <abaumann@yahoo.com> 2012-08-04 22:48:27 +0200
committer Andreas Baumann <abaumann@yahoo.com> 2012-08-04 22:48:27 +0200
commit d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467 (patch)
tree dd2fbb81cd50314939912432dabacfcaf50a5ff4
parent f48058b91dc4eb326e7e2bd732044ed7b26f70f8 (diff)
download crawler-d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467.tar.gz
crawler-d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467.tar.bz2
unified URL normalization tests
-rw-r--r-- tests/GNUmakefile 2
-rw-r--r-- tests/simpleurl/GNUmakefile 41
-rw-r--r-- tests/url/GNUmakefile 46
-rwxr-xr-x tests/url/exec_test (renamed from tests/simpleurl/exec_test) 0
-rw-r--r-- tests/url/test1.MUST (renamed from tests/simpleurl/test1.MUST) 0
-rw-r--r-- tests/url/test1.cpp (renamed from tests/simpleurl/test1.cpp) 19
-rw-r--r-- tests/url/test100.MUST (renamed from tests/simpleurl/test100.MUST) 0
-rw-r--r-- tests/url/test101.MUST (renamed from tests/simpleurl/test101.MUST) 0
-rw-r--r-- tests/url/test102.MUST (renamed from tests/simpleurl/test102.MUST) 0
-rw-r--r-- tests/url/test103.MUST (renamed from tests/simpleurl/test103.MUST) 0
-rw-r--r-- tests/url/test104.MUST (renamed from tests/simpleurl/test104.MUST) 0
-rw-r--r-- tests/url/test105.MUST (renamed from tests/simpleurl/test105.MUST) 0
-rw-r--r-- tests/url/test2.MUST (renamed from tests/simpleurl/test2.MUST) 0
-rw-r--r-- tests/url/test2.cpp (renamed from tests/simpleurl/test2.cpp) 20
-rw-r--r-- tests/url/test3.MUST (renamed from tests/simpleurl/test3.MUST) 0
-rw-r--r-- tests/url/test4.MUST (renamed from tests/simpleurl/test4.MUST) 0
-rw-r--r-- tests/url/test5.MUST (renamed from tests/simpleurl/test5.MUST) 0
-rw-r--r-- tests/url/test6.MUST (renamed from tests/simpleurl/test6.MUST) 0
18 files changed, 77 insertions, 51 deletions
diff --git a/tests/GNUmakefile b/tests/GNUmakefile
index 8931f49..f582bbb 100644
--- a/tests/GNUmakefile
+++ b/tests/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ..
-SUBDIRS = simpleurl googleurl streamhtmlparser libfetch curl psql sqlite
+SUBDIRS = url streamhtmlparser libfetch curl psql sqlite
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/tests/simpleurl/GNUmakefile b/tests/simpleurl/GNUmakefile
deleted file mode 100644
index f0a28b0..0000000
--- a/tests/simpleurl/GNUmakefile
+++ /dev/null
@@ -1,41 +0,0 @@
-TOPDIR = ../..
-
-SUBDIRS =
-
-INCLUDE_DIRS = \
- -I$(TOPDIR)/src
-
-INCLUDE_LDFLAGS =
-
-INCLUDE_LIBS = \
- $(TOPDIR)/src/libcrawlingwolf.a
-
-TEST_CPP_BINS = \
- test1$(EXE) \
- test2$(EXE)
-
-OBJS =
-
--include $(TOPDIR)/makefiles/gmake/sub.mk
-
-local_all:
-
-local_clean:
- -@rm -f *.db *.db-journal 2>/dev/null
- -@rm -f *.RES *.DIFF
-
-local_distclean:
-
-local_test:
- @-./exec_test test1 test1 "parse illegal protocol" www.andreasbaumann.cc
- @-./exec_test test1 test2 "parse normal start URL without slash" http://www.andreasbaumann.cc
- @-./exec_test test1 test3 "parse normal start URL with slash" http://www.andreasbaumann.cc/
- @-./exec_test test1 test4 "parse normal URL" http://www.andreasbaumann.cc/index.html
- @-./exec_test test1 test5 "parse normal URL with default port" http://www.andreasbaumann.cc:80/index.html
- @-./exec_test test1 test6 "parse normal URL with non-standard port" http://www.andreasbaumann.cc:8080/index.html
- @-./exec_test test2 test100 "normalize a relative URL" http://www.andreasbaumann.cc/index.html /software.html
- @-./exec_test test2 test101 "absolute URL in HTML content" http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html
- @-./exec_test test2 test102 "path normalization, relative path" http://www.andreasbaumann.cc/adir/index.html bdir/page.html
- @-./exec_test test2 test103 "path normalization, absolute path" http://www.andreasbaumann.cc/adir/index.html /bdir/page.html
- @-./exec_test test2 test104 "path normalization, current dir" http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html
- @-./exec_test test2 test105 "path normalization, previous dir" http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html
diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile
new file mode 100644
index 0000000..38645c9
--- /dev/null
+++ b/tests/url/GNUmakefile
@@ -0,0 +1,46 @@
+TOPDIR = ../..
+
+SUBDIRS =
+
+INCLUDE_DIRS = \
+ -I$(TOPDIR)/src
+
+INCLUDE_LDFLAGS =
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a \
+ $(TOPDIR)/googleurl/libgoogleurl.a \
+ -licui18n -licuuc
+
+TEST_CPP_BINS = \
+ test1$(EXE) \
+ test2$(EXE)
+
+OBJS =
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+ -@rm -f *.db *.db-journal 2>/dev/null
+ -@rm -f *.RES *.DIFF
+
+local_distclean:
+
+local_test:
+ @-for METHOD in simple google; do \
+ echo "Using URL normalizer '$$METHOD'.." ; \
+ ./exec_test test1 test1 "parse illegal protocol" $$METHOD www.andreasbaumann.cc ; \
+ ./exec_test test1 test2 "parse normal start URL without slash" $$METHOD http://www.andreasbaumann.cc ; \
+ ./exec_test test1 test3 "parse normal start URL with slash" $$METHOD http://www.andreasbaumann.cc/ ; \
+ ./exec_test test1 test4 "parse normal URL" $$METHOD http://www.andreasbaumann.cc/index.html ; \
+ ./exec_test test1 test5 "parse normal URL with default port" $$METHOD http://www.andreasbaumann.cc:80/index.html ; \
+ ./exec_test test1 test6 "parse normal URL with non-standard port" $$METHOD http://www.andreasbaumann.cc:8080/index.html ; \
+ ./exec_test test2 test100 "normalize a relative URL" $$METHOD http://www.andreasbaumann.cc/index.html /software.html ; \
+ ./exec_test test2 test101 "absolute URL in HTML content" $$METHOD http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html ; \
+ ./exec_test test2 test102 "path normalization, relative path" $$METHOD http://www.andreasbaumann.cc/adir/index.html bdir/page.html ; \
+ ./exec_test test2 test103 "path normalization, absolute path" $$METHOD http://www.andreasbaumann.cc/adir/index.html /bdir/page.html ; \
+ ./exec_test test2 test104 "path normalization, current dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html ; \
+ ./exec_test test2 test105 "path normalization, previous dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html ; \
+ done
diff --git a/tests/simpleurl/exec_test b/tests/url/exec_test
index 92b656f..92b656f 100755
--- a/tests/simpleurl/exec_test
+++ b/tests/url/exec_test
diff --git a/tests/simpleurl/test1.MUST b/tests/url/test1.MUST
index 1b6af48..1b6af48 100644
--- a/tests/simpleurl/test1.MUST
+++ b/tests/url/test1.MUST
diff --git a/tests/simpleurl/test1.cpp b/tests/url/test1.cpp
index 23c7d74..5fd3e90 100644
--- a/tests/simpleurl/test1.cpp
+++ b/tests/url/test1.cpp
@@ -1,21 +1,32 @@
#include "URL.hpp"
#include "SimpleURLNormalizer.hpp"
+#include "GoogleURLNormalizer.hpp"
#include <iostream>
#include <string>
+#include <cstring>
using namespace std;
int main( int argc, char *argv[] )
{
- if( argc != 2 ) {
- cerr << "usage: test1 <url>\n" << endl;
+ if( argc != 3 ) {
+ cerr << "usage: test1 <method> <url>\n" << endl;
return 1;
}
- char *urlstring = argv[1];
+ char *method = argv[1];
+ char *urlstring = argv[2];
- URLNormalizer *normalizer = new SimpleURLNormalizer( );
+ URLNormalizer *normalizer;
+ if( strcmp( method, "simple" ) == 0 ) {
+ normalizer = new SimpleURLNormalizer( );
+ } else if( strcmp( method, "google" ) == 0 ) {
+ normalizer = new GoogleURLNormalizer( );
+ } else {
+ cerr << "illegal method '" << method << "'" << endl;
+ }
+
URL url = normalizer->parseUrl( urlstring );
delete normalizer;
diff --git a/tests/simpleurl/test100.MUST b/tests/url/test100.MUST
index 40fb968..40fb968 100644
--- a/tests/simpleurl/test100.MUST
+++ b/tests/url/test100.MUST
diff --git a/tests/simpleurl/test101.MUST b/tests/url/test101.MUST
index b4c5eca..b4c5eca 100644
--- a/tests/simpleurl/test101.MUST
+++ b/tests/url/test101.MUST
diff --git a/tests/simpleurl/test102.MUST b/tests/url/test102.MUST
index 7482d26..7482d26 100644
--- a/tests/simpleurl/test102.MUST
+++ b/tests/url/test102.MUST
diff --git a/tests/simpleurl/test103.MUST b/tests/url/test103.MUST
index 085a06c..085a06c 100644
--- a/tests/simpleurl/test103.MUST
+++ b/tests/url/test103.MUST
diff --git a/tests/simpleurl/test104.MUST b/tests/url/test104.MUST
index 7482d26..7482d26 100644
--- a/tests/simpleurl/test104.MUST
+++ b/tests/url/test104.MUST
diff --git a/tests/simpleurl/test105.MUST b/tests/url/test105.MUST
index 085a06c..085a06c 100644
--- a/tests/simpleurl/test105.MUST
+++ b/tests/url/test105.MUST
diff --git a/tests/simpleurl/test2.MUST b/tests/url/test2.MUST
index 92158a6..92158a6 100644
--- a/tests/simpleurl/test2.MUST
+++ b/tests/url/test2.MUST
diff --git a/tests/simpleurl/test2.cpp b/tests/url/test2.cpp
index 4b6aa0d..fb660a3 100644
--- a/tests/simpleurl/test2.cpp
+++ b/tests/url/test2.cpp
@@ -1,22 +1,32 @@
#include "URL.hpp"
#include "SimpleURLNormalizer.hpp"
+#include "GoogleURLNormalizer.hpp"
#include <iostream>
#include <string>
+#include <cstring>
using namespace std;
int main( int argc, char *argv[] )
{
- if( argc != 3 ) {
- cerr << "usage: test2 <base url> <partial url>\n" << endl;
+ if( argc != 4 ) {
+ cerr << "usage: test2 <method> <base url> <partial url>\n" << endl;
return 1;
}
- char *baseUrlString = argv[1];
- char *partialUrlString = argv[2];
+ char *method = argv[1];
+ char *baseUrlString = argv[2];
+ char *partialUrlString = argv[3];
- URLNormalizer *normalizer = new SimpleURLNormalizer( );
+ URLNormalizer *normalizer;
+ if( strcmp( method, "simple" ) == 0 ) {
+ normalizer = new SimpleURLNormalizer( );
+ } else if( strcmp( method, "google" ) == 0 ) {
+ normalizer = new GoogleURLNormalizer( );
+ } else {
+ cerr << "illegal method '" << method << "'" << endl;
+ }
URL baseUrl = normalizer->parseUrl( baseUrlString );
diff --git a/tests/simpleurl/test3.MUST b/tests/url/test3.MUST
index 92158a6..92158a6 100644
--- a/tests/simpleurl/test3.MUST
+++ b/tests/url/test3.MUST
diff --git a/tests/simpleurl/test4.MUST b/tests/url/test4.MUST
index 0649e10..0649e10 100644
--- a/tests/simpleurl/test4.MUST
+++ b/tests/url/test4.MUST
diff --git a/tests/simpleurl/test5.MUST b/tests/url/test5.MUST
index 0649e10..0649e10 100644
--- a/tests/simpleurl/test5.MUST
+++ b/tests/url/test5.MUST
diff --git a/tests/simpleurl/test6.MUST b/tests/url/test6.MUST
index de9b556..de9b556 100644
--- a/tests/simpleurl/test6.MUST
+++ b/tests/url/test6.MUST