diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-04 22:48:27 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-04 22:48:27 +0200 |
commit | d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467 (patch) | |
tree | dd2fbb81cd50314939912432dabacfcaf50a5ff4 | |
parent | f48058b91dc4eb326e7e2bd732044ed7b26f70f8 (diff) | |
download | crawler-d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467.tar.gz crawler-d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467.tar.bz2 |
unified URL normalization tests
-rw-r--r-- | tests/GNUmakefile | 2 | ||||
-rw-r--r-- | tests/simpleurl/GNUmakefile | 41 | ||||
-rw-r--r-- | tests/url/GNUmakefile | 46 | ||||
-rwxr-xr-x | tests/url/exec_test (renamed from tests/simpleurl/exec_test) | 0 | ||||
-rw-r--r-- | tests/url/test1.MUST (renamed from tests/simpleurl/test1.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test1.cpp (renamed from tests/simpleurl/test1.cpp) | 19 | ||||
-rw-r--r-- | tests/url/test100.MUST (renamed from tests/simpleurl/test100.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test101.MUST (renamed from tests/simpleurl/test101.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test102.MUST (renamed from tests/simpleurl/test102.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test103.MUST (renamed from tests/simpleurl/test103.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test104.MUST (renamed from tests/simpleurl/test104.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test105.MUST (renamed from tests/simpleurl/test105.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test2.MUST (renamed from tests/simpleurl/test2.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test2.cpp (renamed from tests/simpleurl/test2.cpp) | 20 | ||||
-rw-r--r-- | tests/url/test3.MUST (renamed from tests/simpleurl/test3.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test4.MUST (renamed from tests/simpleurl/test4.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test5.MUST (renamed from tests/simpleurl/test5.MUST) | 0 | ||||
-rw-r--r-- | tests/url/test6.MUST (renamed from tests/simpleurl/test6.MUST) | 0 |
18 files changed, 77 insertions, 51 deletions
diff --git a/tests/GNUmakefile b/tests/GNUmakefile index 8931f49..f582bbb 100644 --- a/tests/GNUmakefile +++ b/tests/GNUmakefile @@ -1,6 +1,6 @@ TOPDIR = .. -SUBDIRS = simpleurl googleurl streamhtmlparser libfetch curl psql sqlite +SUBDIRS = url streamhtmlparser libfetch curl psql sqlite -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/tests/simpleurl/GNUmakefile b/tests/simpleurl/GNUmakefile deleted file mode 100644 index f0a28b0..0000000 --- a/tests/simpleurl/GNUmakefile +++ /dev/null @@ -1,41 +0,0 @@ -TOPDIR = ../.. - -SUBDIRS = - -INCLUDE_DIRS = \ - -I$(TOPDIR)/src - -INCLUDE_LDFLAGS = - -INCLUDE_LIBS = \ - $(TOPDIR)/src/libcrawlingwolf.a - -TEST_CPP_BINS = \ - test1$(EXE) \ - test2$(EXE) - -OBJS = - --include $(TOPDIR)/makefiles/gmake/sub.mk - -local_all: - -local_clean: - -@rm -f *.db *.db-journal 2>/dev/null - -@rm -f *.RES *.DIFF - -local_distclean: - -local_test: - @-./exec_test test1 test1 "parse illegal protocol" www.andreasbaumann.cc - @-./exec_test test1 test2 "parse normal start URL without slash" http://www.andreasbaumann.cc - @-./exec_test test1 test3 "parse normal start URL with slash" http://www.andreasbaumann.cc/ - @-./exec_test test1 test4 "parse normal URL" http://www.andreasbaumann.cc/index.html - @-./exec_test test1 test5 "parse normal URL with default port" http://www.andreasbaumann.cc:80/index.html - @-./exec_test test1 test6 "parse normal URL with non-standard port" http://www.andreasbaumann.cc:8080/index.html - @-./exec_test test2 test100 "normalize a relative URL" http://www.andreasbaumann.cc/index.html /software.html - @-./exec_test test2 test101 "absolute URL in HTML content" http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html - @-./exec_test test2 test102 "path normalization, relative path" http://www.andreasbaumann.cc/adir/index.html bdir/page.html - @-./exec_test test2 test103 "path normalization, absolute path" http://www.andreasbaumann.cc/adir/index.html /bdir/page.html - @-./exec_test test2 test104 "path normalization, current dir" http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html - @-./exec_test test2 test105 "path normalization, previous dir" http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile new file mode 100644 index 0000000..38645c9 --- /dev/null +++ b/tests/url/GNUmakefile @@ -0,0 +1,46 @@ +TOPDIR = ../.. + +SUBDIRS = + +INCLUDE_DIRS = \ + -I$(TOPDIR)/src + +INCLUDE_LDFLAGS = + +INCLUDE_LIBS = \ + $(TOPDIR)/src/libcrawlingwolf.a \ + $(TOPDIR)/googleurl/libgoogleurl.a \ + -licui18n -licuuc + +TEST_CPP_BINS = \ + test1$(EXE) \ + test2$(EXE) + +OBJS = + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + -@rm -f *.db *.db-journal 2>/dev/null + -@rm -f *.RES *.DIFF + +local_distclean: + +local_test: + @-for METHOD in simple google; do \ + echo "Using URL normalizer '$$METHOD'.." ; \ + ./exec_test test1 test1 "parse illegal protocol" $$METHOD www.andreasbaumann.cc ; \ + ./exec_test test1 test2 "parse normal start URL without slash" $$METHOD http://www.andreasbaumann.cc ; \ + ./exec_test test1 test3 "parse normal start URL with slash" $$METHOD http://www.andreasbaumann.cc/ ; \ + ./exec_test test1 test4 "parse normal URL" $$METHOD http://www.andreasbaumann.cc/index.html ; \ + ./exec_test test1 test5 "parse normal URL with default port" $$METHOD http://www.andreasbaumann.cc:80/index.html ; \ + ./exec_test test1 test6 "parse normal URL with non-standard port" $$METHOD http://www.andreasbaumann.cc:8080/index.html ; \ + ./exec_test test2 test100 "normalize a relative URL" $$METHOD http://www.andreasbaumann.cc/index.html /software.html ; \ + ./exec_test test2 test101 "absolute URL in HTML content" $$METHOD http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html ; \ + ./exec_test test2 test102 "path normalization, relative path" $$METHOD http://www.andreasbaumann.cc/adir/index.html bdir/page.html ; \ + ./exec_test test2 test103 "path normalization, absolute path" $$METHOD http://www.andreasbaumann.cc/adir/index.html /bdir/page.html ; \ + ./exec_test test2 test104 "path normalization, current dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html ; \ + ./exec_test test2 test105 "path normalization, previous dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html ; \ + done diff --git a/tests/simpleurl/exec_test b/tests/url/exec_test index 92b656f..92b656f 100755 --- a/tests/simpleurl/exec_test +++ b/tests/url/exec_test diff --git a/tests/simpleurl/test1.MUST b/tests/url/test1.MUST index 1b6af48..1b6af48 100644 --- a/tests/simpleurl/test1.MUST +++ b/tests/url/test1.MUST diff --git a/tests/simpleurl/test1.cpp b/tests/url/test1.cpp index 23c7d74..5fd3e90 100644 --- a/tests/simpleurl/test1.cpp +++ b/tests/url/test1.cpp @@ -1,21 +1,32 @@ #include "URL.hpp" #include "SimpleURLNormalizer.hpp" +#include "GoogleURLNormalizer.hpp" #include <iostream> #include <string> +#include <cstring> using namespace std; int main( int argc, char *argv[] ) { - if( argc != 2 ) { - cerr << "usage: test1 <url>\n" << endl; + if( argc != 3 ) { + cerr << "usage: test1 <method> <url>\n" << endl; return 1; } - char *urlstring = argv[1]; + char *method = argv[1]; + char *urlstring = argv[2]; - URLNormalizer *normalizer = new SimpleURLNormalizer( ); + URLNormalizer *normalizer; + if( strcmp( method, "simple" ) == 0 ) { + normalizer = new SimpleURLNormalizer( ); + } else if( strcmp( method, "google" ) == 0 ) { + normalizer = new GoogleURLNormalizer( ); + } else { + cerr << "illegal method '" << method << "'" << endl; + } + URL url = normalizer->parseUrl( urlstring ); delete normalizer; diff --git a/tests/simpleurl/test100.MUST b/tests/url/test100.MUST index 40fb968..40fb968 100644 --- a/tests/simpleurl/test100.MUST +++ b/tests/url/test100.MUST diff --git a/tests/simpleurl/test101.MUST b/tests/url/test101.MUST index b4c5eca..b4c5eca 100644 --- a/tests/simpleurl/test101.MUST +++ b/tests/url/test101.MUST diff --git a/tests/simpleurl/test102.MUST b/tests/url/test102.MUST index 7482d26..7482d26 100644 --- a/tests/simpleurl/test102.MUST +++ b/tests/url/test102.MUST diff --git a/tests/simpleurl/test103.MUST b/tests/url/test103.MUST index 085a06c..085a06c 100644 --- a/tests/simpleurl/test103.MUST +++ b/tests/url/test103.MUST diff --git a/tests/simpleurl/test104.MUST b/tests/url/test104.MUST index 7482d26..7482d26 100644 --- a/tests/simpleurl/test104.MUST +++ b/tests/url/test104.MUST diff --git a/tests/simpleurl/test105.MUST b/tests/url/test105.MUST index 085a06c..085a06c 100644 --- a/tests/simpleurl/test105.MUST +++ b/tests/url/test105.MUST diff --git a/tests/simpleurl/test2.MUST b/tests/url/test2.MUST index 92158a6..92158a6 100644 --- a/tests/simpleurl/test2.MUST +++ b/tests/url/test2.MUST diff --git a/tests/simpleurl/test2.cpp b/tests/url/test2.cpp index 4b6aa0d..fb660a3 100644 --- a/tests/simpleurl/test2.cpp +++ b/tests/url/test2.cpp @@ -1,22 +1,32 @@ #include "URL.hpp" #include "SimpleURLNormalizer.hpp" +#include "GoogleURLNormalizer.hpp" #include <iostream> #include <string> +#include <cstring> using namespace std; int main( int argc, char *argv[] ) { - if( argc != 3 ) { - cerr << "usage: test2 <base url> <partial url>\n" << endl; + if( argc != 4 ) { + cerr << "usage: test2 <method> <base url> <partial url>\n" << endl; return 1; } - char *baseUrlString = argv[1]; - char *partialUrlString = argv[2]; + char *method = argv[1]; + char *baseUrlString = argv[2]; + char *partialUrlString = argv[3]; - URLNormalizer *normalizer = new SimpleURLNormalizer( ); + URLNormalizer *normalizer; + if( strcmp( method, "simple" ) == 0 ) { + normalizer = new SimpleURLNormalizer( ); + } else if( strcmp( method, "google" ) == 0 ) { + normalizer = new GoogleURLNormalizer( ); + } else { + cerr << "illegal method '" << method << "'" << endl; + } URL baseUrl = normalizer->parseUrl( baseUrlString ); diff --git a/tests/simpleurl/test3.MUST b/tests/url/test3.MUST index 92158a6..92158a6 100644 --- a/tests/simpleurl/test3.MUST +++ b/tests/url/test3.MUST diff --git a/tests/simpleurl/test4.MUST b/tests/url/test4.MUST index 0649e10..0649e10 100644 --- a/tests/simpleurl/test4.MUST +++ b/tests/url/test4.MUST diff --git a/tests/simpleurl/test5.MUST b/tests/url/test5.MUST index 0649e10..0649e10 100644 --- a/tests/simpleurl/test5.MUST +++ b/tests/url/test5.MUST diff --git a/tests/simpleurl/test6.MUST b/tests/url/test6.MUST index de9b556..de9b556 100644 --- a/tests/simpleurl/test6.MUST +++ b/tests/url/test6.MUST |