diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-04 22:48:27 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-04 22:48:27 +0200 |
commit | d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467 (patch) | |
tree | dd2fbb81cd50314939912432dabacfcaf50a5ff4 /tests/url | |
parent | f48058b91dc4eb326e7e2bd732044ed7b26f70f8 (diff) | |
download | crawler-d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467.tar.gz crawler-d78ee0aaa1f4bb4aebfee974d15a3ca65bce2467.tar.bz2 |
unified URL normalization tests
Diffstat (limited to 'tests/url')
-rw-r--r-- | tests/url/GNUmakefile | 46 | ||||
-rwxr-xr-x | tests/url/exec_test | 12 | ||||
-rw-r--r-- | tests/url/test1.MUST | 1 | ||||
-rw-r--r-- | tests/url/test1.cpp | 48 | ||||
-rw-r--r-- | tests/url/test100.MUST | 7 | ||||
-rw-r--r-- | tests/url/test101.MUST | 7 | ||||
-rw-r--r-- | tests/url/test102.MUST | 7 | ||||
-rw-r--r-- | tests/url/test103.MUST | 7 | ||||
-rw-r--r-- | tests/url/test104.MUST | 7 | ||||
-rw-r--r-- | tests/url/test105.MUST | 7 | ||||
-rw-r--r-- | tests/url/test2.MUST | 7 | ||||
-rw-r--r-- | tests/url/test2.cpp | 47 | ||||
-rw-r--r-- | tests/url/test3.MUST | 7 | ||||
-rw-r--r-- | tests/url/test4.MUST | 7 | ||||
-rw-r--r-- | tests/url/test5.MUST | 7 | ||||
-rw-r--r-- | tests/url/test6.MUST | 7 |
16 files changed, 231 insertions, 0 deletions
```diff
diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile
new file mode 100644
index 0000000..38645c9
--- /dev/null
+++ b/tests/url/GNUmakefile
@@ -0,0 +1,46 @@
+TOPDIR = ../..
+
+SUBDIRS =
+
+INCLUDE_DIRS = \
+	-I$(TOPDIR)/src
+
+INCLUDE_LDFLAGS =
+
+INCLUDE_LIBS = \
+	$(TOPDIR)/src/libcrawlingwolf.a \
+	$(TOPDIR)/googleurl/libgoogleurl.a \
+	-licui18n -licuuc
+
+TEST_CPP_BINS = \
+	test1$(EXE) \
+	test2$(EXE)
+
+OBJS =
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+	-@rm -f *.db *.db-journal 2>/dev/null
+	-@rm -f *.RES *.DIFF
+
+local_distclean:
+
+local_test:
+	@-for METHOD in simple google; do \
+		echo "Using URL normalizer '$$METHOD'.." ; \
+		./exec_test test1 test1 "parse illegal protocol" $$METHOD www.andreasbaumann.cc ; \
+		./exec_test test1 test2 "parse normal start URL without slash" $$METHOD http://www.andreasbaumann.cc ; \
+		./exec_test test1 test3 "parse normal start URL with slash" $$METHOD http://www.andreasbaumann.cc/ ; \
+		./exec_test test1 test4 "parse normal URL" $$METHOD http://www.andreasbaumann.cc/index.html ; \
+		./exec_test test1 test5 "parse normal URL with default port" $$METHOD http://www.andreasbaumann.cc:80/index.html ; \
+		./exec_test test1 test6 "parse normal URL with non-standard port" $$METHOD http://www.andreasbaumann.cc:8080/index.html ; \
+		./exec_test test2 test100 "normalize a relative URL" $$METHOD http://www.andreasbaumann.cc/index.html /software.html ; \
+		./exec_test test2 test101 "absolute URL in HTML content" $$METHOD http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html ; \
+		./exec_test test2 test102 "path normalization, relative path" $$METHOD http://www.andreasbaumann.cc/adir/index.html bdir/page.html ; \
+		./exec_test test2 test103 "path normalization, absolute path" $$METHOD http://www.andreasbaumann.cc/adir/index.html /bdir/page.html ; \
+		./exec_test test2 test104 "path normalization, current dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html ; \
+		./exec_test test2 test105 "path normalization, previous dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html ; \
+	done
diff --git a/tests/url/exec_test b/tests/url/exec_test
new file mode 100755
index 0000000..92b656f
--- /dev/null
+++ b/tests/url/exec_test
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+BINARY=$1
+shift
+ID=$1
+shift
+TITLE=$1
+shift
+
+printf "$ID: $TITLE .. "
+./$BINARY $* >$ID.RES 2>&1
+diff $ID.MUST $ID.RES > $ID.DIFF && printf "OK\n" || printf "ERROR\n"
diff --git a/tests/url/test1.MUST b/tests/url/test1.MUST
new file mode 100644
index 0000000..1b6af48
--- /dev/null
+++ b/tests/url/test1.MUST
@@ -0,0 +1 @@
+Illegal URL!
diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp
new file mode 100644
index 0000000..5fd3e90
--- /dev/null
+++ b/tests/url/test1.cpp
@@ -0,0 +1,48 @@
+#include "URL.hpp"
+#include "SimpleURLNormalizer.hpp"
+#include "GoogleURLNormalizer.hpp"
+
+#include <iostream>
+#include <string>
+#include <cstring>
+
+using namespace std;
+
+int main( int argc, char *argv[] )
+{
+	if( argc != 3 ) {
+		cerr << "usage: test1 <method> <url>\n" << endl;
+		return 1;
+	}
+
+	char *method = argv[1];
+	char *urlstring = argv[2];
+
+	URLNormalizer *normalizer;
+	if( strcmp( method, "simple" ) == 0 ) {
+		normalizer = new SimpleURLNormalizer( );
+	} else if( strcmp( method, "google" ) == 0 ) {
+		normalizer = new GoogleURLNormalizer( );
+	} else {
+		cerr << "illegal method '" << method << "'" << endl;
+	}
+
+	URL url = normalizer->parseUrl( urlstring );
+	delete normalizer;
+
+	if( url == URL::Null ) {
+		cerr << "Illegal URL!" << endl;
+		return 1;
+	}
+
+	cout << "protocol: " << url.protocol( ) << endl
+	     << "host: " << url.host( ) << endl
+	     << "port: " << url.port( ) << endl
+	     << "path: " << url.path( ) << endl
+	     << "query: " << url.query( ) << endl
+	     << "fragment: " << url.fragment( ) << endl;
+
+	cout << "URL: " << url << endl;
+
+	return 0;
+}
diff --git a/tests/url/test100.MUST b/tests/url/test100.MUST
new file mode 100644
index 0000000..40fb968
--- /dev/null
+++ b/tests/url/test100.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /software.html
+query: 
+fragment: 
+URL: http://www.andreasbaumann.cc/software.html
diff --git a/tests/url/test101.MUST b/tests/url/test101.MUST
new file mode 100644
index 0000000..b4c5eca
--- /dev/null
+++ b/tests/url/test101.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.yahoo.com
+port: 80
+path: /page.html
+query: 
+fragment: 
+URL: http://www.yahoo.com/page.html
diff --git a/tests/url/test102.MUST b/tests/url/test102.MUST
new file mode 100644
index 0000000..7482d26
--- /dev/null
+++ b/tests/url/test102.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /adir/bdir/page.html
+query: 
+fragment: 
+URL: http://www.andreasbaumann.cc/adir/bdir/page.html
diff --git a/tests/url/test103.MUST b/tests/url/test103.MUST
new file mode 100644
index 0000000..085a06c
--- /dev/null
+++ b/tests/url/test103.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /bdir/page.html
+query: 
+fragment: 
+URL: http://www.andreasbaumann.cc/bdir/page.html
diff --git a/tests/url/test104.MUST b/tests/url/test104.MUST
new file mode 100644
index 0000000..7482d26
--- /dev/null
+++ b/tests/url/test104.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /adir/bdir/page.html
+query: 
+fragment: 
+URL: http://www.andreasbaumann.cc/adir/bdir/page.html
diff --git a/tests/url/test105.MUST b/tests/url/test105.MUST
new file mode 100644
index 0000000..085a06c
--- /dev/null
+++ b/tests/url/test105.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /bdir/page.html
+query: 
+fragment: 
+URL: http://www.andreasbaumann.cc/bdir/page.html
diff --git a/tests/url/test2.MUST b/tests/url/test2.MUST
new file mode 100644
index 0000000..92158a6
--- /dev/null
+++ b/tests/url/test2.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /
+query: 
+fragment: 
+URL: http://www.andreasbaumann.cc/
diff --git a/tests/url/test2.cpp b/tests/url/test2.cpp
new file mode 100644
index 0000000..fb660a3
--- /dev/null
+++ b/tests/url/test2.cpp
@@ -0,0 +1,47 @@
+#include "URL.hpp"
+#include "SimpleURLNormalizer.hpp"
+#include "GoogleURLNormalizer.hpp"
+
+#include <iostream>
+#include <string>
+#include <cstring>
+
+using namespace std;
+
+int main( int argc, char *argv[] )
+{
+	if( argc != 4 ) {
+		cerr << "usage: test2 <method> <base url> <partial url>\n" << endl;
+		return 1;
+	}
+
+	char *method = argv[1];
+	char *baseUrlString = argv[2];
+	char *partialUrlString = argv[3];
+
+	URLNormalizer *normalizer;
+	if( strcmp( method, "simple" ) == 0 ) {
+		normalizer = new SimpleURLNormalizer( );
+	} else if( strcmp( method, "google" ) == 0 ) {
+		normalizer = new GoogleURLNormalizer( );
+	} else {
+		cerr << "illegal method '" << method << "'" << endl;
+	}
+
+	URL baseUrl = normalizer->parseUrl( baseUrlString );
+
+	URL url = normalizer->normalize( baseUrl, partialUrlString );
+
+	cout << "protocol: " << url.protocol( ) << endl
+	     << "host: " << url.host( ) << endl
+	     << "port: " << url.port( ) << endl
+	     << "path: " << url.path( ) << endl
+	     << "query: " << url.query( ) << endl
+	     << "fragment: " << url.fragment( ) << endl;
+
+	cout << "URL: " << url << endl;
+
+	delete normalizer;
+
+	return 0;
+}
diff --git a/tests/url/test3.MUST b/tests/url/test3.MUST
new file mode 100644
index 0000000..92158a6
--- /dev/null
+++ b/tests/url/test3.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /
+query: 
+fragment: 
+URL: http://www.andreasbaumann.cc/
diff --git a/tests/url/test4.MUST b/tests/url/test4.MUST
new file mode 100644
index 0000000..0649e10
--- /dev/null
+++ b/tests/url/test4.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /index.html
+query: 
+fragment: 
+URL: http://www.andreasbaumann.cc/index.html
diff --git a/tests/url/test5.MUST b/tests/url/test5.MUST
new file mode 100644
index 0000000..0649e10
--- /dev/null
+++ b/tests/url/test5.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 80
+path: /index.html
+query: 
+fragment: 
+URL: http://www.andreasbaumann.cc/index.html
diff --git a/tests/url/test6.MUST b/tests/url/test6.MUST
new file mode 100644
index 0000000..de9b556
--- /dev/null
+++ b/tests/url/test6.MUST
@@ -0,0 +1,7 @@
+protocol: http
+host: www.andreasbaumann.cc
+port: 8080
+path: /index.html
+query: 
+fragment: 
+URL: http://www.andreasbaumann.cc:8080/index.html
```