summary | refs | log | tree | commit | diff
path: root/tests/url
diff options
context:
space:
mode:
author: Andreas Baumann <abaumann@yahoo.com> 2012-08-07 11:16:16 +0200
committer: Andreas Baumann <abaumann@yahoo.com> 2012-08-07 11:16:16 +0200
commit: d61420eab67f2acab8ea6e3b51a4e763a3259569 (patch)
tree: 79c51b14700bf6f08c7cc76f7e5e9bed8aef15f2 /tests/url
parent: 6e2469e60215d7d3d2557ce74370ff32d4442d7c (diff)
download: crawler-d61420eab67f2acab8ea6e3b51a4e763a3259569.tar.gz
crawler-d61420eab67f2acab8ea6e3b51a4e763a3259569.tar.bz2
combined the two url normalizer tests
Diffstat (limited to 'tests/url')
-rw-r--r-- tests/url/GNUmakefile 27
-rw-r--r-- tests/url/test1.cpp 27
-rw-r--r-- tests/url/test2.cpp 45
3 files changed, 35 insertions, 64 deletions
diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile
index 5a3ca08..df37f62 100644
--- a/tests/url/GNUmakefile
+++ b/tests/url/GNUmakefile
@@ -11,8 +11,7 @@ INCLUDE_LIBS = \
$(TOPDIR)/src/libcrawlingwolf.a
TEST_CPP_BINS = \
- test1$(EXE) \
- test2$(EXE)
+ test1$(EXE)
OBJS =
@@ -29,16 +28,16 @@ local_distclean:
local_test:
@-for METHOD in simple google; do \
echo "Using URL normalizer '$$METHOD'.." ; \
- ./exec_test test1 test1 "parse illegal protocol" $$METHOD www.andreasbaumann.cc ; \
- ./exec_test test1 test2 "parse normal start URL without slash" $$METHOD http://www.andreasbaumann.cc ; \
- ./exec_test test1 test3 "parse normal start URL with slash" $$METHOD http://www.andreasbaumann.cc/ ; \
- ./exec_test test1 test4 "parse normal URL" $$METHOD http://www.andreasbaumann.cc/index.html ; \
- ./exec_test test1 test5 "parse normal URL with default port" $$METHOD http://www.andreasbaumann.cc:80/index.html ; \
- ./exec_test test1 test6 "parse normal URL with non-standard port" $$METHOD http://www.andreasbaumann.cc:8080/index.html ; \
- ./exec_test test2 test100 "normalize a relative URL" $$METHOD http://www.andreasbaumann.cc/index.html /software.html ; \
- ./exec_test test2 test101 "absolute URL in HTML content" $$METHOD http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html ; \
- ./exec_test test2 test102 "path normalization, relative path" $$METHOD http://www.andreasbaumann.cc/adir/index.html bdir/page.html ; \
- ./exec_test test2 test103 "path normalization, absolute path" $$METHOD http://www.andreasbaumann.cc/adir/index.html /bdir/page.html ; \
- ./exec_test test2 test104 "path normalization, current dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html ; \
- ./exec_test test2 test105 "path normalization, previous dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html ; \
+ ./exec_test test1 test1 "parse illegal protocol" $$METHOD parse www.andreasbaumann.cc ; \
+ ./exec_test test1 test2 "parse normal start URL without slash" $$METHOD parse http://www.andreasbaumann.cc ; \
+ ./exec_test test1 test3 "parse normal start URL with slash" $$METHOD parse http://www.andreasbaumann.cc/ ; \
+ ./exec_test test1 test4 "parse normal URL" $$METHOD parse http://www.andreasbaumann.cc/index.html ; \
+ ./exec_test test1 test5 "parse normal URL with default port" $$METHOD parse http://www.andreasbaumann.cc:80/index.html ; \
+ ./exec_test test1 test6 "parse normal URL with non-standard port" $$METHOD parse http://www.andreasbaumann.cc:8080/index.html ; \
+ ./exec_test test1 test100 "normalize a relative URL" $$METHOD normalize http://www.andreasbaumann.cc/index.html /software.html ; \
+ ./exec_test test1 test101 "absolute URL in HTML content" $$METHOD normalize http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html ; \
+ ./exec_test test1 test102 "path normalization, relative path" $$METHOD normalize http://www.andreasbaumann.cc/adir/index.html bdir/page.html ; \
+ ./exec_test test1 test103 "path normalization, absolute path" $$METHOD normalize http://www.andreasbaumann.cc/adir/index.html /bdir/page.html ; \
+ ./exec_test test1 test104 "path normalization, current dir" $$METHOD normalize http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html ; \
+ ./exec_test test1 test105 "path normalization, previous dir" $$METHOD normalize http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html ; \
done
diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp
index 732d52e..9c0faa6 100644
--- a/tests/url/test1.cpp
+++ b/tests/url/test1.cpp
@@ -11,13 +11,15 @@ using namespace std;
int main( int argc, char *argv[] )
{
- if( argc != 3 ) {
- cerr << "usage: test1 <method> <url>\n" << endl;
+ if( argc < 3 ) {
+ cerr << "usage: test1 <method> <action> <baseUrl> [<relativeUrl>]\n" << endl;
return 1;
}
char *method = argv[1];
- char *urlstring = argv[2];
+ char *action = argv[2];
+ char *baseUrlString = argv[3];
+ char *partialUrlString = argv[4];
vector<string> modules;
modules.push_back( "../../src/modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
@@ -26,8 +28,21 @@ int main( int argc, char *argv[] )
URLNormalizer *normalizer = urlNormalizers.create( method );
- URL url = normalizer->parseUrl( urlstring );
- urlNormalizers.destroy( normalizer );
+ URL url;
+
+ if( strcmp( action, "parse" ) == 0 ) {
+ url = normalizer->parseUrl( baseUrlString );
+ } else if( strcmp( action, "normalize" ) == 0 ) {
+ URL baseUrl = normalizer->parseUrl( baseUrlString );
+ if( baseUrl == URL::Null ) {
+ cerr << "Illegal base URL!" << endl;
+ return 1;
+ }
+ url = normalizer->normalize( baseUrl, partialUrlString );
+ } else {
+ cerr << "Unknown action '" << action << "'" << endl;
+ return 1;
+ }
if( url == URL::Null ) {
cerr << "Illegal URL!" << endl;
@@ -42,6 +57,8 @@ int main( int argc, char *argv[] )
<< "fragment: " << url.fragment( ) << endl;
cout << "URL: " << url << endl;
+
+ urlNormalizers.destroy( normalizer );
return 0;
}
diff --git a/tests/url/test2.cpp b/tests/url/test2.cpp
deleted file mode 100644
index 1d57629..0000000
--- a/tests/url/test2.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-#include "URL.hpp"
-#include "URLNormalizer.hpp"
-#include "ModuleLoader.hpp"
-
-#include <iostream>
-#include <string>
-#include <cstring>
-
-using namespace std;
-
-int main( int argc, char *argv[] )
-{
- if( argc != 4 ) {
- cerr << "usage: test2 <method> <base url> <partial url>\n" << endl;
- return 1;
- }
-
- char *method = argv[1];
- char *baseUrlString = argv[2];
- char *partialUrlString = argv[3];
-
- vector<string> modules;
- modules.push_back( "../../src/modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
- modules.push_back( "../../src/modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
- ModuleLoader<URLNormalizer> urlNormalizers( modules );
-
- URLNormalizer *normalizer = urlNormalizers.create( method );
-
- URL baseUrl = normalizer->parseUrl( baseUrlString );
-
- URL url = normalizer->normalize( baseUrl, partialUrlString );
-
- cout << "protocol: " << url.protocol( ) << endl
- << "host: " << url.host( ) << endl
- << "port: " << url.port( ) << endl
- << "path: " << url.path( ) << endl
- << "query: " << url.query( ) << endl
- << "fragment: " << url.fragment( ) << endl;
-
- cout << "URL: " << url << endl;
-
- urlNormalizers.destroy( normalizer );
-
- return 0;
-}