diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-07 11:16:16 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-07 11:16:16 +0200 |
commit | d61420eab67f2acab8ea6e3b51a4e763a3259569 (patch) | |
tree | 79c51b14700bf6f08c7cc76f7e5e9bed8aef15f2 | |
parent | 6e2469e60215d7d3d2557ce74370ff32d4442d7c (diff) | |
download | crawler-d61420eab67f2acab8ea6e3b51a4e763a3259569.tar.gz crawler-d61420eab67f2acab8ea6e3b51a4e763a3259569.tar.bz2 |
combined the two url normalizer tests
-rw-r--r-- | docs/LINKS | 1 | ||||
-rw-r--r-- | src/ModuleLoader.hpp | 7 | ||||
-rw-r--r-- | src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp | 2 | ||||
-rw-r--r-- | src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.hpp | 2 | ||||
-rw-r--r-- | tests/url/GNUmakefile | 27 | ||||
-rw-r--r-- | tests/url/test1.cpp | 27 | ||||
-rw-r--r-- | tests/url/test2.cpp | 45 |
7 files changed, 45 insertions, 66 deletions
@@ -41,3 +41,4 @@ Loadable modules in C++ http://www.isotton.com/devel/docs/C++-dlopen-mini-HOWTO/C++-dlopen-mini-HOWTO.html http://www.linuxjournal.com/article/3687?page=0,1 http://www.artima.com/cppsource/subscription_problem.html +http://kristiannielsen.livejournal.com/11783.html diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp index 0e3fe3d..2c88ed6 100644 --- a/src/ModuleLoader.hpp +++ b/src/ModuleLoader.hpp @@ -53,8 +53,11 @@ class ModuleLoader { ~ModuleLoader<Interface>( ) { - for( typename mapType::const_iterator it = m_modules.begin( ); it != m_modules.end( ); it++ ) { - dlclose( (*it).second.handle ); + for( typename mapType::iterator it = m_modules.begin( ); it != m_modules.end( ); it++ ) { + if( (*it).second.handle ) { + dlclose( (*it).second.handle ); + (*it).second.handle = 0; + } } } diff --git a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp index 414adf1..7e7d1ac 100644 --- a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp +++ b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp @@ -7,10 +7,12 @@ using namespace std; SimpleURLNormalizer::SimpleURLNormalizer( ) { + test = malloc(10); } SimpleURLNormalizer::~SimpleURLNormalizer( ) { + free( test ); } URL SimpleURLNormalizer::parseUrl( const string s ) diff --git a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.hpp b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.hpp index 5bd454c..1b1de60 100644 --- a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.hpp +++ b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.hpp @@ -16,6 +16,8 @@ class SimpleURLNormalizer : public URLNormalizer { private: void normalizePath( std::string &path ); + + void *test; }; DECLARE_MODULE( URLNormalizer ) diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile index 5a3ca08..df37f62 100644 --- a/tests/url/GNUmakefile +++ b/tests/url/GNUmakefile @@ -11,8 +11,7 @@ INCLUDE_LIBS = \ $(TOPDIR)/src/libcrawlingwolf.a TEST_CPP_BINS = \ - test1$(EXE) \ - test2$(EXE) + test1$(EXE) OBJS = @@ -29,16 +28,16 @@ local_distclean: local_test: @-for METHOD in simple google; do \ echo "Using URL normalizer '$$METHOD'.." ; \ - ./exec_test test1 test1 "parse illegal protocol" $$METHOD www.andreasbaumann.cc ; \ - ./exec_test test1 test2 "parse normal start URL without slash" $$METHOD http://www.andreasbaumann.cc ; \ - ./exec_test test1 test3 "parse normal start URL with slash" $$METHOD http://www.andreasbaumann.cc/ ; \ - ./exec_test test1 test4 "parse normal URL" $$METHOD http://www.andreasbaumann.cc/index.html ; \ - ./exec_test test1 test5 "parse normal URL with default port" $$METHOD http://www.andreasbaumann.cc:80/index.html ; \ - ./exec_test test1 test6 "parse normal URL with non-standard port" $$METHOD http://www.andreasbaumann.cc:8080/index.html ; \ - ./exec_test test2 test100 "normalize a relative URL" $$METHOD http://www.andreasbaumann.cc/index.html /software.html ; \ - ./exec_test test2 test101 "absolute URL in HTML content" $$METHOD http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html ; \ - ./exec_test test2 test102 "path normalization, relative path" $$METHOD http://www.andreasbaumann.cc/adir/index.html bdir/page.html ; \ - ./exec_test test2 test103 "path normalization, absolute path" $$METHOD http://www.andreasbaumann.cc/adir/index.html /bdir/page.html ; \ - ./exec_test test2 test104 "path normalization, current dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html ; \ - ./exec_test test2 test105 "path normalization, previous dir" $$METHOD http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html ; \ + ./exec_test test1 test1 "parse illegal protocol" $$METHOD parse www.andreasbaumann.cc ; \ + ./exec_test test1 test2 "parse normal start URL without slash" $$METHOD parse http://www.andreasbaumann.cc ; \ + ./exec_test test1 test3 "parse normal start URL with slash" $$METHOD parse http://www.andreasbaumann.cc/ ; \ + ./exec_test test1 test4 "parse normal URL" $$METHOD parse http://www.andreasbaumann.cc/index.html ; \ + ./exec_test test1 test5 "parse normal URL with default port" $$METHOD parse http://www.andreasbaumann.cc:80/index.html ; \ + ./exec_test test1 test6 "parse normal URL with non-standard port" $$METHOD parse http://www.andreasbaumann.cc:8080/index.html ; \ + ./exec_test test1 test100 "normalize a relative URL" $$METHOD normalize http://www.andreasbaumann.cc/index.html /software.html ; \ + ./exec_test test1 test101 "absolute URL in HTML content" $$METHOD normalize http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html ; \ + ./exec_test test1 test102 "path normalization, relative path" $$METHOD normalize http://www.andreasbaumann.cc/adir/index.html bdir/page.html ; \ + ./exec_test test1 test103 "path normalization, absolute path" $$METHOD normalize http://www.andreasbaumann.cc/adir/index.html /bdir/page.html ; \ + ./exec_test test1 test104 "path normalization, current dir" $$METHOD normalize http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html ; \ + ./exec_test test1 test105 "path normalization, previous dir" $$METHOD normalize http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html ; \ done diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp index 732d52e..9c0faa6 100644 --- a/tests/url/test1.cpp +++ b/tests/url/test1.cpp @@ -11,13 +11,15 @@ using namespace std; int main( int argc, char *argv[] ) { - if( argc != 3 ) { - cerr << "usage: test1 <method> <url>\n" << endl; + if( argc < 3 ) { + cerr << "usage: test1 <method> <action> <baseUrl> [<relativeUrl>]\n" << endl; return 1; } char *method = argv[1]; - char *urlstring = argv[2]; + char *action = argv[2]; + char *baseUrlString = argv[3]; + char *partialUrlString = argv[4]; vector<string> modules; modules.push_back( "../../src/modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" ); @@ -26,8 +28,21 @@ int main( int argc, char *argv[] ) URLNormalizer *normalizer = urlNormalizers.create( method ); - URL url = normalizer->parseUrl( urlstring ); - urlNormalizers.destroy( normalizer ); + URL url; + + if( strcmp( action, "parse" ) == 0 ) { + url = normalizer->parseUrl( baseUrlString ); + } else if( strcmp( action, "normalize" ) == 0 ) { + URL baseUrl = normalizer->parseUrl( baseUrlString ); + if( baseUrl == URL::Null ) { + cerr << "Illegal base URL!" << endl; + return 1; + } + url = normalizer->normalize( baseUrl, partialUrlString ); + } else { + cerr << "Unknown action '" << action << "'" << endl; + return 1; + } if( url == URL::Null ) { cerr << "Illegal URL!" << endl; @@ -42,6 +57,8 @@ int main( int argc, char *argv[] ) << "fragment: " << url.fragment( ) << endl; cout << "URL: " << url << endl; + + urlNormalizers.destroy( normalizer ); return 0; } diff --git a/tests/url/test2.cpp b/tests/url/test2.cpp deleted file mode 100644 index 1d57629..0000000 --- a/tests/url/test2.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "URL.hpp" -#include "URLNormalizer.hpp" -#include "ModuleLoader.hpp" - -#include <iostream> -#include <string> -#include <cstring> - -using namespace std; - -int main( int argc, char *argv[] ) -{ - if( argc != 4 ) { - cerr << "usage: test2 <method> <base url> <partial url>\n" << endl; - return 1; - } - - char *method = argv[1]; - char *baseUrlString = argv[2]; - char *partialUrlString = argv[3]; - - vector<string> modules; - modules.push_back( "../../src/modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" ); - modules.push_back( "../../src/modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" ); - ModuleLoader<URLNormalizer> urlNormalizers( modules ); - - URLNormalizer *normalizer = urlNormalizers.create( method ); - - URL baseUrl = normalizer->parseUrl( baseUrlString ); - - URL url = normalizer->normalize( baseUrl, partialUrlString ); - - cout << "protocol: " << url.protocol( ) << endl - << "host: " << url.host( ) << endl - << "port: " << url.port( ) << endl - << "path: " << url.path( ) << endl - << "query: " << url.query( ) << endl - << "fragment: " << url.fragment( ) << endl; - - cout << "URL: " << url << endl; - - urlNormalizers.destroy( normalizer ); - - return 0; -} |