diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-10 17:05:24 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-10 17:05:24 +0200 |
commit | 540a8d30ffeac5d6214b2c7bc6354e01d3f403d7 (patch) | |
tree | 199e2a1154e5b2ea2e6354db7324eb7d93070a87 | |
parent | 3763777e7811b7efdae57d747f02f51fe4706fc4 (diff) | |
download | crawler-540a8d30ffeac5d6214b2c7bc6354e01d3f403d7.tar.gz crawler-540a8d30ffeac5d6214b2c7bc6354e01d3f403d7.tar.bz2 |
module loader works on Windows, simple URL normalizer test works
-rw-r--r-- | .gitignore | 5 | ||||
-rwxr-xr-x | src/Makefile.W32 | 2 | ||||
-rwxr-xr-x | src/ModuleLoader.hpp | 5 | ||||
-rwxr-xr-x[-rw-r--r--] | src/ModuleRegistry.hpp | 16 | ||||
-rwxr-xr-x | src/TypeInfo.hpp | 25 | ||||
-rwxr-xr-x | src/crawlingwolf.cpp | 22 | ||||
-rw-r--r-- | src/modules/Makefile.W32 | 18 | ||||
-rwxr-xr-x | src/modules/urlnormalizer/Makefile.W32 | 14 | ||||
-rwxr-xr-x | src/modules/urlnormalizer/simpleurl/Makefile.W32 | 45 | ||||
-rwxr-xr-x | tests/url/Makefile.W32 | 16 | ||||
-rwxr-xr-x[-rw-r--r--] | tests/url/test1.cpp | 113 |
11 files changed, 202 insertions, 79 deletions
@@ -11,6 +11,11 @@ *.so *.exe *.exe.manifest +*.pdb +*.ilk +*.dll +*.dll.manifest +*.exp *~ tests/*/test1 tests/*/test2 diff --git a/src/Makefile.W32 b/src/Makefile.W32 index ddedac3..a3dd050 100755 --- a/src/Makefile.W32 +++ b/src/Makefile.W32 @@ -1,6 +1,6 @@ TOPDIR = .. -#SUBDIRS = modules +SUBDIRS = modules !INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp index e7aae37..0f41548 100755 --- a/src/ModuleLoader.hpp +++ b/src/ModuleLoader.hpp @@ -246,5 +246,10 @@ class ModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) > : public BaseModul } }; +#ifndef _WIN32 +#define MODULE_EXT ".so" +#else +#define MODULE_EXT ".dll" +#endif #endif diff --git a/src/ModuleRegistry.hpp b/src/ModuleRegistry.hpp index 2b9f441..e217092 100644..100755 --- a/src/ModuleRegistry.hpp +++ b/src/ModuleRegistry.hpp @@ -75,10 +75,16 @@ struct ModuleRegistry< Interface, TYPELIST_4( P1, P2, P3, P4 ) > { #ifdef SHARED +#ifndef _WIN32 +#define DLLEXPORT +#else +#define DLLEXPORT __declspec( dllexport ) +#endif + // no param macro #define DECLARE_MODULE( baseClass ) \ - extern ModuleRegistry<baseClass> registry ## _ ## baseClass; + extern "C" DLLEXPORT ModuleRegistry<baseClass> registry ## _ ## baseClass; #define REGISTER_MODULE( name, baseClass, subClass ) \ static baseClass *create( ) \ @@ -96,7 +102,7 @@ ModuleRegistry<baseClass> registry ## _ ## baseClass( name, &create, &destroy ); // 1 param macro #define DECLARE_MODULE_1( baseClass, T1 ) \ - extern ModuleRegistry<baseClass, TYPELIST_1( T1 ) > registry ## _ ## baseClass; + extern "C" DLLEXPORT ModuleRegistry<baseClass, TYPELIST_1( T1 ) > registry ## _ ## baseClass; #define REGISTER_MODULE_1( name, baseClass, subClass, T1 ) \ static baseClass *create( T1 t1 ) \ @@ -114,7 +120,7 @@ ModuleRegistry<baseClass, TYPELIST_1( T1 )> registry ## _ ## baseClass( name, &c // 2 param macro #define DECLARE_MODULE_2( baseClass, T1, T2 ) \ - extern ModuleRegistry<baseClass, TYPELIST_2( T1, T2 ) > registry ## _ ## baseClass; + extern "C" DLLEXPORT ModuleRegistry<baseClass, TYPELIST_2( T1, T2 ) > registry ## _ ## baseClass; #define REGISTER_MODULE_2( name, baseClass, subClass, T1, T2 ) \ static baseClass *create( T1 t1, T2 t2 ) \ @@ -132,7 +138,7 @@ ModuleRegistry<baseClass, TYPELIST_2( T1, T2 )> registry ## _ ## baseClass( name // 3 param macro #define DECLARE_MODULE_3( baseClass, T1, T2, T3 ) \ - extern ModuleRegistry<baseClass, TYPELIST_3( T1, T2, T3 ) > registry ## _ ## baseClass; + extern "C" DLLEXPORT ModuleRegistry<baseClass, TYPELIST_3( T1, T2, T3 ) > registry ## _ ## baseClass; #define REGISTER_MODULE_3( name, baseClass, subClass, T1, T2, T3 ) \ static baseClass *create( T1 t1, T2 t2, T3 t3 ) \ @@ -150,7 +156,7 @@ ModuleRegistry<baseClass, TYPELIST_3( T1, T2, T3 )> registry ## _ ## baseClass( // 4 param macro #define DECLARE_MODULE_4( baseClass, T1, T2, T3, T4 ) \ - extern ModuleRegistry<baseClass, TYPELIST_4( T1, T2, T3, T4 ) > registry ## _ ## baseClass; + extern "C" DLLEXPORT ModuleRegistry<baseClass, TYPELIST_4( T1, T2, T3, T4 ) > registry ## _ ## baseClass; #define REGISTER_MODULE_4( name, baseClass, subClass, T1, T2, T3, T4 ) \ static baseClass *create( T1 t1, T2 t2, T3 t3, T4 t4 ) \ diff --git a/src/TypeInfo.hpp b/src/TypeInfo.hpp index 1e049ca..3ca4b57 100755 --- a/src/TypeInfo.hpp +++ b/src/TypeInfo.hpp @@ -42,9 +42,32 @@ std::string demangle( const std::type_info &info ) #ifdef _WIN32 +// TODO: maybe extract into a generic stringutils module +void replaceAll( std::string &s, const std::string &from, const std::string &to ) +{ + if( from.empty( ) ) { + return; + } + + size_t pos = 0; + while( ( pos = s.find( from, pos ) ) != std::string::npos ) { + s.replace( pos, from.length( ), to ); + pos += to.length( ); + } +} + std::string demangle( const std::type_info &info ) { - return info.name( ); + std::string name = info.name( ); + + // MSVC marks metatypes, nice, but gcc doesn't, falling + // back as we can't do the same for gcc + replaceAll( name, "class ", "" ); + // TODO: much more to follow, this is currently just enough + // for the module registry structure with base class + // signature only + + return name; } #else diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index d5a7ba9..67d6383 100755 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -71,41 +71,41 @@ int main( void ) LOG( logNOTICE ) << "Loading modules"; vector<string> normalizerModules; - normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" ); - normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" ); + normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple" MODULE_EXT ); + normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl" MODULE_EXT ); ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules ); vector<string> filterModules; - filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" ); - filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" ); + filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol" MODULE_EXT ); + filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host" MODULE_EXT ); ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules ); vector<string> filterChainModules; - filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" ); + filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain" MODULE_EXT ); ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules ); vector<string> frontierModules; - frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" ); + frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory" MODULE_EXT ); ModuleLoader<Frontier> frontiers( frontierModules ); vector<string> fetcherModules; - fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" ); + fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch" MODULE_EXT ); ModuleLoader<Fetcher> fetchers( fetcherModules ); vector<string> urlseenModules; - urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" ); + urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory" MODULE_EXT ); ModuleLoader<URLSeen> urlSeens( urlseenModules ); vector<string> deduperModules; - deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" ); + deduperModules.push_back( "./modules/deduper/null/mod_deduper_null" MODULE_EXT ); ModuleLoader<Deduper> dedupers( deduperModules ); vector<string> processorModules; - processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" ); + processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract" MODULE_EXT ); ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules ); vector<string> typeDetectModules; - typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic.so" ); + typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic" MODULE_EXT ); ModuleLoader<TypeDetect> typeDetectors( typeDetectModules ); Frontier *frontier = frontiers.create( "memory" ); diff --git a/src/modules/Makefile.W32 b/src/modules/Makefile.W32 new file mode 100644 index 0000000..fe00419 --- /dev/null +++ b/src/modules/Makefile.W32 @@ -0,0 +1,18 @@ +TOPDIR = ..\.. + +SUBDIRS = \ + urlnormalizer + +#SUBDIRS = \ +# urlnormalizer urlfilter frontier fetcher urlseen \ +# deduper processor typedetect + +!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_test: diff --git a/src/modules/urlnormalizer/Makefile.W32 b/src/modules/urlnormalizer/Makefile.W32 new file mode 100755 index 0000000..e2b0c2d --- /dev/null +++ b/src/modules/urlnormalizer/Makefile.W32 @@ -0,0 +1,14 @@ +TOPDIR = ..\..\.. + +SUBDIRS = simpleurl +#googleurl + +!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_test: diff --git a/src/modules/urlnormalizer/simpleurl/Makefile.W32 b/src/modules/urlnormalizer/simpleurl/Makefile.W32 new file mode 100755 index 0000000..b12c7d9 --- /dev/null +++ b/src/modules/urlnormalizer/simpleurl/Makefile.W32 @@ -0,0 +1,45 @@ +TOPDIR = ..\..\..\.. + +SUBDIRS = + +!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk + +INCLUDE_CXXFLAGS = \ + /D_WIN32_WINNT=0x504 /DSHARED + +INCLUDE_DIRS = \ + /I. \ + /I$(TOPDIR)\src + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)\src\crawlingwolf.lib +# kernel32.lib advapi32.lib Ws2_32.lib + +DYNAMIC_MODULE = \ + mod_urlnormalizer_simple.dll + +STATIC_LIB = \ + simpleurlnormalizer.lib + +CPP_OBJS = \ + SimpleURLNormalizer.obj + +!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk + +$(STATIC_LIB): $(CPP_OBJS) + $(LINK) /lib /nologo /out:$@ $(STATIC_LDFLAGS) $? + +$(DYNAMIC_MODULE): $(CPP_OBJS) + $(LINK) /dll /nologo /out:$@ $(LDFLAGS) $(LIBS) $? + +local_all: $(STATIC_LIB) $(DYNAMIC_MODULE) + +local_clean: + @-erase $(LOCAL_STATIC_LIB) 2>NUL + @-erase $(CPP_OBJS) 2>NUL + +local_distclean: + +local_test: diff --git a/tests/url/Makefile.W32 b/tests/url/Makefile.W32 index c2d74e7..573f636 100755 --- a/tests/url/Makefile.W32 +++ b/tests/url/Makefile.W32 @@ -5,26 +5,28 @@ SUBDIRS = !INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk INCLUDE_CXXFLAGS = \ - /D_WIN32_WINNT=0x504 + /D_WIN32_WINNT=0x504 \ + /DUSE_MODULELOADER INCLUDE_DIRS = \ /I. \ - /I$(TOPDIR)\src + /I$(TOPDIR)\src \ + /I$(TOPDIR)\src\modules\urlnormalizer\simpleurl INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ + $(TOPDIR)\src\crawlingwolf.lib \ + $(TOPDIR)\src\modules\urlnormalizer\simpleurl\simpleurlnormalizer.lib TEST_CPP_BINS = \ - test1.exe \ - test2.exe + test1.exe OBJS = !INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk test1.exe: test1.obj -test2.exe: test2.obj local_all: @@ -34,5 +36,5 @@ local_clean: local_distclean: local_test: - @-exec_test test1 "TypeList and TypeTraits" - @-exec_test test2 "TypeInfo C++ demangle" +# @-exec_test test1 "TypeList and TypeTraits" +# @-exec_test test2 "TypeInfo C++ demangle" diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp index b2ee90e..095de24 100644..100755 --- a/tests/url/test1.cpp +++ b/tests/url/test1.cpp @@ -4,7 +4,7 @@ #include "ModuleLoader.hpp" #else #include "SimpleURLNormalizer.hpp" -#include "GoogleURLNormalizer.hpp" +//#include "GoogleURLNormalizer.hpp" #endif #include <vector> @@ -16,71 +16,76 @@ using namespace std; int main( int argc, char *argv[] ) { - if( argc < 3 ) { - cerr << "usage: test1 <method> <action> <baseUrl> [<relativeUrl>]\n" << endl; - return 1; - } - - char *method = argv[1]; - char *action = argv[2]; - char *baseUrlString = argv[3]; - char *partialUrlString = argv[4]; + try { + if( argc < 3 ) { + cerr << "usage: test1 <method> <action> <baseUrl> [<relativeUrl>]\n" << endl; + return 1; + } + + char *method = argv[1]; + char *action = argv[2]; + char *baseUrlString = argv[3]; + char *partialUrlString = argv[4]; #ifdef USE_MODULELOADER - vector<string> modules; - modules.push_back( "../../src/modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" ); - modules.push_back( "../../src/modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" ); - ModuleLoader<URLNormalizer> urlNormalizers( modules ); + vector<string> modules; + // modules.push_back( "../../src/modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple" MODULE_EXT ); + modules.push_back( "..\\..\\src\\modules\\urlnormalizer\\simpleurl\\mod_urlnormalizer_simple" MODULE_EXT ); + // modules.push_back( "../../src/modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl" MODULE_EXT ); + ModuleLoader<URLNormalizer> urlNormalizers( modules ); - URLNormalizer *normalizer = urlNormalizers.create( method ); + URLNormalizer *normalizer = urlNormalizers.create( method ); #else - URLNormalizer *normalizer; - if( strcmp( method, "simple" ) == 0 ) { - normalizer = new SimpleURLNormalizer( ); - } else if( strcmp( method, "google" ) == 0 ) { - normalizer = new GoogleURLNormalizer( ); - } else { - cerr << "Unknown normalization method '" << method << "'" << endl; - return 1; - } + URLNormalizer *normalizer; + if( strcmp( method, "simple" ) == 0 ) { + normalizer = new SimpleURLNormalizer( ); + // } else if( strcmp( method, "google" ) == 0 ) { + // normalizer = new GoogleURLNormalizer( ); + } else { + cerr << "Unknown normalization method '" << method << "'" << endl; + return 1; + } #endif - - - URL url; - - if( strcmp( action, "parse" ) == 0 ) { - url = normalizer->parseUrl( baseUrlString ); - } else if( strcmp( action, "normalize" ) == 0 ) { - URL baseUrl = normalizer->parseUrl( baseUrlString ); - if( baseUrl == URL::Null ) { - cerr << "Illegal base URL!" << endl; + + URL url; + + if( strcmp( action, "parse" ) == 0 ) { + url = normalizer->parseUrl( baseUrlString ); + } else if( strcmp( action, "normalize" ) == 0 ) { + URL baseUrl = normalizer->parseUrl( baseUrlString ); + if( baseUrl == URL::Null ) { + cerr << "Illegal base URL!" << endl; + return 1; + } + url = normalizer->normalize( baseUrl, partialUrlString ); + } else { + cerr << "Unknown action '" << action << "'" << endl; return 1; } - url = normalizer->normalize( baseUrl, partialUrlString ); - } else { - cerr << "Unknown action '" << action << "'" << endl; - return 1; - } - if( url == URL::Null ) { - cerr << "Illegal URL!" << endl; - return 1; - } - - cout << "protocol: " << url.protocol( ) << endl - << "host: " << url.host( ) << endl - << "port: " << url.port( ) << endl - << "path: " << url.path( ) << endl - << "query: " << url.query( ) << endl - << "fragment: " << url.fragment( ) << endl; + if( url == URL::Null ) { + cerr << "Illegal URL!" << endl; + return 1; + } + + cout << "protocol: " << url.protocol( ) << endl + << "host: " << url.host( ) << endl + << "port: " << url.port( ) << endl + << "path: " << url.path( ) << endl + << "query: " << url.query( ) << endl + << "fragment: " << url.fragment( ) << endl; - cout << "URL: " << url << endl; + cout << "URL: " << url << endl; #ifdef USE_MODULELOADER - urlNormalizers.destroy( normalizer ); + urlNormalizers.destroy( normalizer ); #else - delete normalizer; + delete normalizer; #endif - return 0; + return 0; + } catch( exception &e ) { + cerr << e.what( ) << endl; + return 1; + } } |