summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 5
-rwxr-xr-x src/Makefile.W32 2
-rwxr-xr-x src/ModuleLoader.hpp 5
-rwxr-xr-x [-rw-r--r--] src/ModuleRegistry.hpp 16
-rwxr-xr-x src/TypeInfo.hpp 25
-rwxr-xr-x src/crawlingwolf.cpp 22
-rw-r--r-- src/modules/Makefile.W32 18
-rwxr-xr-x src/modules/urlnormalizer/Makefile.W32 14
-rwxr-xr-x src/modules/urlnormalizer/simpleurl/Makefile.W32 45
-rwxr-xr-x tests/url/Makefile.W32 16
-rwxr-xr-x [-rw-r--r--] tests/url/test1.cpp 113
11 files changed, 202 insertions, 79 deletions
diff --git a/.gitignore b/.gitignore
index f631773..71fd749 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,11 @@
*.so
*.exe
*.exe.manifest
+*.pdb
+*.ilk
+*.dll
+*.dll.manifest
+*.exp
*~
tests/*/test1
tests/*/test2
diff --git a/src/Makefile.W32 b/src/Makefile.W32
index ddedac3..a3dd050 100755
--- a/src/Makefile.W32
+++ b/src/Makefile.W32
@@ -1,6 +1,6 @@
TOPDIR = ..
-#SUBDIRS = modules
+SUBDIRS = modules
!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp
index e7aae37..0f41548 100755
--- a/src/ModuleLoader.hpp
+++ b/src/ModuleLoader.hpp
@@ -246,5 +246,10 @@ class ModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) > : public BaseModul
}
};
+#ifndef _WIN32
+#define MODULE_EXT ".so"
+#else
+#define MODULE_EXT ".dll"
+#endif
#endif
diff --git a/src/ModuleRegistry.hpp b/src/ModuleRegistry.hpp
index 2b9f441..e217092 100644..100755
--- a/src/ModuleRegistry.hpp
+++ b/src/ModuleRegistry.hpp
@@ -75,10 +75,16 @@ struct ModuleRegistry< Interface, TYPELIST_4( P1, P2, P3, P4 ) > {
#ifdef SHARED
+#ifndef _WIN32
+#define DLLEXPORT
+#else
+#define DLLEXPORT __declspec( dllexport )
+#endif
+
// no param macro
#define DECLARE_MODULE( baseClass ) \
- extern ModuleRegistry<baseClass> registry ## _ ## baseClass;
+ extern "C" DLLEXPORT ModuleRegistry<baseClass> registry ## _ ## baseClass;
#define REGISTER_MODULE( name, baseClass, subClass ) \
static baseClass *create( ) \
@@ -96,7 +102,7 @@ ModuleRegistry<baseClass> registry ## _ ## baseClass( name, &create, &destroy );
// 1 param macro
#define DECLARE_MODULE_1( baseClass, T1 ) \
- extern ModuleRegistry<baseClass, TYPELIST_1( T1 ) > registry ## _ ## baseClass;
+ extern "C" DLLEXPORT ModuleRegistry<baseClass, TYPELIST_1( T1 ) > registry ## _ ## baseClass;
#define REGISTER_MODULE_1( name, baseClass, subClass, T1 ) \
static baseClass *create( T1 t1 ) \
@@ -114,7 +120,7 @@ ModuleRegistry<baseClass, TYPELIST_1( T1 )> registry ## _ ## baseClass( name, &c
// 2 param macro
#define DECLARE_MODULE_2( baseClass, T1, T2 ) \
- extern ModuleRegistry<baseClass, TYPELIST_2( T1, T2 ) > registry ## _ ## baseClass;
+ extern "C" DLLEXPORT ModuleRegistry<baseClass, TYPELIST_2( T1, T2 ) > registry ## _ ## baseClass;
#define REGISTER_MODULE_2( name, baseClass, subClass, T1, T2 ) \
static baseClass *create( T1 t1, T2 t2 ) \
@@ -132,7 +138,7 @@ ModuleRegistry<baseClass, TYPELIST_2( T1, T2 )> registry ## _ ## baseClass( name
// 3 param macro
#define DECLARE_MODULE_3( baseClass, T1, T2, T3 ) \
- extern ModuleRegistry<baseClass, TYPELIST_3( T1, T2, T3 ) > registry ## _ ## baseClass;
+ extern "C" DLLEXPORT ModuleRegistry<baseClass, TYPELIST_3( T1, T2, T3 ) > registry ## _ ## baseClass;
#define REGISTER_MODULE_3( name, baseClass, subClass, T1, T2, T3 ) \
static baseClass *create( T1 t1, T2 t2, T3 t3 ) \
@@ -150,7 +156,7 @@ ModuleRegistry<baseClass, TYPELIST_3( T1, T2, T3 )> registry ## _ ## baseClass(
// 4 param macro
#define DECLARE_MODULE_4( baseClass, T1, T2, T3, T4 ) \
- extern ModuleRegistry<baseClass, TYPELIST_4( T1, T2, T3, T4 ) > registry ## _ ## baseClass;
+ extern "C" DLLEXPORT ModuleRegistry<baseClass, TYPELIST_4( T1, T2, T3, T4 ) > registry ## _ ## baseClass;
#define REGISTER_MODULE_4( name, baseClass, subClass, T1, T2, T3, T4 ) \
static baseClass *create( T1 t1, T2 t2, T3 t3, T4 t4 ) \
diff --git a/src/TypeInfo.hpp b/src/TypeInfo.hpp
index 1e049ca..3ca4b57 100755
--- a/src/TypeInfo.hpp
+++ b/src/TypeInfo.hpp
@@ -42,9 +42,32 @@ std::string demangle( const std::type_info &info )
#ifdef _WIN32
+// Replaces, in place, every non-overlapping 'from' in 's' by 'to' (no-op for empty 'from'); inline because it is defined in a header. TODO: maybe extract into a generic stringutils module
+inline void replaceAll( std::string &s, const std::string &from, const std::string &to )
+{
+ if( from.empty( ) ) {
+ return;
+ }
+
+ size_t pos = 0;
+ while( ( pos = s.find( from, pos ) ) != std::string::npos ) {
+ s.replace( pos, from.length( ), to );
+ pos += to.length( );
+ }
+}
+
std::string demangle( const std::type_info &info )
{
- return info.name( );
+ std::string name = info.name( );
+
+ // MSVC marks metatypes, nice, but gcc doesn't, falling
+ // back as we can't do the same for gcc
+ replaceAll( name, "class ", "" );
+ // TODO: much more to follow, this is currently just enough
+ // for the module registry structure with base class
+ // signature only
+
+ return name;
}
#else
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index d5a7ba9..67d6383 100755
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -71,41 +71,41 @@ int main( void )
LOG( logNOTICE ) << "Loading modules";
vector<string> normalizerModules;
- normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
- normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
+ normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple" MODULE_EXT );
+ normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl" MODULE_EXT );
ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules );
vector<string> filterModules;
- filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" );
- filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" );
+ filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol" MODULE_EXT );
+ filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host" MODULE_EXT );
ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules );
vector<string> filterChainModules;
- filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" );
+ filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain" MODULE_EXT );
ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules );
vector<string> frontierModules;
- frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" );
+ frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory" MODULE_EXT );
ModuleLoader<Frontier> frontiers( frontierModules );
vector<string> fetcherModules;
- fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" );
+ fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch" MODULE_EXT );
ModuleLoader<Fetcher> fetchers( fetcherModules );
vector<string> urlseenModules;
- urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" );
+ urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory" MODULE_EXT );
ModuleLoader<URLSeen> urlSeens( urlseenModules );
vector<string> deduperModules;
- deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" );
+ deduperModules.push_back( "./modules/deduper/null/mod_deduper_null" MODULE_EXT );
ModuleLoader<Deduper> dedupers( deduperModules );
vector<string> processorModules;
- processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
+ processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract" MODULE_EXT );
ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );
vector<string> typeDetectModules;
- typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic.so" );
+ typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic" MODULE_EXT );
ModuleLoader<TypeDetect> typeDetectors( typeDetectModules );
Frontier *frontier = frontiers.create( "memory" );
diff --git a/src/modules/Makefile.W32 b/src/modules/Makefile.W32
new file mode 100644
index 0000000..fe00419
--- /dev/null
+++ b/src/modules/Makefile.W32
@@ -0,0 +1,18 @@
+TOPDIR = ..\..
+
+SUBDIRS = \
+ urlnormalizer
+
+#SUBDIRS = \
+# urlnormalizer urlfilter frontier fetcher urlseen \
+# deduper processor typedetect
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_test:
diff --git a/src/modules/urlnormalizer/Makefile.W32 b/src/modules/urlnormalizer/Makefile.W32
new file mode 100755
index 0000000..e2b0c2d
--- /dev/null
+++ b/src/modules/urlnormalizer/Makefile.W32
@@ -0,0 +1,14 @@
+TOPDIR = ..\..\..
+
+SUBDIRS = simpleurl
+#googleurl
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_test:
diff --git a/src/modules/urlnormalizer/simpleurl/Makefile.W32 b/src/modules/urlnormalizer/simpleurl/Makefile.W32
new file mode 100755
index 0000000..b12c7d9
--- /dev/null
+++ b/src/modules/urlnormalizer/simpleurl/Makefile.W32
@@ -0,0 +1,45 @@
+TOPDIR = ..\..\..\..
+
+SUBDIRS =
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
+
+INCLUDE_CXXFLAGS = \
+ /D_WIN32_WINNT=0x504 /DSHARED
+
+INCLUDE_DIRS = \
+ /I. \
+ /I$(TOPDIR)\src
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)\src\crawlingwolf.lib
+# kernel32.lib advapi32.lib Ws2_32.lib
+
+DYNAMIC_MODULE = \
+ mod_urlnormalizer_simple.dll
+
+STATIC_LIB = \
+ simpleurlnormalizer.lib
+
+CPP_OBJS = \
+ SimpleURLNormalizer.obj
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+$(STATIC_LIB): $(CPP_OBJS)
+ $(LINK) /lib /nologo /out:$@ $(STATIC_LDFLAGS) $**
+
+$(DYNAMIC_MODULE): $(CPP_OBJS)
+ $(LINK) /dll /nologo /out:$@ $(LDFLAGS) $(LIBS) $**
+
+local_all: $(STATIC_LIB) $(DYNAMIC_MODULE)
+
+local_clean:
+ @-erase $(STATIC_LIB) $(DYNAMIC_MODULE) 2>NUL
+ @-erase $(CPP_OBJS) 2>NUL
+
+local_distclean:
+
+local_test:
diff --git a/tests/url/Makefile.W32 b/tests/url/Makefile.W32
index c2d74e7..573f636 100755
--- a/tests/url/Makefile.W32
+++ b/tests/url/Makefile.W32
@@ -5,26 +5,28 @@ SUBDIRS =
!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
INCLUDE_CXXFLAGS = \
- /D_WIN32_WINNT=0x504
+ /D_WIN32_WINNT=0x504 \
+ /DUSE_MODULELOADER
INCLUDE_DIRS = \
/I. \
- /I$(TOPDIR)\src
+ /I$(TOPDIR)\src \
+ /I$(TOPDIR)\src\modules\urlnormalizer\simpleurl
INCLUDE_LDFLAGS = \
INCLUDE_LIBS = \
+ $(TOPDIR)\src\crawlingwolf.lib \
+ $(TOPDIR)\src\modules\urlnormalizer\simpleurl\simpleurlnormalizer.lib
TEST_CPP_BINS = \
- test1.exe \
- test2.exe
+ test1.exe
OBJS =
!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
test1.exe: test1.obj
-test2.exe: test2.obj
local_all:
@@ -34,5 +36,5 @@ local_clean:
local_distclean:
local_test:
- @-exec_test test1 "TypeList and TypeTraits"
- @-exec_test test2 "TypeInfo C++ demangle"
+# @-exec_test test1 "TypeList and TypeTraits"
+# @-exec_test test2 "TypeInfo C++ demangle"
diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp
index b2ee90e..095de24 100644..100755
--- a/tests/url/test1.cpp
+++ b/tests/url/test1.cpp
@@ -4,7 +4,7 @@
#include "ModuleLoader.hpp"
#else
#include "SimpleURLNormalizer.hpp"
-#include "GoogleURLNormalizer.hpp"
+//#include "GoogleURLNormalizer.hpp"
#endif
#include <vector>
@@ -16,71 +16,76 @@ using namespace std;
int main( int argc, char *argv[] )
{
- if( argc < 3 ) {
- cerr << "usage: test1 <method> <action> <baseUrl> [<relativeUrl>]\n" << endl;
- return 1;
- }
-
- char *method = argv[1];
- char *action = argv[2];
- char *baseUrlString = argv[3];
- char *partialUrlString = argv[4];
+ try {
+ if( argc < 4 ) { // <method>, <action> and <baseUrl> are mandatory
+ cerr << "usage: test1 <method> <action> <baseUrl> [<relativeUrl>]\n" << endl;
+ return 1;
+ }
+
+ char *method = argv[1];
+ char *action = argv[2];
+ char *baseUrlString = argv[3];
+ char *partialUrlString = argv[4]; // null pointer when <relativeUrl> is omitted (argv[argc] is null)
#ifdef USE_MODULELOADER
- vector<string> modules;
- modules.push_back( "../../src/modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
- modules.push_back( "../../src/modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
- ModuleLoader<URLNormalizer> urlNormalizers( modules );
+ vector<string> modules;
+ // modules.push_back( "../../src/modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple" MODULE_EXT );
+ modules.push_back( "..\\..\\src\\modules\\urlnormalizer\\simpleurl\\mod_urlnormalizer_simple" MODULE_EXT );
+ // modules.push_back( "../../src/modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl" MODULE_EXT );
+ ModuleLoader<URLNormalizer> urlNormalizers( modules );
- URLNormalizer *normalizer = urlNormalizers.create( method );
+ URLNormalizer *normalizer = urlNormalizers.create( method );
#else
- URLNormalizer *normalizer;
- if( strcmp( method, "simple" ) == 0 ) {
- normalizer = new SimpleURLNormalizer( );
- } else if( strcmp( method, "google" ) == 0 ) {
- normalizer = new GoogleURLNormalizer( );
- } else {
- cerr << "Unknown normalization method '" << method << "'" << endl;
- return 1;
- }
+ URLNormalizer *normalizer;
+ if( strcmp( method, "simple" ) == 0 ) {
+ normalizer = new SimpleURLNormalizer( );
+ // } else if( strcmp( method, "google" ) == 0 ) {
+ // normalizer = new GoogleURLNormalizer( );
+ } else {
+ cerr << "Unknown normalization method '" << method << "'" << endl;
+ return 1;
+ }
#endif
-
-
- URL url;
-
- if( strcmp( action, "parse" ) == 0 ) {
- url = normalizer->parseUrl( baseUrlString );
- } else if( strcmp( action, "normalize" ) == 0 ) {
- URL baseUrl = normalizer->parseUrl( baseUrlString );
- if( baseUrl == URL::Null ) {
- cerr << "Illegal base URL!" << endl;
+
+ URL url;
+
+ if( strcmp( action, "parse" ) == 0 ) {
+ url = normalizer->parseUrl( baseUrlString );
+ } else if( strcmp( action, "normalize" ) == 0 ) {
+ URL baseUrl = normalizer->parseUrl( baseUrlString );
+ if( baseUrl == URL::Null ) {
+ cerr << "Illegal base URL!" << endl;
+ return 1;
+ }
+ url = normalizer->normalize( baseUrl, partialUrlString );
+ } else {
+ cerr << "Unknown action '" << action << "'" << endl;
return 1;
}
- url = normalizer->normalize( baseUrl, partialUrlString );
- } else {
- cerr << "Unknown action '" << action << "'" << endl;
- return 1;
- }
- if( url == URL::Null ) {
- cerr << "Illegal URL!" << endl;
- return 1;
- }
-
- cout << "protocol: " << url.protocol( ) << endl
- << "host: " << url.host( ) << endl
- << "port: " << url.port( ) << endl
- << "path: " << url.path( ) << endl
- << "query: " << url.query( ) << endl
- << "fragment: " << url.fragment( ) << endl;
+ if( url == URL::Null ) {
+ cerr << "Illegal URL!" << endl;
+ return 1;
+ }
+
+ cout << "protocol: " << url.protocol( ) << endl
+ << "host: " << url.host( ) << endl
+ << "port: " << url.port( ) << endl
+ << "path: " << url.path( ) << endl
+ << "query: " << url.query( ) << endl
+ << "fragment: " << url.fragment( ) << endl;
- cout << "URL: " << url << endl;
+ cout << "URL: " << url << endl;
#ifdef USE_MODULELOADER
- urlNormalizers.destroy( normalizer );
+ urlNormalizers.destroy( normalizer );
#else
- delete normalizer;
+ delete normalizer;
#endif
- return 0;
+ return 0;
+ } catch( const exception &e ) {
+ cerr << e.what( ) << endl;
+ return 1;
+ }
}