summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-08-07 23:25:56 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-08-07 23:25:56 +0200
commit8a2cab809aaa0b0eb8ec65082344468d1deac1ca (patch)
treebcc304674d1e2b0a970883206a75175e1ebf6430
parent6586912d3a24e3f7a361c65f56f530b2241f5029 (diff)
downloadcrawler-8a2cab809aaa0b0eb8ec65082344468d1deac1ca.tar.gz
crawler-8a2cab809aaa0b0eb8ec65082344468d1deac1ca.tar.bz2
started modularization of URL filters
better registry function for loading the module (base class as signature); started to support variable arguments for registry create/constructor (work in progress); playing with some Alexandrescu idioms :-)
-rw-r--r--docs/LINKS10
-rw-r--r--src/GNUmakefile7
-rw-r--r--src/ModuleLoader.hpp11
-rw-r--r--src/ModuleRegistry.hpp64
-rw-r--r--src/TypeInfo.hpp33
-rw-r--r--src/TypeList.hpp28
-rw-r--r--src/TypeTraits.hpp9
-rw-r--r--src/crawlingwolf.cpp9
-rw-r--r--src/modules/GNUmakefile2
-rw-r--r--src/modules/urlfilter/GNUmakefile18
-rw-r--r--src/modules/urlfilter/chain/ChainURLFilter.cpp (renamed from src/ChainURLFilter.cpp)0
-rw-r--r--src/modules/urlfilter/chain/ChainURLFilter.hpp (renamed from src/ChainURLFilter.hpp)4
-rw-r--r--src/modules/urlfilter/chain/GNUmakefile39
-rw-r--r--src/modules/urlfilter/host/GNUmakefile39
-rw-r--r--src/modules/urlfilter/host/HostURLFilter.cpp (renamed from src/HostURLFilter.cpp)2
-rw-r--r--src/modules/urlfilter/host/HostURLFilter.hpp (renamed from src/HostURLFilter.hpp)4
-rw-r--r--src/modules/urlfilter/protocol/GNUmakefile39
-rw-r--r--src/modules/urlfilter/protocol/ProtocolURLFilter.cpp (renamed from src/ProtocolURLFilter.cpp)2
-rw-r--r--src/modules/urlfilter/protocol/ProtocolURLFilter.hpp (renamed from src/ProtocolURLFilter.hpp)4
-rw-r--r--tests/GNUmakefile2
-rw-r--r--tests/url/GNUmakefile4
-rw-r--r--tests/utils/GNUmakefile25
-rw-r--r--tests/utils/test1.cpp16
23 files changed, 340 insertions, 31 deletions
diff --git a/docs/LINKS b/docs/LINKS
index 273f23d..7a48586 100644
--- a/docs/LINKS
+++ b/docs/LINKS
@@ -42,3 +42,13 @@ http://www.isotton.com/devel/docs/C++-dlopen-mini-HOWTO/C++-dlopen-mini-HOWTO.ht
http://www.linuxjournal.com/article/3687?page=0,1
http://www.artima.com/cppsource/subscription_problem.html
http://kristiannielsen.livejournal.com/11783.html
+
+Meta Programming in C++
+
+Modern C++ Design (Alexandrescu)
+The Loki Template library
+http://www.codeproject.com/Articles/5629/Tiny-Template-Library-implementing-typelist
+http://www.drdobbs.com/cpp/extracting-function-parameter-and-return/240000586?pgno=2
+http://sourceforge.net/projects/toast/: portable type_info.name()
+http://gcc.gnu.org/onlinedocs/libstdc++/manual/ext_demangling.html
+?? name of module or typeid of derived class in module?
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 4948b49..906f3ea 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -31,9 +31,6 @@ LOCAL_STATIC_LIB_OBJS = \
LibFetchFetcher.o \
LibFetchRewindInputStream.o \
HTMLLinkExtractProcessor.o \
- ProtocolURLFilter.o \
- HostURLFilter.o \
- ChainURLFilter.o \
MemoryURLSeen.o
CPP_OBJS = \
@@ -42,8 +39,8 @@ CPP_OBJS = \
LOCAL_STATIC_LIB = \
libcrawlingwolf.a
-CPP_BINS = \
- crawlingwolf$(EXE)
+#CPP_BINS = \
+# crawlingwolf$(EXE)
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp
index 2c88ed6..1a89d38 100644
--- a/src/ModuleLoader.hpp
+++ b/src/ModuleLoader.hpp
@@ -11,6 +11,9 @@
#include "ModuleRegistry.hpp"
+#include "TypeList.hpp"
+#include "TypeInfo.hpp"
+
template< typename Interface >
struct Module {
void *handle;
@@ -41,7 +44,9 @@ class ModuleLoader {
throw std::runtime_error( dlerror( ) );
}
- m.registry = static_cast<ModuleRegistry<Interface> *>( dlsym( m.handle, "registry" ) );
+ std::string registryName = "registry_" + demangle( typeid( Interface ) );
+
+ m.registry = static_cast<ModuleRegistry<Interface> *>( dlsym( m.handle, registryName.c_str( ) ) );
if( !m.registry ) {
dlclose( m.handle );
throw std::runtime_error( "missing module registry" );
@@ -70,7 +75,7 @@ class ModuleLoader {
Interface *obj = (*it).second.registry->create( );
- std::string clazz = typeid( *obj ).name( );
+ std::string clazz = demangle( typeid( *obj ) );
m_modules.insert( std::make_pair( clazz, (*it).second ) );
@@ -79,7 +84,7 @@ class ModuleLoader {
void destroy( Interface *obj )
{
- std::string clazz = typeid( *obj ).name( );
+ std::string clazz = demangle( typeid( *obj ) );
typename mapType::const_iterator it = m_modules.find( clazz );
if( it == m_modules.end( ) ) {
diff --git a/src/ModuleRegistry.hpp b/src/ModuleRegistry.hpp
index 403bb22..3b7e5d9 100644
--- a/src/ModuleRegistry.hpp
+++ b/src/ModuleRegistry.hpp
@@ -3,29 +3,42 @@
#include <string>
+#include "TypeList.hpp"
+
+template< typename Interface, typename CtorParams = NullType >
+struct ModuleRegistry;
+
template< typename Interface >
-struct ModuleRegistry {
+struct ModuleRegistry< Interface > {
std::string name;
Interface *(*create)( );
void (*destroy)( Interface *obj );
- ModuleRegistry( ) { }
-
- ModuleRegistry<Interface>( std::string _name, Interface *(*_create)( ),
+ ModuleRegistry( std::string _name, Interface *(*_create)( ),
void (*_destroy)( Interface *obj ) )
: name( _name ), create( _create ), destroy( _destroy )
{
}
};
+// Registry record for a module whose create callback takes one argument
+// of type P1. It stores the module name plus the two callbacks handed to
+// the constructor, unchanged.
+template< typename Interface, typename P1 >
+struct ModuleRegistry< Interface, TYPELIST_1( P1 ) > {
+	// name passed in at registration
+	std::string name;
+
+	// callback returning an Interface pointer for one P1 argument
+	Interface *(*create)( P1 );
+
+	// callback taking the Interface pointer handed back by the caller
+	void (*destroy)( Interface *obj );
+
+	ModuleRegistry( std::string moduleName,
+		Interface *(*createFunc)( P1 ),
+		void (*destroyFunc)( Interface * ) )
+		: name( moduleName ),
+		  create( createFunc ),
+		  destroy( destroyFunc )
+	{
+	}
+};
+
#ifdef SHARED
+
#define DECLARE_MODULE( baseClass ) \
-extern "C" ModuleRegistry<baseClass> registry;
-#else
-#define DECLARE_MODULE( baseClass )
-#endif
+ extern ModuleRegistry<baseClass> registry ## _ ## baseClass;
-#ifdef SHARED
#define REGISTER_MODULE( name, baseClass, subClass ) \
static baseClass *create( ) \
{ \
@@ -37,9 +50,32 @@ static void destroy( baseClass *obj ) \
delete obj; \
} \
\
-ModuleRegistry<baseClass> registry( name, &create, &destroy );
-#else
+ModuleRegistry<baseClass> registry ## _ ## baseClass( name, &create, &destroy );
+
+#define DECLARE_MODULE_1( baseClass, T1 ) \
+ extern ModuleRegistry<baseClass, TYPELIST_1( T1 ) > registry ## _ ## baseClass;
+
+#define REGISTER_MODULE_1( name, baseClass, subClass, T1 ) \
+static baseClass *create( T1 t ) \
+{ \
+ return new subClass( t ); \
+} \
+ \
+static void destroy( baseClass *obj ) \
+{ \
+ delete obj; \
+} \
+ \
+ModuleRegistry<baseClass, TYPELIST_1( T1 )> registry ## _ ## baseClass( name, &create, &destroy );
+
+#else // SHARED
+
+#define DECLARE_MODULE( baseClass )
+#define DECLARE_MODULE_1( baseClass, T )
+
#define REGISTER_MODULE( name, baseClass, subClass )
-#endif
-
-#endif
+#define REGISTER_MODULE_1( name, baseClass, subClass, T )
+
+#endif // SHARED
+
+#endif // __MODULEINTERFACE_H
diff --git a/src/TypeInfo.hpp b/src/TypeInfo.hpp
new file mode 100644
index 0000000..4133ec3
--- /dev/null
+++ b/src/TypeInfo.hpp
@@ -0,0 +1,33 @@
+#ifndef __TYPEINFO_H
+#define __TYPEINFO_H
+
+#include <cstdlib>
+#include <stdexcept>
+#include <string>
+#include <typeinfo>
+
+#ifdef __GNUG__
+
+#include <cxxabi.h>
+
+// Returns the demangled, human-readable name of a type.
+//
+// info    type descriptor, e.g. the result of a typeid expression
+// returns the demangled name as reported by __cxa_demangle
+// throws  std::runtime_error if __cxa_demangle reports a failure
+//
+// Declared inline: the definition lives in a header, so without it every
+// translation unit including this file would emit its own external copy.
+inline std::string demangle( const std::type_info &info )
+{
+	int status = 0;
+
+	// __cxa_demangle requires its output buffer to come from malloc (it
+	// may grow it with realloc), so a stack array must not be passed in.
+	// Passing NULL makes it allocate a buffer of the right size itself;
+	// the guard frees that buffer on every exit path, exceptions included.
+	struct Guard {
+		char *ptr;
+		~Guard( ) { std::free( ptr ); }
+	} guard = { __cxxabiv1::__cxa_demangle( info.name( ), NULL, NULL, &status ) };
+
+	if( status != 0 || guard.ptr == NULL ) {
+		throw std::runtime_error( "__cxa_demangle failed!" );
+	}
+
+	return std::string( guard.ptr );
+}
+
+#else
+
+#error "C++ demangling not ported!"
+
+#endif
+
+#endif
diff --git a/src/TypeList.hpp b/src/TypeList.hpp
new file mode 100644
index 0000000..bc8c49b
--- /dev/null
+++ b/src/TypeList.hpp
@@ -0,0 +1,28 @@
+#ifndef __TYPELIST_H
+#define __TYPELIST_H
+
+// Empty marker type; the TYPELIST_n macros use it as the tail that ends a list.
+class NullType { };
+
+// One cell of a compile-time list of types: Head names the first type,
+// Tail names the remainder of the list.
+template< class HeadType, class TailType >
+struct TypeList {
+	typedef HeadType Head;
+	typedef TailType Tail;
+};
+
+// Shorthands that spell out a NullType-terminated TypeList of one to four types.
+#define TYPELIST_1( T1 ) \
+	TypeList< T1, NullType >
+#define TYPELIST_2( T1, T2 ) \
+	TypeList< T1, TypeList< T2, NullType > >
+#define TYPELIST_3( T1, T2, T3 ) \
+	TypeList< T1, TypeList< T2, TypeList< T3, NullType > > >
+#define TYPELIST_4( T1, T2, T3, T4 ) \
+	TypeList< T1, TypeList< T2, TypeList< T3, TypeList< T4, NullType > > > >
+
+// Compile-time count of the types in a TypeList: Length< L >::value is 0
+// for NullType and one more than the length of the tail for a TypeList cell.
+template< class List >
+struct Length;
+
+// the terminator contributes nothing
+template< >
+struct Length< NullType > {
+	enum { value = 0 };
+};
+
+// a cell contributes one plus whatever its tail contributes
+template< class First, class Rest >
+struct Length< TypeList< First, Rest > > {
+	enum { value = Length< Rest >::value + 1 };
+};
+
+#endif
diff --git a/src/TypeTraits.hpp b/src/TypeTraits.hpp
new file mode 100644
index 0000000..b01051e
--- /dev/null
+++ b/src/TypeTraits.hpp
@@ -0,0 +1,9 @@
+#ifndef __TYPETRAITS_H
+#define __TYPETRAITS_H
+
+// Placeholder for per-type traits (work in progress): no traits are
+// defined yet. The body is intentionally empty so the header compiles.
+template< typename T >
+class TypeTraits {
+};
+
+#endif
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index 328cc80..e924b16 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -2,9 +2,6 @@
#include "MemoryFrontier.hpp"
#include "MD5Deduper.hpp"
#include "HTMLLinkExtractProcessor.hpp"
-#include "ChainURLFilter.hpp"
-#include "ProtocolURLFilter.hpp"
-#include "HostURLFilter.hpp"
#include "MemoryURLSeen.hpp"
#include "URLNormalizer.hpp"
#include "ModuleLoader.hpp"
@@ -28,6 +25,7 @@ int main( void )
Deduper *deduper = new MD5Deduper( );
URLSeen *urlSeen = new MemoryURLSeen( );
+/*
set<string> protocols;
protocols.insert( "http" );
protocols.insert( "https" );
@@ -38,10 +36,11 @@ int main( void )
HostURLFilter hostFilter( hosts );
ChainURLFilter filters( &protocolFilter, &hostFilter );
-
+*/
URLNormalizer *normalizer = urlNormalizers.create( "google" );
- Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen );
+ //Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen );
+ Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, std::list( ), urlSeen );
LOG( logNOTICE ) << "Crawler started..";
diff --git a/src/modules/GNUmakefile b/src/modules/GNUmakefile
index ddf5ee4..cd45705 100644
--- a/src/modules/GNUmakefile
+++ b/src/modules/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ../..
-SUBDIRS = urlnormalizer
+SUBDIRS = urlnormalizer urlfilter
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/modules/urlfilter/GNUmakefile b/src/modules/urlfilter/GNUmakefile
new file mode 100644
index 0000000..ea5262d
--- /dev/null
+++ b/src/modules/urlfilter/GNUmakefile
@@ -0,0 +1,18 @@
+TOPDIR = ../../..
+
+SUBDIRS = protocol host
+#chain
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
diff --git a/src/ChainURLFilter.cpp b/src/modules/urlfilter/chain/ChainURLFilter.cpp
index a356299..a356299 100644
--- a/src/ChainURLFilter.cpp
+++ b/src/modules/urlfilter/chain/ChainURLFilter.cpp
diff --git a/src/ChainURLFilter.hpp b/src/modules/urlfilter/chain/ChainURLFilter.hpp
index 216af6b..8c6d165 100644
--- a/src/ChainURLFilter.hpp
+++ b/src/modules/urlfilter/chain/ChainURLFilter.hpp
@@ -2,6 +2,7 @@
#define __CHAIN_URLFILTER_H
#include "URLFilter.hpp"
+#include "ModuleRegistry.hpp"
#include <list>
@@ -19,4 +20,7 @@ class ChainURLFilter : public URLFilter
std::list<URLFilter *> m_filters;
};
+DECLARE_MODULE( URLFilter )
+DECLARE_MODULE_1( URLFilter, URLFilter * )
+
#endif
diff --git a/src/modules/urlfilter/chain/GNUmakefile b/src/modules/urlfilter/chain/GNUmakefile
new file mode 100644
index 0000000..5b7c827
--- /dev/null
+++ b/src/modules/urlfilter/chain/GNUmakefile
@@ -0,0 +1,39 @@
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a
+
+DYNAMIC_MODULE = \
+ mod_urlfilter_chain.so
+
+STATIC_LIB = \
+ libchainurlfilter.a
+
+CPP_OBJS = \
+ ChainURLFilter.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/urlfilter/host/GNUmakefile b/src/modules/urlfilter/host/GNUmakefile
new file mode 100644
index 0000000..beff685
--- /dev/null
+++ b/src/modules/urlfilter/host/GNUmakefile
@@ -0,0 +1,39 @@
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a
+
+DYNAMIC_MODULE = \
+ mod_urlfilter_host.so
+
+STATIC_LIB = \
+ libhosturlfilter.a
+
+CPP_OBJS = \
+ HostURLFilter.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/HostURLFilter.cpp b/src/modules/urlfilter/host/HostURLFilter.cpp
index 3c3686f..6981a36 100644
--- a/src/HostURLFilter.cpp
+++ b/src/modules/urlfilter/host/HostURLFilter.cpp
@@ -17,3 +17,5 @@ bool HostURLFilter::filter( const URL url )
return res;
}
+
+REGISTER_MODULE_1( "host", URLFilter, HostURLFilter, const std::set<std::string> )
diff --git a/src/HostURLFilter.hpp b/src/modules/urlfilter/host/HostURLFilter.hpp
index aa91e09..6d1349e 100644
--- a/src/HostURLFilter.hpp
+++ b/src/modules/urlfilter/host/HostURLFilter.hpp
@@ -2,8 +2,10 @@
#define __HOST_URLFILTER_H
#include "URLFilter.hpp"
+#include "ModuleRegistry.hpp"
#include <set>
+#include <string>
class HostURLFilter : public URLFilter
{
@@ -16,4 +18,6 @@ class HostURLFilter : public URLFilter
std::set<std::string> m_hosts;
};
+DECLARE_MODULE_1( URLFilter, const std::set<std::string> )
+
#endif
diff --git a/src/modules/urlfilter/protocol/GNUmakefile b/src/modules/urlfilter/protocol/GNUmakefile
new file mode 100644
index 0000000..52027bc
--- /dev/null
+++ b/src/modules/urlfilter/protocol/GNUmakefile
@@ -0,0 +1,39 @@
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a
+
+DYNAMIC_MODULE = \
+ mod_urlfilter_protocol.so
+
+STATIC_LIB = \
+ libprotocolurlfilter.a
+
+CPP_OBJS = \
+ ProtocolURLFilter.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/ProtocolURLFilter.cpp b/src/modules/urlfilter/protocol/ProtocolURLFilter.cpp
index 3f495ed..e50dcc1 100644
--- a/src/ProtocolURLFilter.cpp
+++ b/src/modules/urlfilter/protocol/ProtocolURLFilter.cpp
@@ -17,3 +17,5 @@ bool ProtocolURLFilter::filter( const URL url )
return res;
}
+
+REGISTER_MODULE_1( "protocol", URLFilter, ProtocolURLFilter, const std::set<std::string> )
diff --git a/src/ProtocolURLFilter.hpp b/src/modules/urlfilter/protocol/ProtocolURLFilter.hpp
index 3fe18f8..b829e61 100644
--- a/src/ProtocolURLFilter.hpp
+++ b/src/modules/urlfilter/protocol/ProtocolURLFilter.hpp
@@ -2,8 +2,10 @@
#define __PROTOCOL_URLFILTER_H
#include "URLFilter.hpp"
+#include "ModuleRegistry.hpp"
#include <set>
+#include <string>
class ProtocolURLFilter : public URLFilter
{
@@ -16,4 +18,6 @@ class ProtocolURLFilter : public URLFilter
std::set<std::string> m_protocols;
};
+DECLARE_MODULE_1( URLFilter, const std::set<std::string> )
+
#endif
diff --git a/tests/GNUmakefile b/tests/GNUmakefile
index f582bbb..e2b08bb 100644
--- a/tests/GNUmakefile
+++ b/tests/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ..
-SUBDIRS = url streamhtmlparser libfetch curl psql sqlite
+SUBDIRS = utils url streamhtmlparser libfetch curl psql sqlite
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile
index 6a9104a..6ca1f96 100644
--- a/tests/url/GNUmakefile
+++ b/tests/url/GNUmakefile
@@ -2,8 +2,8 @@ TOPDIR = ../..
SUBDIRS =
-#INCLUDE_CXXFLAGS = \
-# -DUSE_MODULELOADER
+INCLUDE_CXXFLAGS = \
+ -DUSE_MODULELOADER
INCLUDE_DIRS = \
-I$(TOPDIR)/src \
diff --git a/tests/utils/GNUmakefile b/tests/utils/GNUmakefile
new file mode 100644
index 0000000..e3913bd
--- /dev/null
+++ b/tests/utils/GNUmakefile
@@ -0,0 +1,25 @@
+TOPDIR = ../..
+
+SUBDIRS =
+
+INCLUDE_DIRS = \
+ -I$(TOPDIR)/src
+
+INCLUDE_LDFLAGS =
+
+INCLUDE_LIBS =
+
+TEST_CPP_BINS = \
+ test1$(EXE)
+
+OBJS =
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_test:
diff --git a/tests/utils/test1.cpp b/tests/utils/test1.cpp
new file mode 100644
index 0000000..987149b
--- /dev/null
+++ b/tests/utils/test1.cpp
@@ -0,0 +1,16 @@
+#include "TypeList.hpp"
+
+#include <iostream>
+using namespace std;
+
+// a three-element list written out by hand ...
+typedef TypeList< int,
+	TypeList< char *,
+	TypeList< int, NullType > > > TestType;
+
+// ... and the same element types spelled through the convenience macro
+typedef TYPELIST_3( int, char *, int ) TestType2;
+
+// number of types in TestType2, computed at compile time
+int len = Length< TestType2 >::value;
+
+// Prints the computed list length and exits successfully.
+int main( )
+{
+	std::cout << "len: " << len << std::endl;
+	return 0;
+}