summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/LINKS1
-rw-r--r--src/GNUmakefile4
-rw-r--r--src/ModuleLoader.hpp90
-rw-r--r--src/ModuleRegistry.hpp2
-rw-r--r--src/crawlingwolf.cpp39
-rw-r--r--src/modules/urlfilter/GNUmakefile3
-rw-r--r--src/modules/urlfilter/chain/ChainURLFilter.cpp27
-rw-r--r--src/modules/urlfilter/chain/ChainURLFilter.hpp8
8 files changed, 101 insertions, 73 deletions
diff --git a/docs/LINKS b/docs/LINKS
index dd5f925..ac7e0c3 100644
--- a/docs/LINKS
+++ b/docs/LINKS
@@ -53,3 +53,4 @@ http://sourceforge.net/projects/toast/: portable type_info.name()
http://gcc.gnu.org/onlinedocs/libstdc++/manual/ext_demangling.html
http://tombarta.wordpress.com/category/gcc/
?? name of module or typeid of derived class in module?
+http://gcc.gnu.org/onlinedocs/gcc-4.4.3/gcc/Name-lookup.html#Name-lookup
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 906f3ea..2b7c38e 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -39,8 +39,8 @@ CPP_OBJS = \
LOCAL_STATIC_LIB = \
libcrawlingwolf.a
-#CPP_BINS = \
-# crawlingwolf$(EXE)
+CPP_BINS = \
+ crawlingwolf$(EXE)
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp
index 1a89d38..524a0e1 100644
--- a/src/ModuleLoader.hpp
+++ b/src/ModuleLoader.hpp
@@ -14,29 +14,28 @@
#include "TypeList.hpp"
#include "TypeInfo.hpp"
-template< typename Interface >
+template< typename Interface, typename CtorParams = NullType >
struct Module {
void *handle;
- ModuleRegistry<Interface> *registry;
+ ModuleRegistry< Interface, CtorParams > *registry;
};
-template< typename Interface >
-class ModuleLoader {
+template< typename Interface, typename CtorParams = NullType >
+class BaseModuleLoader {
+
+ public:
- typedef typename std::map<std::string, Module< Interface > > mapType;
+ typedef typename std::map<std::string, Module< Interface, CtorParams > > mapType;
- protected:
+ protected:
+
mapType m_modules;
-
+
public:
-
- ModuleLoader<Interface>( )
- {
- }
-
- ModuleLoader<Interface>( const std::vector<std::string> files )
+
+ BaseModuleLoader( const std::vector<std::string> files )
{
- Module<Interface> m;
+ Module< Interface, CtorParams> m;
for( std::vector<string>::const_iterator it = files.begin( ); it != files.end( ); it++ ) {
m.handle = dlopen( it->c_str( ), RTLD_NOW );
@@ -46,7 +45,7 @@ class ModuleLoader {
std::string registryName = "registry_" + demangle( typeid( Interface ) );
- m.registry = static_cast<ModuleRegistry<Interface> *>( dlsym( m.handle, registryName.c_str( ) ) );
+ m.registry = static_cast< ModuleRegistry< Interface, CtorParams > *>( dlsym( m.handle, registryName.c_str( ) ) );
if( !m.registry ) {
dlclose( m.handle );
throw std::runtime_error( "missing module registry" );
@@ -56,7 +55,7 @@ class ModuleLoader {
}
}
- ~ModuleLoader<Interface>( )
+ virtual ~BaseModuleLoader< Interface, CtorParams >( )
{
for( typename mapType::iterator it = m_modules.begin( ); it != m_modules.end( ); it++ ) {
if( (*it).second.handle ) {
@@ -65,11 +64,35 @@ class ModuleLoader {
}
}
}
-
- Interface *create( std::string subclass )
+
+ void destroy( Interface *obj )
{
- typename mapType::const_iterator it = m_modules.find( subclass );
+ std::string clazz = demangle( typeid( *obj ) );
+
+ typename mapType::const_iterator it = m_modules.find( clazz );
if( it == m_modules.end( ) ) {
+ throw std::runtime_error( "calling unknown destructor" );
+ }
+
+ (*it).second.registry->destroy( obj );
+ }
+};
+
+template< typename Interface, typename CtorParams = NullType >
+class ModuleLoader;
+
+template< typename Interface >
+class ModuleLoader< Interface, NullType > : public BaseModuleLoader< Interface, NullType >
+{
+ public:
+
+ ModuleLoader< Interface >( const std::vector<std::string> files )
+ : BaseModuleLoader< Interface >(files ) { }
+
+ Interface *create( std::string subclass )
+ {
+ typename BaseModuleLoader< Interface >::mapType::const_iterator it = BaseModuleLoader< Interface >::m_modules.find( subclass );
+ if( it == BaseModuleLoader< Interface >::m_modules.end( ) ) {
throw std::runtime_error( "calling unknown constructor" );
}
@@ -77,21 +100,34 @@ class ModuleLoader {
std::string clazz = demangle( typeid( *obj ) );
- m_modules.insert( std::make_pair( clazz, (*it).second ) );
+ BaseModuleLoader< Interface >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
return obj;
}
-
- void destroy( Interface *obj )
+};
+
+template< typename Interface, typename P1 >
+class ModuleLoader< Interface, TYPELIST_1( P1 ) > : public BaseModuleLoader< Interface, TYPELIST_1( P1 ) >
+{
+ public:
+
+ ModuleLoader< Interface, TYPELIST_1( P1 ) >( const std::vector<std::string> files )
+ : BaseModuleLoader< Interface, TYPELIST_1( P1 ) >( files ) { }
+
+ Interface *create( std::string subclass, P1 p1 )
{
+ typename BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.find( subclass );
+ if( it == BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.end( ) ) {
+ throw std::runtime_error( "calling unknown constructor" );
+ }
+
+ Interface *obj = (*it).second.registry->create( p1 );
+
std::string clazz = demangle( typeid( *obj ) );
- typename mapType::const_iterator it = m_modules.find( clazz );
- if( it == m_modules.end( ) ) {
- throw std::runtime_error( "calling unknown destructor" );
- }
+ BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
- (*it).second.registry->destroy( obj );
+ return obj;
}
};
diff --git a/src/ModuleRegistry.hpp b/src/ModuleRegistry.hpp
index 3b7e5d9..055cb1d 100644
--- a/src/ModuleRegistry.hpp
+++ b/src/ModuleRegistry.hpp
@@ -8,7 +8,7 @@
template< typename Interface, typename CtorParams = NullType >
struct ModuleRegistry;
-template< typename Interface >
+template< typename Interface>
struct ModuleRegistry< Interface > {
std::string name;
Interface *(*create)( );
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index e924b16..1c0576f 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -4,10 +4,12 @@
#include "HTMLLinkExtractProcessor.hpp"
#include "MemoryURLSeen.hpp"
#include "URLNormalizer.hpp"
+#include "URLFilter.hpp"
#include "ModuleLoader.hpp"
#include <set>
#include <vector>
+#include <list>
using namespace std;
@@ -15,32 +17,42 @@ int main( void )
{
FILELog::reportingLevel( ) = logINFO;
- vector<string> modules;
- modules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
- modules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
- ModuleLoader<URLNormalizer> urlNormalizers( modules );
+ vector<string> normalizerModules;
+ normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
+ normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
+ ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules );
+
+ vector<string> filterModules;
+ filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" );
+ filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" );
+ ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules );
+
+ vector<string> filterChainModules;
+ filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" );
+ ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules );
Frontier *frontier = new MemoryFrontier( );
Fetcher *fetcher = new LibFetchFetcher( );
Deduper *deduper = new MD5Deduper( );
URLSeen *urlSeen = new MemoryURLSeen( );
-/*
- set<string> protocols;
+ set<string> protocols;
protocols.insert( "http" );
protocols.insert( "https" );
- ProtocolURLFilter protocolFilter( protocols );
+ URLFilter *protocolFilter = urlFilters.create( "protocol", protocols );
set<string> hosts;
hosts.insert( "www.andreasbaumann.cc" );
- HostURLFilter hostFilter( hosts );
+ URLFilter *hostFilter = urlFilters.create( "host", hosts );
- ChainURLFilter filters( &protocolFilter, &hostFilter );
-*/
+ list<URLFilter *> filters;
+ filters.push_back( hostFilter );
+ filters.push_back( protocolFilter );
+ URLFilter *chainFilter = urlChainFilter.create( "chain", filters );
+
URLNormalizer *normalizer = urlNormalizers.create( "google" );
- //Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen );
- Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, std::list( ), urlSeen );
+ Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, chainFilter, urlSeen );
LOG( logNOTICE ) << "Crawler started..";
@@ -64,6 +76,9 @@ int main( void )
delete processor;
urlNormalizers.destroy( normalizer );
+ urlChainFilter.destroy( chainFilter );
+ urlFilters.destroy( protocolFilter );
+ urlFilters.destroy( hostFilter );
delete urlSeen;
delete deduper;
delete fetcher;
diff --git a/src/modules/urlfilter/GNUmakefile b/src/modules/urlfilter/GNUmakefile
index ea5262d..9909f8d 100644
--- a/src/modules/urlfilter/GNUmakefile
+++ b/src/modules/urlfilter/GNUmakefile
@@ -1,7 +1,6 @@
TOPDIR = ../../..
-SUBDIRS = protocol host
-#chain
+SUBDIRS = protocol host chain
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/modules/urlfilter/chain/ChainURLFilter.cpp b/src/modules/urlfilter/chain/ChainURLFilter.cpp
index a356299..fc2de93 100644
--- a/src/modules/urlfilter/chain/ChainURLFilter.cpp
+++ b/src/modules/urlfilter/chain/ChainURLFilter.cpp
@@ -1,30 +1,9 @@
#include "ChainURLFilter.hpp"
-ChainURLFilter::ChainURLFilter( )
- : m_filters( )
+ChainURLFilter::ChainURLFilter( const std::list< URLFilter * > filters )
+ : m_filters( filters )
{
}
-
-ChainURLFilter::ChainURLFilter( URLFilter *f1 )
- : m_filters( )
-{
- m_filters.push_back( f1 );
-}
-
-ChainURLFilter::ChainURLFilter( URLFilter *f1, URLFilter *f2 )
- : m_filters( )
-{
- m_filters.push_back( f1 );
- m_filters.push_back( f2 );
-}
-
-ChainURLFilter::ChainURLFilter( URLFilter *f1, URLFilter *f2, URLFilter *f3 )
- : m_filters( )
-{
- m_filters.push_back( f1 );
- m_filters.push_back( f2 );
- m_filters.push_back( f3 );
-}
bool ChainURLFilter::filter( const URL url )
{
@@ -36,3 +15,5 @@ bool ChainURLFilter::filter( const URL url )
return true;
}
+
+REGISTER_MODULE_1( "chain", URLFilter, ChainURLFilter, const std::list<URLFilter *> )
diff --git a/src/modules/urlfilter/chain/ChainURLFilter.hpp b/src/modules/urlfilter/chain/ChainURLFilter.hpp
index 8c6d165..966e5cc 100644
--- a/src/modules/urlfilter/chain/ChainURLFilter.hpp
+++ b/src/modules/urlfilter/chain/ChainURLFilter.hpp
@@ -9,10 +9,7 @@
class ChainURLFilter : public URLFilter
{
public:
- ChainURLFilter( );
- ChainURLFilter( URLFilter *f1 );
- ChainURLFilter( URLFilter *f1, URLFilter *f2 );
- ChainURLFilter( URLFilter *f1, URLFilter *f2, URLFilter *f3 );
+ ChainURLFilter( const std::list< URLFilter * > );
virtual bool filter( const URL url );
@@ -20,7 +17,6 @@ class ChainURLFilter : public URLFilter
std::list<URLFilter *> m_filters;
};
-DECLARE_MODULE( URLFilter )
-DECLARE_MODULE_1( URLFilter, URLFilter * )
+DECLARE_MODULE_1( URLFilter, const std::list<URLFilter *> )
#endif