From d560a4920cdd16904a71640e0cfc380ea34e9e6f Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Wed, 8 Aug 2012 15:26:45 +0200 Subject: chain filter and modules with one ctor param work now --- docs/LINKS | 1 + src/GNUmakefile | 4 +- src/ModuleLoader.hpp | 90 ++++++++++++++++++-------- src/ModuleRegistry.hpp | 2 +- src/crawlingwolf.cpp | 39 +++++++---- src/modules/urlfilter/GNUmakefile | 3 +- src/modules/urlfilter/chain/ChainURLFilter.cpp | 27 ++------ src/modules/urlfilter/chain/ChainURLFilter.hpp | 8 +-- 8 files changed, 101 insertions(+), 73 deletions(-) diff --git a/docs/LINKS b/docs/LINKS index dd5f925..ac7e0c3 100644 --- a/docs/LINKS +++ b/docs/LINKS @@ -53,3 +53,4 @@ http://sourceforge.net/projects/toast/: portable type_info.name() http://gcc.gnu.org/onlinedocs/libstdc++/manual/ext_demangling.html http://tombarta.wordpress.com/category/gcc/ ?? name of module or typeid of derived class in module? +http://gcc.gnu.org/onlinedocs/gcc-4.4.3/gcc/Name-lookup.html#Name-lookup diff --git a/src/GNUmakefile b/src/GNUmakefile index 906f3ea..2b7c38e 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -39,8 +39,8 @@ CPP_OBJS = \ LOCAL_STATIC_LIB = \ libcrawlingwolf.a -#CPP_BINS = \ -# crawlingwolf$(EXE) +CPP_BINS = \ + crawlingwolf$(EXE) -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp index 1a89d38..524a0e1 100644 --- a/src/ModuleLoader.hpp +++ b/src/ModuleLoader.hpp @@ -14,29 +14,28 @@ #include "TypeList.hpp" #include "TypeInfo.hpp" -template< typename Interface > +template< typename Interface, typename CtorParams = NullType > struct Module { void *handle; - ModuleRegistry *registry; + ModuleRegistry< Interface, CtorParams > *registry; }; -template< typename Interface > -class ModuleLoader { +template< typename Interface, typename CtorParams = NullType > +class BaseModuleLoader { + + public: - typedef typename std::map > mapType; + typedef typename std::map > mapType; - protected: + protected: + mapType m_modules; - + public: - - ModuleLoader( ) - { - } - - ModuleLoader( const std::vector files ) + + BaseModuleLoader( const std::vector files ) { - Module m; + Module< Interface, CtorParams> m; for( std::vector::const_iterator it = files.begin( ); it != files.end( ); it++ ) { m.handle = dlopen( it->c_str( ), RTLD_NOW ); @@ -46,7 +45,7 @@ class ModuleLoader { std::string registryName = "registry_" + demangle( typeid( Interface ) ); - m.registry = static_cast *>( dlsym( m.handle, registryName.c_str( ) ) ); + m.registry = static_cast< ModuleRegistry< Interface, CtorParams > *>( dlsym( m.handle, registryName.c_str( ) ) ); if( !m.registry ) { dlclose( m.handle ); throw std::runtime_error( "missing module registry" ); @@ -56,7 +55,7 @@ class ModuleLoader { } } - ~ModuleLoader( ) + virtual ~BaseModuleLoader< Interface, CtorParams >( ) { for( typename mapType::iterator it = m_modules.begin( ); it != m_modules.end( ); it++ ) { if( (*it).second.handle ) { @@ -65,11 +64,35 @@ class ModuleLoader { } } } - - Interface *create( std::string subclass ) + + void destroy( Interface *obj ) { - typename mapType::const_iterator it = m_modules.find( subclass ); + std::string clazz = demangle( typeid( *obj ) ); + + typename mapType::const_iterator it = m_modules.find( clazz ); if( it == m_modules.end( ) ) { + throw std::runtime_error( "calling unknown destructor" ); + } + + (*it).second.registry->destroy( obj ); + } +}; + +template< typename Interface, typename CtorParams = NullType > +class ModuleLoader; + +template< typename Interface > +class ModuleLoader< Interface, NullType > : public BaseModuleLoader< Interface, NullType > +{ + public: + + ModuleLoader< Interface >( const std::vector files ) + : BaseModuleLoader< Interface >(files ) { } + + Interface *create( std::string subclass ) + { + typename BaseModuleLoader< Interface >::mapType::const_iterator it = BaseModuleLoader< Interface >::m_modules.find( subclass ); + if( it == BaseModuleLoader< Interface >::m_modules.end( ) ) { throw std::runtime_error( "calling unknown constructor" ); } @@ -77,21 +100,34 @@ class ModuleLoader { std::string clazz = demangle( typeid( *obj ) ); - m_modules.insert( std::make_pair( clazz, (*it).second ) ); + BaseModuleLoader< Interface >::m_modules.insert( std::make_pair( clazz, (*it).second ) ); return obj; } - - void destroy( Interface *obj ) +}; + +template< typename Interface, typename P1 > +class ModuleLoader< Interface, TYPELIST_1( P1 ) > : public BaseModuleLoader< Interface, TYPELIST_1( P1 ) > +{ + public: + + ModuleLoader< Interface, TYPELIST_1( P1 ) >( const std::vector files ) + : BaseModuleLoader< Interface, TYPELIST_1( P1 ) >( files ) { } + + Interface *create( std::string subclass, P1 p1 ) { + typename BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.find( subclass ); + if( it == BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.end( ) ) { + throw std::runtime_error( "calling unknown constructor" ); + } + + Interface *obj = (*it).second.registry->create( p1 ); + std::string clazz = demangle( typeid( *obj ) ); - typename mapType::const_iterator it = m_modules.find( clazz ); - if( it == m_modules.end( ) ) { - throw std::runtime_error( "calling unknown destructor" ); - } + BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) ); - (*it).second.registry->destroy( obj ); + return obj; } }; diff --git a/src/ModuleRegistry.hpp b/src/ModuleRegistry.hpp index 3b7e5d9..055cb1d 100644 --- a/src/ModuleRegistry.hpp +++ b/src/ModuleRegistry.hpp @@ -8,7 +8,7 @@ template< typename Interface, typename CtorParams = NullType > struct ModuleRegistry; -template< typename Interface > +template< typename Interface> struct ModuleRegistry< Interface > { std::string name; Interface *(*create)( ); diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index e924b16..1c0576f 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -4,10 +4,12 @@ #include "HTMLLinkExtractProcessor.hpp" #include "MemoryURLSeen.hpp" #include "URLNormalizer.hpp" +#include "URLFilter.hpp" #include "ModuleLoader.hpp" #include #include +#include using namespace std; @@ -15,32 +17,42 @@ int main( void ) { FILELog::reportingLevel( ) = logINFO; - vector modules; - modules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" ); - modules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" ); - ModuleLoader urlNormalizers( modules ); + vector normalizerModules; + normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" ); + normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" ); + ModuleLoader urlNormalizers( normalizerModules ); + + vector filterModules; + filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" ); + filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" ); + ModuleLoader ) > urlFilters( filterModules ); + + vector filterChainModules; + filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" ); + ModuleLoader ) > urlChainFilter( filterChainModules ); Frontier *frontier = new MemoryFrontier( ); Fetcher *fetcher = new LibFetchFetcher( ); Deduper *deduper = new MD5Deduper( ); URLSeen *urlSeen = new MemoryURLSeen( ); -/* - set protocols; + set protocols; protocols.insert( "http" ); protocols.insert( "https" ); - ProtocolURLFilter protocolFilter( protocols ); + URLFilter *protocolFilter = urlFilters.create( "protocol", protocols ); set hosts; hosts.insert( "www.andreasbaumann.cc" ); - HostURLFilter hostFilter( hosts ); + URLFilter *hostFilter = urlFilters.create( "host", hosts ); - ChainURLFilter filters( &protocolFilter, &hostFilter ); -*/ + list filters; + filters.push_back( hostFilter ); + filters.push_back( protocolFilter ); + URLFilter *chainFilter = urlChainFilter.create( "chain", filters ); + URLNormalizer *normalizer = urlNormalizers.create( "google" ); - //Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen ); - Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, std::list( ), urlSeen ); + Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, chainFilter, urlSeen ); LOG( logNOTICE ) << "Crawler started.."; @@ -64,6 +76,9 @@ int main( void ) delete processor; urlNormalizers.destroy( normalizer ); + urlChainFilter.destroy( chainFilter ); + urlFilters.destroy( protocolFilter ); + urlFilters.destroy( hostFilter ); delete urlSeen; delete deduper; delete fetcher; diff --git a/src/modules/urlfilter/GNUmakefile b/src/modules/urlfilter/GNUmakefile index ea5262d..9909f8d 100644 --- a/src/modules/urlfilter/GNUmakefile +++ b/src/modules/urlfilter/GNUmakefile @@ -1,7 +1,6 @@ TOPDIR = ../../.. -SUBDIRS = protocol host -#chain +SUBDIRS = protocol host chain -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/src/modules/urlfilter/chain/ChainURLFilter.cpp b/src/modules/urlfilter/chain/ChainURLFilter.cpp index a356299..fc2de93 100644 --- a/src/modules/urlfilter/chain/ChainURLFilter.cpp +++ b/src/modules/urlfilter/chain/ChainURLFilter.cpp @@ -1,30 +1,9 @@ #include "ChainURLFilter.hpp" -ChainURLFilter::ChainURLFilter( ) - : m_filters( ) +ChainURLFilter::ChainURLFilter( const std::list< URLFilter * > filters ) + : m_filters( filters ) { } - -ChainURLFilter::ChainURLFilter( URLFilter *f1 ) - : m_filters( ) -{ - m_filters.push_back( f1 ); -} - -ChainURLFilter::ChainURLFilter( URLFilter *f1, URLFilter *f2 ) - : m_filters( ) -{ - m_filters.push_back( f1 ); - m_filters.push_back( f2 ); -} - -ChainURLFilter::ChainURLFilter( URLFilter *f1, URLFilter *f2, URLFilter *f3 ) - : m_filters( ) -{ - m_filters.push_back( f1 ); - m_filters.push_back( f2 ); - m_filters.push_back( f3 ); -} bool ChainURLFilter::filter( const URL url ) { @@ -36,3 +15,5 @@ bool ChainURLFilter::filter( const URL url ) return true; } + +REGISTER_MODULE_1( "chain", URLFilter, ChainURLFilter, const std::list ) diff --git a/src/modules/urlfilter/chain/ChainURLFilter.hpp b/src/modules/urlfilter/chain/ChainURLFilter.hpp index 8c6d165..966e5cc 100644 --- a/src/modules/urlfilter/chain/ChainURLFilter.hpp +++ b/src/modules/urlfilter/chain/ChainURLFilter.hpp @@ -9,10 +9,7 @@ class ChainURLFilter : public URLFilter { public: - ChainURLFilter( ); - ChainURLFilter( URLFilter *f1 ); - ChainURLFilter( URLFilter *f1, URLFilter *f2 ); - ChainURLFilter( URLFilter *f1, URLFilter *f2, URLFilter *f3 ); + ChainURLFilter( const std::list< URLFilter * > ); virtual bool filter( const URL url ); @@ -20,7 +17,6 @@ class ChainURLFilter : public URLFilter std::list m_filters; }; -DECLARE_MODULE( URLFilter ) -DECLARE_MODULE_1( URLFilter, URLFilter * ) +DECLARE_MODULE_1( URLFilter, const std::list ) #endif -- cgit v1.2.3-54-g00ecf