diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-08 15:26:45 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-08 15:26:45 +0200 |
commit | d560a4920cdd16904a71640e0cfc380ea34e9e6f (patch) | |
tree | b94acc06cfabedea7146e142fdaffd0663e94d45 | |
parent | 4257babfecb1fcb298b848ef5a918bd9295abb30 (diff) | |
download | crawler-d560a4920cdd16904a71640e0cfc380ea34e9e6f.tar.gz crawler-d560a4920cdd16904a71640e0cfc380ea34e9e6f.tar.bz2 |
chain filter and modules with one ctor param work now
-rw-r--r-- | docs/LINKS | 1 | ||||
-rw-r--r-- | src/GNUmakefile | 4 | ||||
-rw-r--r-- | src/ModuleLoader.hpp | 90 | ||||
-rw-r--r-- | src/ModuleRegistry.hpp | 2 | ||||
-rw-r--r-- | src/crawlingwolf.cpp | 39 | ||||
-rw-r--r-- | src/modules/urlfilter/GNUmakefile | 3 | ||||
-rw-r--r-- | src/modules/urlfilter/chain/ChainURLFilter.cpp | 27 | ||||
-rw-r--r-- | src/modules/urlfilter/chain/ChainURLFilter.hpp | 8 |
8 files changed, 101 insertions, 73 deletions
@@ -53,3 +53,4 @@ http://sourceforge.net/projects/toast/: portable type_info.name() http://gcc.gnu.org/onlinedocs/libstdc++/manual/ext_demangling.html http://tombarta.wordpress.com/category/gcc/ ?? name of module or typeid of derived class in module? +http://gcc.gnu.org/onlinedocs/gcc-4.4.3/gcc/Name-lookup.html#Name-lookup diff --git a/src/GNUmakefile b/src/GNUmakefile index 906f3ea..2b7c38e 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -39,8 +39,8 @@ CPP_OBJS = \ LOCAL_STATIC_LIB = \ libcrawlingwolf.a -#CPP_BINS = \ -# crawlingwolf$(EXE) +CPP_BINS = \ + crawlingwolf$(EXE) -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp index 1a89d38..524a0e1 100644 --- a/src/ModuleLoader.hpp +++ b/src/ModuleLoader.hpp @@ -14,29 +14,28 @@ #include "TypeList.hpp" #include "TypeInfo.hpp" -template< typename Interface > +template< typename Interface, typename CtorParams = NullType > struct Module { void *handle; - ModuleRegistry<Interface> *registry; + ModuleRegistry< Interface, CtorParams > *registry; }; -template< typename Interface > -class ModuleLoader { +template< typename Interface, typename CtorParams = NullType > +class BaseModuleLoader { + + public: - typedef typename std::map<std::string, Module< Interface > > mapType; + typedef typename std::map<std::string, Module< Interface, CtorParams > > mapType; - protected: + protected: + mapType m_modules; - + public: - - ModuleLoader<Interface>( ) - { - } - - ModuleLoader<Interface>( const std::vector<std::string> files ) + + BaseModuleLoader( const std::vector<std::string> files ) { - Module<Interface> m; + Module< Interface, CtorParams> m; for( std::vector<string>::const_iterator it = files.begin( ); it != files.end( ); it++ ) { m.handle = dlopen( it->c_str( ), RTLD_NOW ); @@ -46,7 +45,7 @@ class ModuleLoader { std::string registryName = "registry_" + demangle( typeid( Interface ) ); - m.registry = static_cast<ModuleRegistry<Interface> *>( dlsym( m.handle, registryName.c_str( ) ) ); + m.registry = static_cast< ModuleRegistry< Interface, CtorParams > *>( dlsym( m.handle, registryName.c_str( ) ) ); if( !m.registry ) { dlclose( m.handle ); throw std::runtime_error( "missing module registry" ); @@ -56,7 +55,7 @@ class ModuleLoader { } } - ~ModuleLoader<Interface>( ) + virtual ~BaseModuleLoader< Interface, CtorParams >( ) { for( typename mapType::iterator it = m_modules.begin( ); it != m_modules.end( ); it++ ) { if( (*it).second.handle ) { @@ -65,11 +64,35 @@ class ModuleLoader { } } } - - Interface *create( std::string subclass ) + + void destroy( Interface *obj ) { - typename mapType::const_iterator it = m_modules.find( subclass ); + std::string clazz = demangle( typeid( *obj ) ); + + typename mapType::const_iterator it = m_modules.find( clazz ); if( it == m_modules.end( ) ) { + throw std::runtime_error( "calling unknown destructor" ); + } + + (*it).second.registry->destroy( obj ); + } +}; + +template< typename Interface, typename CtorParams = NullType > +class ModuleLoader; + +template< typename Interface > +class ModuleLoader< Interface, NullType > : public BaseModuleLoader< Interface, NullType > +{ + public: + + ModuleLoader< Interface >( const std::vector<std::string> files ) + : BaseModuleLoader< Interface >(files ) { } + + Interface *create( std::string subclass ) + { + typename BaseModuleLoader< Interface >::mapType::const_iterator it = BaseModuleLoader< Interface >::m_modules.find( subclass ); + if( it == BaseModuleLoader< Interface >::m_modules.end( ) ) { throw std::runtime_error( "calling unknown constructor" ); } @@ -77,21 +100,34 @@ class ModuleLoader { std::string clazz = demangle( typeid( *obj ) ); - m_modules.insert( std::make_pair( clazz, (*it).second ) ); + BaseModuleLoader< Interface >::m_modules.insert( std::make_pair( clazz, (*it).second ) ); return obj; } - - void destroy( Interface *obj ) +}; + +template< typename Interface, typename P1 > +class ModuleLoader< Interface, TYPELIST_1( P1 ) > : public BaseModuleLoader< Interface, TYPELIST_1( P1 ) > +{ + public: + + ModuleLoader< Interface, TYPELIST_1( P1 ) >( const std::vector<std::string> files ) + : BaseModuleLoader< Interface, TYPELIST_1( P1 ) >( files ) { } + + Interface *create( std::string subclass, P1 p1 ) { + typename BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.find( subclass ); + if( it == BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.end( ) ) { + throw std::runtime_error( "calling unknown constructor" ); + } + + Interface *obj = (*it).second.registry->create( p1 ); + std::string clazz = demangle( typeid( *obj ) ); - typename mapType::const_iterator it = m_modules.find( clazz ); - if( it == m_modules.end( ) ) { - throw std::runtime_error( "calling unknown destructor" ); - } + BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) ); - (*it).second.registry->destroy( obj ); + return obj; } }; diff --git a/src/ModuleRegistry.hpp b/src/ModuleRegistry.hpp index 3b7e5d9..055cb1d 100644 --- a/src/ModuleRegistry.hpp +++ b/src/ModuleRegistry.hpp @@ -8,7 +8,7 @@ template< typename Interface, typename CtorParams = NullType > struct ModuleRegistry; -template< typename Interface > +template< typename Interface> struct ModuleRegistry< Interface > { std::string name; Interface *(*create)( ); diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index e924b16..1c0576f 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -4,10 +4,12 @@ #include "HTMLLinkExtractProcessor.hpp" #include "MemoryURLSeen.hpp" #include "URLNormalizer.hpp" +#include "URLFilter.hpp" #include "ModuleLoader.hpp" #include <set> #include <vector> +#include <list> using namespace std; @@ -15,32 +17,42 @@ int main( void ) { FILELog::reportingLevel( ) = logINFO; - vector<string> modules; - modules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" ); - modules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" ); - ModuleLoader<URLNormalizer> urlNormalizers( modules ); + vector<string> normalizerModules; + normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" ); + normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" ); + ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules ); + + vector<string> filterModules; + filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" ); + filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" ); + ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules ); + + vector<string> filterChainModules; + filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" ); + ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules ); Frontier *frontier = new MemoryFrontier( ); Fetcher *fetcher = new LibFetchFetcher( ); Deduper *deduper = new MD5Deduper( ); URLSeen *urlSeen = new MemoryURLSeen( ); -/* - set<string> protocols; + set<string> protocols; protocols.insert( "http" ); protocols.insert( "https" ); - ProtocolURLFilter protocolFilter( protocols ); + URLFilter *protocolFilter = urlFilters.create( "protocol", protocols ); set<string> hosts; hosts.insert( "www.andreasbaumann.cc" ); - HostURLFilter hostFilter( hosts ); + URLFilter *hostFilter = urlFilters.create( "host", hosts ); - ChainURLFilter filters( &protocolFilter, &hostFilter ); -*/ + list<URLFilter *> filters; + filters.push_back( hostFilter ); + filters.push_back( protocolFilter ); + URLFilter *chainFilter = urlChainFilter.create( "chain", filters ); + URLNormalizer *normalizer = urlNormalizers.create( "google" ); - //Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen ); - Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, std::list( ), urlSeen ); + Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, chainFilter, urlSeen ); LOG( logNOTICE ) << "Crawler started.."; @@ -64,6 +76,9 @@ int main( void ) delete processor; urlNormalizers.destroy( normalizer ); + urlChainFilter.destroy( chainFilter ); + urlFilters.destroy( protocolFilter ); + urlFilters.destroy( hostFilter ); delete urlSeen; delete deduper; delete fetcher; diff --git a/src/modules/urlfilter/GNUmakefile b/src/modules/urlfilter/GNUmakefile index ea5262d..9909f8d 100644 --- a/src/modules/urlfilter/GNUmakefile +++ b/src/modules/urlfilter/GNUmakefile @@ -1,7 +1,6 @@ TOPDIR = ../../.. -SUBDIRS = protocol host -#chain +SUBDIRS = protocol host chain -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/src/modules/urlfilter/chain/ChainURLFilter.cpp b/src/modules/urlfilter/chain/ChainURLFilter.cpp index a356299..fc2de93 100644 --- a/src/modules/urlfilter/chain/ChainURLFilter.cpp +++ b/src/modules/urlfilter/chain/ChainURLFilter.cpp @@ -1,30 +1,9 @@ #include "ChainURLFilter.hpp" -ChainURLFilter::ChainURLFilter( ) - : m_filters( ) +ChainURLFilter::ChainURLFilter( const std::list< URLFilter * > filters ) + : m_filters( filters ) { } - -ChainURLFilter::ChainURLFilter( URLFilter *f1 ) - : m_filters( ) -{ - m_filters.push_back( f1 ); -} - -ChainURLFilter::ChainURLFilter( URLFilter *f1, URLFilter *f2 ) - : m_filters( ) -{ - m_filters.push_back( f1 ); - m_filters.push_back( f2 ); -} - -ChainURLFilter::ChainURLFilter( URLFilter *f1, URLFilter *f2, URLFilter *f3 ) - : m_filters( ) -{ - m_filters.push_back( f1 ); - m_filters.push_back( f2 ); - m_filters.push_back( f3 ); -} bool ChainURLFilter::filter( const URL url ) { @@ -36,3 +15,5 @@ bool ChainURLFilter::filter( const URL url ) return true; } + +REGISTER_MODULE_1( "chain", URLFilter, ChainURLFilter, const std::list<URLFilter *> ) diff --git a/src/modules/urlfilter/chain/ChainURLFilter.hpp b/src/modules/urlfilter/chain/ChainURLFilter.hpp index 8c6d165..966e5cc 100644 --- a/src/modules/urlfilter/chain/ChainURLFilter.hpp +++ b/src/modules/urlfilter/chain/ChainURLFilter.hpp @@ -9,10 +9,7 @@ class ChainURLFilter : public URLFilter { public: - ChainURLFilter( ); - ChainURLFilter( URLFilter *f1 ); - ChainURLFilter( URLFilter *f1, URLFilter *f2 ); - ChainURLFilter( URLFilter *f1, URLFilter *f2, URLFilter *f3 ); + ChainURLFilter( const std::list< URLFilter * > ); virtual bool filter( const URL url ); @@ -20,7 +17,6 @@ class ChainURLFilter : public URLFilter std::list<URLFilter *> m_filters; }; -DECLARE_MODULE( URLFilter ) -DECLARE_MODULE_1( URLFilter, URLFilter * ) +DECLARE_MODULE_1( URLFilter, const std::list<URLFilter *> ) #endif |