diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-08 17:27:06 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-08 17:27:06 +0200 |
commit | 5e1a0dec4672f70d10b82364274353962d2d26f6 (patch) | |
tree | 5b4b9d504bac7d55f29f2e6eaad8b53f3d3eff05 | |
parent | d560a4920cdd16904a71640e0cfc380ea34e9e6f (diff) | |
download | crawler-5e1a0dec4672f70d10b82364274353962d2d26f6.tar.gz crawler-5e1a0dec4672f70d10b82364274353962d2d26f6.tar.bz2 |
modularized all other modules
29 files changed, 579 insertions, 61 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile index 2b7c38e..8f1657e 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -9,12 +9,8 @@ INCLUDE_CPPFLAGS = \ INCLUDE_LDFLAGS = \ INCLUDE_DIRS = \ - -I$(TOPDIR)/libfetch \ - -I$(TOPDIR)/streamhtmlparser INCLUDE_LIBS = \ - $(TOPDIR)/libfetch/libfetch.a \ - $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a # openssl ifeq ($(WITH_SSL),1) @@ -27,11 +23,7 @@ INCLUDE_LIBS += \ endif LOCAL_STATIC_LIB_OBJS = \ - URL.o \ - LibFetchFetcher.o \ - LibFetchRewindInputStream.o \ - HTMLLinkExtractProcessor.o \ - MemoryURLSeen.o + URL.o CPP_OBJS = \ $(LOCAL_STATIC_LIB_OBJS) diff --git a/src/MD5Deduper.hpp b/src/MD5Deduper.hpp deleted file mode 100644 index 8372865..0000000 --- a/src/MD5Deduper.hpp +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __MD5DEDUPER_H -#define __MD5DEDUPER_H - -#include "Deduper.hpp" - -class MD5Deduper : public Deduper { - public: - MD5Deduper( ) { - } - - virtual ~MD5Deduper( ) { - } - - virtual bool contentSeen( const URL url, RewindInputStream *s ) { - (void)url; - (void)s; - return false; - } -}; - -#endif diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp index 524a0e1..8ecccc3 100644 --- a/src/ModuleLoader.hpp +++ b/src/ModuleLoader.hpp @@ -81,6 +81,8 @@ class BaseModuleLoader { template< typename Interface, typename CtorParams = NullType > class ModuleLoader; +// no param + template< typename Interface > class ModuleLoader< Interface, NullType > : public BaseModuleLoader< Interface, NullType > { @@ -106,29 +108,113 @@ class ModuleLoader< Interface, NullType > : public BaseModuleLoader< Interface, } }; -template< typename Interface, typename P1 > -class ModuleLoader< Interface, TYPELIST_1( P1 ) > : public BaseModuleLoader< Interface, TYPELIST_1( P1 ) > +// one param + +template< typename Interface, typename T1 > +class ModuleLoader< Interface, TYPELIST_1( T1 ) > : public BaseModuleLoader< Interface, TYPELIST_1( T1 ) > +{ + public: + + ModuleLoader< Interface, TYPELIST_1( T1 ) >( const std::vector<std::string> files ) + : BaseModuleLoader< Interface, TYPELIST_1( T1 ) >( files ) { } + + Interface *create( std::string subclass, T1 t1 ) + { + typename BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::m_modules.find( subclass ); + if( it == BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::m_modules.end( ) ) { + throw std::runtime_error( "calling unknown constructor" ); + } + + Interface *obj = (*it).second.registry->create( t1 ); + + std::string clazz = demangle( typeid( *obj ) ); + + BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) ); + + return obj; + } +}; + +// two params + +template< typename Interface, typename T1, typename T2 > +class ModuleLoader< Interface, TYPELIST_2( T1, T2 ) > : public BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) > +{ + public: + + ModuleLoader< Interface, TYPELIST_2( T1, T2 ) >( const std::vector<std::string> files ) + : BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >( files ) { } + + Interface *create( std::string subclass, T1 t1, T2 t2 ) + { + typename BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::m_modules.find( subclass ); + if( it == BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::m_modules.end( ) ) { + throw std::runtime_error( "calling unknown constructor" ); + } + + Interface *obj = (*it).second.registry->create( t1, t2 ); + + std::string clazz = demangle( typeid( *obj ) ); + + BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) ); + + return obj; + } +}; + +// three params + +template< typename Interface, typename T1, typename T2, typename T3 > +class ModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) > : public BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) > { public: - ModuleLoader< Interface, TYPELIST_1( P1 ) >( const std::vector<std::string> files ) - : BaseModuleLoader< Interface, TYPELIST_1( P1 ) >( files ) { } + ModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >( const std::vector<std::string> files ) + : BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >( files ) { } - Interface *create( std::string subclass, P1 p1 ) + Interface *create( std::string subclass, T1 t1, T2 t2, T3 t3 ) { - typename BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.find( subclass ); - if( it == BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.end( ) ) { + typename BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::m_modules.find( subclass ); + if( it == BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::m_modules.end( ) ) { throw std::runtime_error( "calling unknown constructor" ); } - Interface *obj = (*it).second.registry->create( p1 ); + Interface *obj = (*it).second.registry->create( t1, t2, t3 ); std::string clazz = demangle( typeid( *obj ) ); - BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) ); + BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) ); return obj; } }; +// four params + +template< typename Interface, typename T1, typename T2, typename T3, typename T4 > +class ModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) > : public BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) > +{ + public: + + ModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >( const std::vector<std::string> files ) + : BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >( files ) { } + + Interface *create( std::string subclass, T1 t1, T2 t2, T3 t3, T4 t4 ) + { + typename BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::m_modules.find( subclass ); + if( it == BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::m_modules.end( ) ) { + throw std::runtime_error( "calling unknown constructor" ); + } + + Interface *obj = (*it).second.registry->create( t1, t2, t3, t4 ); + + std::string clazz = demangle( typeid( *obj ) ); + + BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) ); + + return obj; + } +}; + + #endif diff --git a/src/ModuleRegistry.hpp b/src/ModuleRegistry.hpp index 055cb1d..2b9f441 100644 --- a/src/ModuleRegistry.hpp +++ b/src/ModuleRegistry.hpp @@ -33,9 +33,50 @@ struct ModuleRegistry< Interface, TYPELIST_1( P1 ) > { { } }; + +template< typename Interface, typename P1, typename P2 > +struct ModuleRegistry< Interface, TYPELIST_2( P1, P2 ) > { + std::string name; + Interface *(*create)( P1, P2 ); + void (*destroy)( Interface *obj ); + + ModuleRegistry( std::string _name, Interface *(*_create)( P1, P2 ), + void (*_destroy)( Interface *obj ) ) + : name( _name ), create( _create ), destroy( _destroy ) + { + } +}; + +template< typename Interface, typename P1, typename P2, typename P3 > +struct ModuleRegistry< Interface, TYPELIST_3( P1, P2, P3 ) > { + std::string name; + Interface *(*create)( P1, P2, P3 ); + void (*destroy)( Interface *obj ); + + ModuleRegistry( std::string _name, Interface *(*_create)( P1, P2, P3 ), + void (*_destroy)( Interface *obj ) ) + : name( _name ), create( _create ), destroy( _destroy ) + { + } +}; + +template< typename Interface, typename P1, typename P2, typename P3, typename P4 > +struct ModuleRegistry< Interface, TYPELIST_4( P1, P2, P3, P4 ) > { + std::string name; + Interface *(*create)( P1, P2, P3, P4 ); + void (*destroy)( Interface *obj ); + + ModuleRegistry( std::string _name, Interface *(*_create)( P1, P2, P3, P4 ), + void (*_destroy)( Interface *obj ) ) + : name( _name ), create( _create ), destroy( _destroy ) + { + } +}; #ifdef SHARED +// no param macro + #define DECLARE_MODULE( baseClass ) \ extern ModuleRegistry<baseClass> registry ## _ ## baseClass; @@ -52,13 +93,15 @@ static void destroy( baseClass *obj ) \ \ ModuleRegistry<baseClass> registry ## _ ## baseClass( name, &create, &destroy ); +// 1 param macro + #define DECLARE_MODULE_1( baseClass, T1 ) \ extern ModuleRegistry<baseClass, TYPELIST_1( T1 ) > registry ## _ ## baseClass; #define REGISTER_MODULE_1( name, baseClass, subClass, T1 ) \ -static baseClass *create( T1 t ) \ +static baseClass *create( T1 t1 ) \ { \ - return new subClass( t ); \ + return new subClass( t1 ); \ } \ \ static void destroy( baseClass *obj ) \ @@ -68,13 +111,73 @@ static void destroy( baseClass *obj ) \ \ ModuleRegistry<baseClass, TYPELIST_1( T1 )> registry ## _ ## baseClass( name, &create, &destroy ); +// 2 param macro + +#define DECLARE_MODULE_2( baseClass, T1, T2 ) \ + extern ModuleRegistry<baseClass, TYPELIST_2( T1, T2 ) > registry ## _ ## baseClass; + +#define REGISTER_MODULE_2( name, baseClass, subClass, T1, T2 ) \ +static baseClass *create( T1 t1, T2 t2 ) \ +{ \ + return new subClass( t1, t2 ); \ +} \ + \ +static void destroy( baseClass *obj ) \ +{ \ + delete obj; \ +} \ + \ +ModuleRegistry<baseClass, TYPELIST_2( T1, T2 )> registry ## _ ## baseClass( name, &create, &destroy ); + +// 3 param macro + +#define DECLARE_MODULE_3( baseClass, T1, T2, T3 ) \ + extern ModuleRegistry<baseClass, TYPELIST_3( T1, T2, T3 ) > registry ## _ ## baseClass; + +#define REGISTER_MODULE_3( name, baseClass, subClass, T1, T2, T3 ) \ +static baseClass *create( T1 t1, T2 t2, T3 t3 ) \ +{ \ + return new subClass( t1, t2, t3 ); \ +} \ + \ +static void destroy( baseClass *obj ) \ +{ \ + delete obj; \ +} \ + \ +ModuleRegistry<baseClass, TYPELIST_3( T1, T2, T3 )> registry ## _ ## baseClass( name, &create, &destroy ); + +// 4 param macro + +#define DECLARE_MODULE_4( baseClass, T1, T2, T3, T4 ) \ + extern ModuleRegistry<baseClass, TYPELIST_4( T1, T2, T3, T4 ) > registry ## _ ## baseClass; + +#define REGISTER_MODULE_4( name, baseClass, subClass, T1, T2, T3, T4 ) \ +static baseClass *create( T1 t1, T2 t2, T3 t3, T4 t4 ) \ +{ \ + return new subClass( t1, t2, t3, t4 ); \ +} \ + \ +static void destroy( baseClass *obj ) \ +{ \ + delete obj; \ +} \ + \ +ModuleRegistry<baseClass, TYPELIST_4( T1, T2, T3, T4 )> registry ## _ ## baseClass( name, &create, &destroy ); + #else // SHARED #define DECLARE_MODULE( baseClass ) -#define DECLARE_MODULE_1( baseClass, T ) +#define DECLARE_MODULE_1( baseClass, T1 ) +#define DECLARE_MODULE_2( baseClass, T1, T2 ) +#define DECLARE_MODULE_3( baseClass, T1, T2, T3 ) +#define DECLARE_MODULE_4( baseClass, T1, T2, T3, T4 ) #define REGISTER_MODULE( name, baseClass, subClass ) -#define REGISTER_MODULE_1( name, baseClass, subClass, T ) +#define REGISTER_MODULE_1( name, baseClass, subClass, T1 ) +#define REGISTER_MODULE_2( name, baseClass, subClass, T1, T2 ) +#define REGISTER_MODULE_3( name, baseClass, subClass, T1, T2, T3 ) +#define REGISTER_MODULE_4( name, baseClass, subClass, T1, T2, T3, T4 ) #endif // SHARED diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index 1c0576f..0fb8697 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -1,12 +1,15 @@ -#include "LibFetchFetcher.hpp" -#include "MemoryFrontier.hpp" -#include "MD5Deduper.hpp" -#include "HTMLLinkExtractProcessor.hpp" -#include "MemoryURLSeen.hpp" +#include "Fetcher.hpp" +#include "Frontier.hpp" +#include "Deduper.hpp" +#include "Processor.hpp" +#include "URLSeen.hpp" #include "URLNormalizer.hpp" #include "URLFilter.hpp" + #include "ModuleLoader.hpp" +#include "Logger.hpp" + #include <set> #include <vector> #include <list> @@ -31,10 +34,30 @@ int main( void ) filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" ); ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules ); - Frontier *frontier = new MemoryFrontier( ); - Fetcher *fetcher = new LibFetchFetcher( ); - Deduper *deduper = new MD5Deduper( ); - URLSeen *urlSeen = new MemoryURLSeen( ); + vector<string> frontierModules; + frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" ); + ModuleLoader<Frontier> frontiers( frontierModules ); + + vector<string> fetcherModules; + fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" ); + ModuleLoader<Fetcher> fetchers( fetcherModules ); + + vector<string> urlseenModules; + urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" ); + ModuleLoader<URLSeen> urlSeens( urlseenModules ); + + vector<string> deduperModules; + deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" ); + ModuleLoader<Deduper> dedupers( deduperModules ); + + vector<string> processorModules; + processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" ); + ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules ); + + Frontier *frontier = frontiers.create( "memory" ); + Fetcher *fetcher = fetchers.create( "libfetch" ); + Deduper *deduper = dedupers.create( "null" ); + URLSeen *urlSeen = urlSeens.create( "memory" ); set<string> protocols; protocols.insert( "http" ); @@ -52,7 +75,8 @@ int main( void ) URLNormalizer *normalizer = urlNormalizers.create( "google" ); - Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, chainFilter, urlSeen ); + Processor *processor = processors.create( "htmllinkextract", + normalizer, frontier, chainFilter, urlSeen ); LOG( logNOTICE ) << "Crawler started.."; @@ -79,10 +103,10 @@ int main( void ) urlChainFilter.destroy( chainFilter ); urlFilters.destroy( protocolFilter ); urlFilters.destroy( hostFilter ); - delete urlSeen; - delete deduper; - delete fetcher; - delete frontier; + urlSeens.destroy( urlSeen ); + dedupers.destroy( deduper ); + fetchers.destroy( fetcher ); + frontiers.destroy( frontier ); LOG( logNOTICE ) << "Crawler stopped.."; diff --git a/src/modules/GNUmakefile b/src/modules/GNUmakefile index cd45705..31dc26c 100644 --- a/src/modules/GNUmakefile +++ b/src/modules/GNUmakefile @@ -1,6 +1,6 @@ TOPDIR = ../.. -SUBDIRS = urlnormalizer urlfilter +SUBDIRS = urlnormalizer urlfilter frontier fetcher urlseen deduper processor -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/src/modules/deduper/GNUmakefile b/src/modules/deduper/GNUmakefile new file mode 100644 index 0000000..1a63a91 --- /dev/null +++ b/src/modules/deduper/GNUmakefile @@ -0,0 +1,17 @@ +TOPDIR = ../../.. + +SUBDIRS = null + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: diff --git a/src/modules/deduper/null/GNUmakefile b/src/modules/deduper/null/GNUmakefile new file mode 100644 index 0000000..ae9663f --- /dev/null +++ b/src/modules/deduper/null/GNUmakefile @@ -0,0 +1,39 @@ +TOPDIR = ../../../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_DIRS = \ + -I. -I$(TOPDIR)/src + +INCLUDE_CXXFLAGS = \ + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)/src/libcrawlingwolf.a + +DYNAMIC_MODULE = \ + mod_deduper_null.so + +STATIC_LIB = \ + libnulldeduper.a + +CPP_OBJS = \ + NullDeduper.o + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: + diff --git a/src/modules/deduper/null/NullDeduper.cpp b/src/modules/deduper/null/NullDeduper.cpp new file mode 100644 index 0000000..9eca5c4 --- /dev/null +++ b/src/modules/deduper/null/NullDeduper.cpp @@ -0,0 +1,3 @@ +#include "NullDeduper.hpp" + +REGISTER_MODULE( "null", Deduper, NullDeduper ) diff --git a/src/modules/deduper/null/NullDeduper.hpp b/src/modules/deduper/null/NullDeduper.hpp new file mode 100644 index 0000000..90de4dd --- /dev/null +++ b/src/modules/deduper/null/NullDeduper.hpp @@ -0,0 +1,24 @@ +#ifndef __NULLDEDUPER_H +#define __NULLDEDUPER_H + +#include "Deduper.hpp" +#include "ModuleRegistry.hpp" + +class NullDeduper : public Deduper { + public: + NullDeduper( ) { + } + + virtual ~NullDeduper( ) { + } + + virtual bool contentSeen( const URL url, RewindInputStream *s ) { + (void)url; + (void)s; + return false; + } +}; + +DECLARE_MODULE( Deduper ) + +#endif diff --git a/src/modules/fetcher/GNUmakefile b/src/modules/fetcher/GNUmakefile new file mode 100644 index 0000000..526e9e5 --- /dev/null +++ b/src/modules/fetcher/GNUmakefile @@ -0,0 +1,17 @@ +TOPDIR = ../../.. + +SUBDIRS = libfetch + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: diff --git a/src/modules/fetcher/libfetch/GNUmakefile b/src/modules/fetcher/libfetch/GNUmakefile new file mode 100644 index 0000000..707a4c0 --- /dev/null +++ b/src/modules/fetcher/libfetch/GNUmakefile @@ -0,0 +1,42 @@ +TOPDIR = ../../../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_DIRS = \ + -I. -I$(TOPDIR)/src \ + -I$(TOPDIR)/libfetch + +INCLUDE_CXXFLAGS = \ + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)/src/libcrawlingwolf.a \ + $(TOPDIR)/libfetch/libfetch.a + +DYNAMIC_MODULE = \ + mod_fetcher_libfetch.so + +STATIC_LIB = \ + liblibfetchfetcher.a + +CPP_OBJS = \ + LibFetchFetcher.o \ + LibFetchRewindInputStream.o + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: + diff --git a/src/LibFetchFetcher.cpp b/src/modules/fetcher/libfetch/LibFetchFetcher.cpp index 976543f..5b770a7 100644 --- a/src/LibFetchFetcher.cpp +++ b/src/modules/fetcher/libfetch/LibFetchFetcher.cpp @@ -8,3 +8,5 @@ RewindInputStream *LibFetchFetcher::fetch( const URL url ) LibFetchRewindInputStream *s = new LibFetchRewindInputStream( url ); return s; } + +REGISTER_MODULE( "libfetch", Fetcher, LibFetchFetcher ) diff --git a/src/LibFetchFetcher.hpp b/src/modules/fetcher/libfetch/LibFetchFetcher.hpp index 5cb4677..1103612 100644 --- a/src/LibFetchFetcher.hpp +++ b/src/modules/fetcher/libfetch/LibFetchFetcher.hpp @@ -2,6 +2,7 @@ #define __LIBFETCH_FETCHER_H #include "Fetcher.hpp" +#include "ModuleRegistry.hpp" class LibFetchFetcher : public Fetcher { @@ -15,4 +16,6 @@ class LibFetchFetcher : public Fetcher virtual RewindInputStream *fetch( const URL url ); }; +DECLARE_MODULE( Fetcher ) + #endif diff --git a/src/LibFetchRewindInputStream.cpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp index 4e837c8..4e837c8 100644 --- a/src/LibFetchRewindInputStream.cpp +++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp diff --git a/src/LibFetchRewindInputStream.hpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp index f1896df..f1896df 100644 --- a/src/LibFetchRewindInputStream.hpp +++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp diff --git a/src/modules/frontier/GNUmakefile b/src/modules/frontier/GNUmakefile new file mode 100644 index 0000000..b7b54ae --- /dev/null +++ b/src/modules/frontier/GNUmakefile @@ -0,0 +1,17 @@ +TOPDIR = ../../.. + +SUBDIRS = memory + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: diff --git a/src/modules/frontier/memory/GNUmakefile b/src/modules/frontier/memory/GNUmakefile new file mode 100644 index 0000000..0d81f07 --- /dev/null +++ b/src/modules/frontier/memory/GNUmakefile @@ -0,0 +1,39 @@ +TOPDIR = ../../../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_DIRS = \ + -I. -I$(TOPDIR)/src + +INCLUDE_CXXFLAGS = \ + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)/src/libcrawlingwolf.a + +DYNAMIC_MODULE = \ + mod_frontier_memory.so + +STATIC_LIB = \ + libmemoryfrontier.a + +CPP_OBJS = \ + MemoryFrontier.o + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: + diff --git a/src/modules/frontier/memory/MemoryFrontier.cpp b/src/modules/frontier/memory/MemoryFrontier.cpp new file mode 100644 index 0000000..ada78dd --- /dev/null +++ b/src/modules/frontier/memory/MemoryFrontier.cpp @@ -0,0 +1,3 @@ +#include "MemoryFrontier.hpp" + +REGISTER_MODULE( "memory", Frontier, MemoryFrontier ) diff --git a/src/MemoryFrontier.hpp b/src/modules/frontier/memory/MemoryFrontier.hpp index 68f1906..d488d37 100644 --- a/src/MemoryFrontier.hpp +++ b/src/modules/frontier/memory/MemoryFrontier.hpp @@ -2,13 +2,15 @@ #define __MEMORY_FRONTIER_H #include "Frontier.hpp" +#include "ModuleRegistry.hpp" #include "Logger.hpp" #include <queue> class MemoryFrontier : public Frontier { public: - virtual ~MemoryFrontier( ) { + virtual ~MemoryFrontier( ) + { } URL getNextUrl( ) { @@ -29,4 +31,6 @@ class MemoryFrontier : public Frontier { queue<URL> m_urls; }; +DECLARE_MODULE( Frontier ) + #endif diff --git a/src/modules/processor/GNUmakefile b/src/modules/processor/GNUmakefile new file mode 100644 index 0000000..8bfd814 --- /dev/null +++ b/src/modules/processor/GNUmakefile @@ -0,0 +1,17 @@ +TOPDIR = ../../.. + +SUBDIRS = htmllinkextract + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: diff --git a/src/modules/processor/htmllinkextract/GNUmakefile b/src/modules/processor/htmllinkextract/GNUmakefile new file mode 100644 index 0000000..b32a980 --- /dev/null +++ b/src/modules/processor/htmllinkextract/GNUmakefile @@ -0,0 +1,41 @@ +TOPDIR = ../../../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_DIRS = \ + -I. -I$(TOPDIR)/src \ + -I$(TOPDIR)/streamhtmlparser + +INCLUDE_CXXFLAGS = \ + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)/src/libcrawlingwolf.a \ + $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a + +DYNAMIC_MODULE = \ + mod_processor_htmllinkextract.so + +STATIC_LIB = \ + libhtmllinkextractprocessor.a + +CPP_OBJS = \ + HTMLLinkExtractProcessor.o + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: + diff --git a/src/HTMLLinkExtractProcessor.cpp b/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.cpp index 8479ede..78e7b31 100644 --- a/src/HTMLLinkExtractProcessor.cpp +++ b/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.cpp @@ -67,3 +67,5 @@ void HTMLLinkExtractProcessor::process( RewindInputStream *s ) m_parser.Reset( ); } + +REGISTER_MODULE_4( "htmllinkextract", Processor, HTMLLinkExtractProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) diff --git a/src/HTMLLinkExtractProcessor.hpp b/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.hpp index 04b0600..d5c6bc6 100644 --- a/src/HTMLLinkExtractProcessor.hpp +++ b/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.hpp @@ -6,6 +6,7 @@ #include "Frontier.hpp" #include "URLFilter.hpp" #include "URLSeen.hpp" +#include "ModuleRegistry.hpp" #include "htmlparser_cpp.h" @@ -24,4 +25,6 @@ class HTMLLinkExtractProcessor : public Processor { URL m_baseUrl; }; +DECLARE_MODULE_4( Processor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) + #endif diff --git a/src/modules/urlseen/GNUmakefile b/src/modules/urlseen/GNUmakefile new file mode 100644 index 0000000..b7b54ae --- /dev/null +++ b/src/modules/urlseen/GNUmakefile @@ -0,0 +1,17 @@ +TOPDIR = ../../.. + +SUBDIRS = memory + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: diff --git a/src/modules/urlseen/memory/GNUmakefile b/src/modules/urlseen/memory/GNUmakefile new file mode 100644 index 0000000..73395f8 --- /dev/null +++ b/src/modules/urlseen/memory/GNUmakefile @@ -0,0 +1,39 @@ +TOPDIR = ../../../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_DIRS = \ + -I. -I$(TOPDIR)/src + +INCLUDE_CXXFLAGS = \ + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)/src/libcrawlingwolf.a + +DYNAMIC_MODULE = \ + mod_urlseen_memory.so + +STATIC_LIB = \ + libmemoryurlseen.a + +CPP_OBJS = \ + MemoryURLSeen.o + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: + diff --git a/src/MemoryURLSeen.cpp b/src/modules/urlseen/memory/MemoryURLSeen.cpp index 3d14aa2..15149e9 100644 --- a/src/MemoryURLSeen.cpp +++ b/src/modules/urlseen/memory/MemoryURLSeen.cpp @@ -20,3 +20,5 @@ bool MemoryURLSeen::seen( const URL url ) return hasSeen; } + +REGISTER_MODULE( "memory", URLSeen, MemoryURLSeen ) diff --git a/src/MemoryURLSeen.hpp b/src/modules/urlseen/memory/MemoryURLSeen.hpp index 6e6ccbd..35dcc4f 100644 --- a/src/MemoryURLSeen.hpp +++ b/src/modules/urlseen/memory/MemoryURLSeen.hpp @@ -2,6 +2,7 @@ #define __MEMORY_URLSEEN_H #include "URLSeen.hpp" +#include "ModuleRegistry.hpp" #include <set> @@ -15,4 +16,6 @@ class MemoryURLSeen : public URLSeen { set<URL> m_urls; }; +DECLARE_MODULE( URLSeen ) + #endif diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile index 6ca1f96..6a9104a 100644 --- a/tests/url/GNUmakefile +++ b/tests/url/GNUmakefile @@ -2,8 +2,8 @@ TOPDIR = ../.. SUBDIRS = -INCLUDE_CXXFLAGS = \ - -DUSE_MODULELOADER +#INCLUDE_CXXFLAGS = \ +# -DUSE_MODULELOADER INCLUDE_DIRS = \ -I$(TOPDIR)/src \ |