summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andreas Baumann <abaumann@yahoo.com> 2012-08-08 17:27:06 +0200
committer Andreas Baumann <abaumann@yahoo.com> 2012-08-08 17:27:06 +0200
commit 5e1a0dec4672f70d10b82364274353962d2d26f6 (patch)
tree 5b4b9d504bac7d55f29f2e6eaad8b53f3d3eff05
parent d560a4920cdd16904a71640e0cfc380ea34e9e6f (diff)
download crawler-5e1a0dec4672f70d10b82364274353962d2d26f6.tar.gz
         crawler-5e1a0dec4672f70d10b82364274353962d2d26f6.tar.bz2
modularized all other modules
-rw-r--r-- src/GNUmakefile 10
-rw-r--r-- src/MD5Deduper.hpp 21
-rw-r--r-- src/ModuleLoader.hpp 104
-rw-r--r-- src/ModuleRegistry.hpp 111
-rw-r--r-- src/crawlingwolf.cpp 52
-rw-r--r-- src/modules/GNUmakefile 2
-rw-r--r-- src/modules/deduper/GNUmakefile 17
-rw-r--r-- src/modules/deduper/null/GNUmakefile 39
-rw-r--r-- src/modules/deduper/null/NullDeduper.cpp 3
-rw-r--r-- src/modules/deduper/null/NullDeduper.hpp 24
-rw-r--r-- src/modules/fetcher/GNUmakefile 17
-rw-r--r-- src/modules/fetcher/libfetch/GNUmakefile 42
-rw-r--r-- src/modules/fetcher/libfetch/LibFetchFetcher.cpp (renamed from src/LibFetchFetcher.cpp) 2
-rw-r--r-- src/modules/fetcher/libfetch/LibFetchFetcher.hpp (renamed from src/LibFetchFetcher.hpp) 3
-rw-r--r-- src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp (renamed from src/LibFetchRewindInputStream.cpp) 0
-rw-r--r-- src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp (renamed from src/LibFetchRewindInputStream.hpp) 0
-rw-r--r-- src/modules/frontier/GNUmakefile 17
-rw-r--r-- src/modules/frontier/memory/GNUmakefile 39
-rw-r--r-- src/modules/frontier/memory/MemoryFrontier.cpp 3
-rw-r--r-- src/modules/frontier/memory/MemoryFrontier.hpp (renamed from src/MemoryFrontier.hpp) 6
-rw-r--r-- src/modules/processor/GNUmakefile 17
-rw-r--r-- src/modules/processor/htmllinkextract/GNUmakefile 41
-rw-r--r-- src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.cpp (renamed from src/HTMLLinkExtractProcessor.cpp) 2
-rw-r--r-- src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.hpp (renamed from src/HTMLLinkExtractProcessor.hpp) 3
-rw-r--r-- src/modules/urlseen/GNUmakefile 17
-rw-r--r-- src/modules/urlseen/memory/GNUmakefile 39
-rw-r--r-- src/modules/urlseen/memory/MemoryURLSeen.cpp (renamed from src/MemoryURLSeen.cpp) 2
-rw-r--r-- src/modules/urlseen/memory/MemoryURLSeen.hpp (renamed from src/MemoryURLSeen.hpp) 3
-rw-r--r-- tests/url/GNUmakefile 4
29 files changed, 579 insertions, 61 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 2b7c38e..8f1657e 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -9,12 +9,8 @@ INCLUDE_CPPFLAGS = \
INCLUDE_LDFLAGS = \
INCLUDE_DIRS = \
- -I$(TOPDIR)/libfetch \
- -I$(TOPDIR)/streamhtmlparser
INCLUDE_LIBS = \
- $(TOPDIR)/libfetch/libfetch.a \
- $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a
# openssl
ifeq ($(WITH_SSL),1)
@@ -27,11 +23,7 @@ INCLUDE_LIBS += \
endif
LOCAL_STATIC_LIB_OBJS = \
- URL.o \
- LibFetchFetcher.o \
- LibFetchRewindInputStream.o \
- HTMLLinkExtractProcessor.o \
- MemoryURLSeen.o
+ URL.o
CPP_OBJS = \
$(LOCAL_STATIC_LIB_OBJS)
diff --git a/src/MD5Deduper.hpp b/src/MD5Deduper.hpp
deleted file mode 100644
index 8372865..0000000
--- a/src/MD5Deduper.hpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __MD5DEDUPER_H
-#define __MD5DEDUPER_H
-
-#include "Deduper.hpp"
-
-class MD5Deduper : public Deduper {
- public:
- MD5Deduper( ) {
- }
-
- virtual ~MD5Deduper( ) {
- }
-
- virtual bool contentSeen( const URL url, RewindInputStream *s ) {
- (void)url;
- (void)s;
- return false;
- }
-};
-
-#endif
diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp
index 524a0e1..8ecccc3 100644
--- a/src/ModuleLoader.hpp
+++ b/src/ModuleLoader.hpp
@@ -81,6 +81,8 @@ class BaseModuleLoader {
template< typename Interface, typename CtorParams = NullType >
class ModuleLoader;
+// no param
+
template< typename Interface >
class ModuleLoader< Interface, NullType > : public BaseModuleLoader< Interface, NullType >
{
@@ -106,29 +108,113 @@ class ModuleLoader< Interface, NullType > : public BaseModuleLoader< Interface,
}
};
-template< typename Interface, typename P1 >
-class ModuleLoader< Interface, TYPELIST_1( P1 ) > : public BaseModuleLoader< Interface, TYPELIST_1( P1 ) >
+// one param
+
+template< typename Interface, typename T1 >
+class ModuleLoader< Interface, TYPELIST_1( T1 ) > : public BaseModuleLoader< Interface, TYPELIST_1( T1 ) >
+{
+ public:
+
+ ModuleLoader< Interface, TYPELIST_1( T1 ) >( const std::vector<std::string> files )
+ : BaseModuleLoader< Interface, TYPELIST_1( T1 ) >( files ) { }
+
+ Interface *create( std::string subclass, T1 t1 )
+ {
+ typename BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::m_modules.find( subclass );
+ if( it == BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::m_modules.end( ) ) {
+ throw std::runtime_error( "calling unknown constructor" );
+ }
+
+ Interface *obj = (*it).second.registry->create( t1 );
+
+ std::string clazz = demangle( typeid( *obj ) );
+
+ BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
+
+ return obj;
+ }
+};
+
+// two params
+
+template< typename Interface, typename T1, typename T2 >
+class ModuleLoader< Interface, TYPELIST_2( T1, T2 ) > : public BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >
+{
+ public:
+
+ ModuleLoader< Interface, TYPELIST_2( T1, T2 ) >( const std::vector<std::string> files )
+ : BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >( files ) { }
+
+ Interface *create( std::string subclass, T1 t1, T2 t2 )
+ {
+ typename BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::m_modules.find( subclass );
+ if( it == BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::m_modules.end( ) ) {
+ throw std::runtime_error( "calling unknown constructor" );
+ }
+
+ Interface *obj = (*it).second.registry->create( t1, t2 );
+
+ std::string clazz = demangle( typeid( *obj ) );
+
+ BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
+
+ return obj;
+ }
+};
+
+// three params
+
+template< typename Interface, typename T1, typename T2, typename T3 >
+class ModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) > : public BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >
{
public:
- ModuleLoader< Interface, TYPELIST_1( P1 ) >( const std::vector<std::string> files )
- : BaseModuleLoader< Interface, TYPELIST_1( P1 ) >( files ) { }
+ ModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >( const std::vector<std::string> files )
+ : BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >( files ) { }
- Interface *create( std::string subclass, P1 p1 )
+ Interface *create( std::string subclass, T1 t1, T2 t2, T3 t3 )
{
- typename BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.find( subclass );
- if( it == BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.end( ) ) {
+ typename BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::m_modules.find( subclass );
+ if( it == BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::m_modules.end( ) ) {
throw std::runtime_error( "calling unknown constructor" );
}
- Interface *obj = (*it).second.registry->create( p1 );
+ Interface *obj = (*it).second.registry->create( t1, t2, t3 );
std::string clazz = demangle( typeid( *obj ) );
- BaseModuleLoader< Interface, TYPELIST_1( P1 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
+ BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
return obj;
}
};
+// four params
+
+template< typename Interface, typename T1, typename T2, typename T3, typename T4 >
+class ModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) > : public BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >
+{
+ public:
+
+ ModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >( const std::vector<std::string> files )
+ : BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >( files ) { }
+
+ Interface *create( std::string subclass, T1 t1, T2 t2, T3 t3, T4 t4 )
+ {
+ typename BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::m_modules.find( subclass );
+ if( it == BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::m_modules.end( ) ) {
+ throw std::runtime_error( "calling unknown constructor" );
+ }
+
+ Interface *obj = (*it).second.registry->create( t1, t2, t3, t4 );
+
+ std::string clazz = demangle( typeid( *obj ) );
+
+ BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::m_modules.insert( std::make_pair( clazz, (*it).second ) );
+
+ return obj;
+ }
+};
+
+
#endif
diff --git a/src/ModuleRegistry.hpp b/src/ModuleRegistry.hpp
index 055cb1d..2b9f441 100644
--- a/src/ModuleRegistry.hpp
+++ b/src/ModuleRegistry.hpp
@@ -33,9 +33,50 @@ struct ModuleRegistry< Interface, TYPELIST_1( P1 ) > {
{
}
};
+
+template< typename Interface, typename P1, typename P2 >
+struct ModuleRegistry< Interface, TYPELIST_2( P1, P2 ) > {
+ std::string name;
+ Interface *(*create)( P1, P2 );
+ void (*destroy)( Interface *obj );
+
+ ModuleRegistry( std::string _name, Interface *(*_create)( P1, P2 ),
+ void (*_destroy)( Interface *obj ) )
+ : name( _name ), create( _create ), destroy( _destroy )
+ {
+ }
+};
+
+template< typename Interface, typename P1, typename P2, typename P3 >
+struct ModuleRegistry< Interface, TYPELIST_3( P1, P2, P3 ) > {
+ std::string name;
+ Interface *(*create)( P1, P2, P3 );
+ void (*destroy)( Interface *obj );
+
+ ModuleRegistry( std::string _name, Interface *(*_create)( P1, P2, P3 ),
+ void (*_destroy)( Interface *obj ) )
+ : name( _name ), create( _create ), destroy( _destroy )
+ {
+ }
+};
+
+template< typename Interface, typename P1, typename P2, typename P3, typename P4 >
+struct ModuleRegistry< Interface, TYPELIST_4( P1, P2, P3, P4 ) > {
+ std::string name;
+ Interface *(*create)( P1, P2, P3, P4 );
+ void (*destroy)( Interface *obj );
+
+ ModuleRegistry( std::string _name, Interface *(*_create)( P1, P2, P3, P4 ),
+ void (*_destroy)( Interface *obj ) )
+ : name( _name ), create( _create ), destroy( _destroy )
+ {
+ }
+};
#ifdef SHARED
+// no param macro
+
#define DECLARE_MODULE( baseClass ) \
extern ModuleRegistry<baseClass> registry ## _ ## baseClass;
@@ -52,13 +93,15 @@ static void destroy( baseClass *obj ) \
\
ModuleRegistry<baseClass> registry ## _ ## baseClass( name, &create, &destroy );
+// 1 param macro
+
#define DECLARE_MODULE_1( baseClass, T1 ) \
extern ModuleRegistry<baseClass, TYPELIST_1( T1 ) > registry ## _ ## baseClass;
#define REGISTER_MODULE_1( name, baseClass, subClass, T1 ) \
-static baseClass *create( T1 t ) \
+static baseClass *create( T1 t1 ) \
{ \
- return new subClass( t ); \
+ return new subClass( t1 ); \
} \
\
static void destroy( baseClass *obj ) \
@@ -68,13 +111,73 @@ static void destroy( baseClass *obj ) \
\
ModuleRegistry<baseClass, TYPELIST_1( T1 )> registry ## _ ## baseClass( name, &create, &destroy );
+// 2 param macro
+
+#define DECLARE_MODULE_2( baseClass, T1, T2 ) \
+ extern ModuleRegistry<baseClass, TYPELIST_2( T1, T2 ) > registry ## _ ## baseClass;
+
+#define REGISTER_MODULE_2( name, baseClass, subClass, T1, T2 ) \
+static baseClass *create( T1 t1, T2 t2 ) \
+{ \
+ return new subClass( t1, t2 ); \
+} \
+ \
+static void destroy( baseClass *obj ) \
+{ \
+ delete obj; \
+} \
+ \
+ModuleRegistry<baseClass, TYPELIST_2( T1, T2 )> registry ## _ ## baseClass( name, &create, &destroy );
+
+// 3 param macro
+
+#define DECLARE_MODULE_3( baseClass, T1, T2, T3 ) \
+ extern ModuleRegistry<baseClass, TYPELIST_3( T1, T2, T3 ) > registry ## _ ## baseClass;
+
+#define REGISTER_MODULE_3( name, baseClass, subClass, T1, T2, T3 ) \
+static baseClass *create( T1 t1, T2 t2, T3 t3 ) \
+{ \
+ return new subClass( t1, t2, t3 ); \
+} \
+ \
+static void destroy( baseClass *obj ) \
+{ \
+ delete obj; \
+} \
+ \
+ModuleRegistry<baseClass, TYPELIST_3( T1, T2, T3 )> registry ## _ ## baseClass( name, &create, &destroy );
+
+// 4 param macro
+
+#define DECLARE_MODULE_4( baseClass, T1, T2, T3, T4 ) \
+ extern ModuleRegistry<baseClass, TYPELIST_4( T1, T2, T3, T4 ) > registry ## _ ## baseClass;
+
+#define REGISTER_MODULE_4( name, baseClass, subClass, T1, T2, T3, T4 ) \
+static baseClass *create( T1 t1, T2 t2, T3 t3, T4 t4 ) \
+{ \
+ return new subClass( t1, t2, t3, t4 ); \
+} \
+ \
+static void destroy( baseClass *obj ) \
+{ \
+ delete obj; \
+} \
+ \
+ModuleRegistry<baseClass, TYPELIST_4( T1, T2, T3, T4 )> registry ## _ ## baseClass( name, &create, &destroy );
+
#else // SHARED
#define DECLARE_MODULE( baseClass )
-#define DECLARE_MODULE_1( baseClass, T )
+#define DECLARE_MODULE_1( baseClass, T1 )
+#define DECLARE_MODULE_2( baseClass, T1, T2 )
+#define DECLARE_MODULE_3( baseClass, T1, T2, T3 )
+#define DECLARE_MODULE_4( baseClass, T1, T2, T3, T4 )
#define REGISTER_MODULE( name, baseClass, subClass )
-#define REGISTER_MODULE_1( name, baseClass, subClass, T )
+#define REGISTER_MODULE_1( name, baseClass, subClass, T1 )
+#define REGISTER_MODULE_2( name, baseClass, subClass, T1, T2 )
+#define REGISTER_MODULE_3( name, baseClass, subClass, T1, T2, T3 )
+#define REGISTER_MODULE_4( name, baseClass, subClass, T1, T2, T3, T4 )
#endif // SHARED
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index 1c0576f..0fb8697 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -1,12 +1,15 @@
-#include "LibFetchFetcher.hpp"
-#include "MemoryFrontier.hpp"
-#include "MD5Deduper.hpp"
-#include "HTMLLinkExtractProcessor.hpp"
-#include "MemoryURLSeen.hpp"
+#include "Fetcher.hpp"
+#include "Frontier.hpp"
+#include "Deduper.hpp"
+#include "Processor.hpp"
+#include "URLSeen.hpp"
#include "URLNormalizer.hpp"
#include "URLFilter.hpp"
+
#include "ModuleLoader.hpp"
+#include "Logger.hpp"
+
#include <set>
#include <vector>
#include <list>
@@ -31,10 +34,30 @@ int main( void )
filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" );
ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules );
- Frontier *frontier = new MemoryFrontier( );
- Fetcher *fetcher = new LibFetchFetcher( );
- Deduper *deduper = new MD5Deduper( );
- URLSeen *urlSeen = new MemoryURLSeen( );
+ vector<string> frontierModules;
+ frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" );
+ ModuleLoader<Frontier> frontiers( frontierModules );
+
+ vector<string> fetcherModules;
+ fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" );
+ ModuleLoader<Fetcher> fetchers( fetcherModules );
+
+ vector<string> urlseenModules;
+ urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" );
+ ModuleLoader<URLSeen> urlSeens( urlseenModules );
+
+ vector<string> deduperModules;
+ deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" );
+ ModuleLoader<Deduper> dedupers( deduperModules );
+
+ vector<string> processorModules;
+ processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" );
+ ModuleLoader<Processor, TYPELIST_4( URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) > processors( processorModules );
+
+ Frontier *frontier = frontiers.create( "memory" );
+ Fetcher *fetcher = fetchers.create( "libfetch" );
+ Deduper *deduper = dedupers.create( "null" );
+ URLSeen *urlSeen = urlSeens.create( "memory" );
set<string> protocols;
protocols.insert( "http" );
@@ -52,7 +75,8 @@ int main( void )
URLNormalizer *normalizer = urlNormalizers.create( "google" );
- Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, chainFilter, urlSeen );
+ Processor *processor = processors.create( "htmllinkextract",
+ normalizer, frontier, chainFilter, urlSeen );
LOG( logNOTICE ) << "Crawler started..";
@@ -79,10 +103,10 @@ int main( void )
urlChainFilter.destroy( chainFilter );
urlFilters.destroy( protocolFilter );
urlFilters.destroy( hostFilter );
- delete urlSeen;
- delete deduper;
- delete fetcher;
- delete frontier;
+ urlSeens.destroy( urlSeen );
+ dedupers.destroy( deduper );
+ fetchers.destroy( fetcher );
+ frontiers.destroy( frontier );
LOG( logNOTICE ) << "Crawler stopped..";
diff --git a/src/modules/GNUmakefile b/src/modules/GNUmakefile
index cd45705..31dc26c 100644
--- a/src/modules/GNUmakefile
+++ b/src/modules/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ../..
-SUBDIRS = urlnormalizer urlfilter
+SUBDIRS = urlnormalizer urlfilter frontier fetcher urlseen deduper processor
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/src/modules/deduper/GNUmakefile b/src/modules/deduper/GNUmakefile
new file mode 100644
index 0000000..1a63a91
--- /dev/null
+++ b/src/modules/deduper/GNUmakefile
@@ -0,0 +1,17 @@
+TOPDIR = ../../..
+
+SUBDIRS = null
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
diff --git a/src/modules/deduper/null/GNUmakefile b/src/modules/deduper/null/GNUmakefile
new file mode 100644
index 0000000..ae9663f
--- /dev/null
+++ b/src/modules/deduper/null/GNUmakefile
@@ -0,0 +1,39 @@
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a
+
+DYNAMIC_MODULE = \
+ mod_deduper_null.so
+
+STATIC_LIB = \
+ libnulldeduper.a
+
+CPP_OBJS = \
+ NullDeduper.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/deduper/null/NullDeduper.cpp b/src/modules/deduper/null/NullDeduper.cpp
new file mode 100644
index 0000000..9eca5c4
--- /dev/null
+++ b/src/modules/deduper/null/NullDeduper.cpp
@@ -0,0 +1,3 @@
+#include "NullDeduper.hpp"
+
+REGISTER_MODULE( "null", Deduper, NullDeduper )
diff --git a/src/modules/deduper/null/NullDeduper.hpp b/src/modules/deduper/null/NullDeduper.hpp
new file mode 100644
index 0000000..90de4dd
--- /dev/null
+++ b/src/modules/deduper/null/NullDeduper.hpp
@@ -0,0 +1,24 @@
+#ifndef __NULLDEDUPER_H
+#define __NULLDEDUPER_H
+
+#include "Deduper.hpp"
+#include "ModuleRegistry.hpp"
+
+class NullDeduper : public Deduper {
+ public:
+ NullDeduper( ) {
+ }
+
+ virtual ~NullDeduper( ) {
+ }
+
+ virtual bool contentSeen( const URL url, RewindInputStream *s ) {
+ (void)url;
+ (void)s;
+ return false;
+ }
+};
+
+DECLARE_MODULE( Deduper )
+
+#endif
diff --git a/src/modules/fetcher/GNUmakefile b/src/modules/fetcher/GNUmakefile
new file mode 100644
index 0000000..526e9e5
--- /dev/null
+++ b/src/modules/fetcher/GNUmakefile
@@ -0,0 +1,17 @@
+TOPDIR = ../../..
+
+SUBDIRS = libfetch
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
diff --git a/src/modules/fetcher/libfetch/GNUmakefile b/src/modules/fetcher/libfetch/GNUmakefile
new file mode 100644
index 0000000..707a4c0
--- /dev/null
+++ b/src/modules/fetcher/libfetch/GNUmakefile
@@ -0,0 +1,42 @@
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src \
+ -I$(TOPDIR)/libfetch
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a \
+ $(TOPDIR)/libfetch/libfetch.a
+
+DYNAMIC_MODULE = \
+ mod_fetcher_libfetch.so
+
+STATIC_LIB = \
+ liblibfetchfetcher.a
+
+CPP_OBJS = \
+ LibFetchFetcher.o \
+ LibFetchRewindInputStream.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/LibFetchFetcher.cpp b/src/modules/fetcher/libfetch/LibFetchFetcher.cpp
index 976543f..5b770a7 100644
--- a/src/LibFetchFetcher.cpp
+++ b/src/modules/fetcher/libfetch/LibFetchFetcher.cpp
@@ -8,3 +8,5 @@ RewindInputStream *LibFetchFetcher::fetch( const URL url )
LibFetchRewindInputStream *s = new LibFetchRewindInputStream( url );
return s;
}
+
+REGISTER_MODULE( "libfetch", Fetcher, LibFetchFetcher )
diff --git a/src/LibFetchFetcher.hpp b/src/modules/fetcher/libfetch/LibFetchFetcher.hpp
index 5cb4677..1103612 100644
--- a/src/LibFetchFetcher.hpp
+++ b/src/modules/fetcher/libfetch/LibFetchFetcher.hpp
@@ -2,6 +2,7 @@
#define __LIBFETCH_FETCHER_H
#include "Fetcher.hpp"
+#include "ModuleRegistry.hpp"
class LibFetchFetcher : public Fetcher
{
@@ -15,4 +16,6 @@ class LibFetchFetcher : public Fetcher
virtual RewindInputStream *fetch( const URL url );
};
+DECLARE_MODULE( Fetcher )
+
#endif
diff --git a/src/LibFetchRewindInputStream.cpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp
index 4e837c8..4e837c8 100644
--- a/src/LibFetchRewindInputStream.cpp
+++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp
diff --git a/src/LibFetchRewindInputStream.hpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
index f1896df..f1896df 100644
--- a/src/LibFetchRewindInputStream.hpp
+++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
diff --git a/src/modules/frontier/GNUmakefile b/src/modules/frontier/GNUmakefile
new file mode 100644
index 0000000..b7b54ae
--- /dev/null
+++ b/src/modules/frontier/GNUmakefile
@@ -0,0 +1,17 @@
+TOPDIR = ../../..
+
+SUBDIRS = memory
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
diff --git a/src/modules/frontier/memory/GNUmakefile b/src/modules/frontier/memory/GNUmakefile
new file mode 100644
index 0000000..0d81f07
--- /dev/null
+++ b/src/modules/frontier/memory/GNUmakefile
@@ -0,0 +1,39 @@
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a
+
+DYNAMIC_MODULE = \
+ mod_frontier_memory.so
+
+STATIC_LIB = \
+ libmemoryfrontier.a
+
+CPP_OBJS = \
+ MemoryFrontier.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/frontier/memory/MemoryFrontier.cpp b/src/modules/frontier/memory/MemoryFrontier.cpp
new file mode 100644
index 0000000..ada78dd
--- /dev/null
+++ b/src/modules/frontier/memory/MemoryFrontier.cpp
@@ -0,0 +1,3 @@
+#include "MemoryFrontier.hpp"
+
+REGISTER_MODULE( "memory", Frontier, MemoryFrontier )
diff --git a/src/MemoryFrontier.hpp b/src/modules/frontier/memory/MemoryFrontier.hpp
index 68f1906..d488d37 100644
--- a/src/MemoryFrontier.hpp
+++ b/src/modules/frontier/memory/MemoryFrontier.hpp
@@ -2,13 +2,15 @@
#define __MEMORY_FRONTIER_H
#include "Frontier.hpp"
+#include "ModuleRegistry.hpp"
#include "Logger.hpp"
#include <queue>
class MemoryFrontier : public Frontier {
public:
- virtual ~MemoryFrontier( ) {
+ virtual ~MemoryFrontier( )
+ {
}
URL getNextUrl( ) {
@@ -29,4 +31,6 @@ class MemoryFrontier : public Frontier {
queue<URL> m_urls;
};
+DECLARE_MODULE( Frontier )
+
#endif
diff --git a/src/modules/processor/GNUmakefile b/src/modules/processor/GNUmakefile
new file mode 100644
index 0000000..8bfd814
--- /dev/null
+++ b/src/modules/processor/GNUmakefile
@@ -0,0 +1,17 @@
+TOPDIR = ../../..
+
+SUBDIRS = htmllinkextract
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
diff --git a/src/modules/processor/htmllinkextract/GNUmakefile b/src/modules/processor/htmllinkextract/GNUmakefile
new file mode 100644
index 0000000..b32a980
--- /dev/null
+++ b/src/modules/processor/htmllinkextract/GNUmakefile
@@ -0,0 +1,41 @@
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src \
+ -I$(TOPDIR)/streamhtmlparser
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a \
+ $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a
+
+DYNAMIC_MODULE = \
+ mod_processor_htmllinkextract.so
+
+STATIC_LIB = \
+ libhtmllinkextractprocessor.a
+
+CPP_OBJS = \
+ HTMLLinkExtractProcessor.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/HTMLLinkExtractProcessor.cpp b/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.cpp
index 8479ede..78e7b31 100644
--- a/src/HTMLLinkExtractProcessor.cpp
+++ b/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.cpp
@@ -67,3 +67,5 @@ void HTMLLinkExtractProcessor::process( RewindInputStream *s )
m_parser.Reset( );
}
+
+REGISTER_MODULE_4( "htmllinkextract", Processor, HTMLLinkExtractProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
diff --git a/src/HTMLLinkExtractProcessor.hpp b/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.hpp
index 04b0600..d5c6bc6 100644
--- a/src/HTMLLinkExtractProcessor.hpp
+++ b/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.hpp
@@ -6,6 +6,7 @@
#include "Frontier.hpp"
#include "URLFilter.hpp"
#include "URLSeen.hpp"
+#include "ModuleRegistry.hpp"
#include "htmlparser_cpp.h"
@@ -24,4 +25,6 @@ class HTMLLinkExtractProcessor : public Processor {
URL m_baseUrl;
};
+DECLARE_MODULE_4( Processor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * )
+
#endif
diff --git a/src/modules/urlseen/GNUmakefile b/src/modules/urlseen/GNUmakefile
new file mode 100644
index 0000000..b7b54ae
--- /dev/null
+++ b/src/modules/urlseen/GNUmakefile
@@ -0,0 +1,17 @@
+TOPDIR = ../../..
+
+SUBDIRS = memory
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
diff --git a/src/modules/urlseen/memory/GNUmakefile b/src/modules/urlseen/memory/GNUmakefile
new file mode 100644
index 0000000..73395f8
--- /dev/null
+++ b/src/modules/urlseen/memory/GNUmakefile
@@ -0,0 +1,39 @@
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a
+
+DYNAMIC_MODULE = \
+ mod_urlseen_memory.so
+
+STATIC_LIB = \
+ libmemoryurlseen.a
+
+CPP_OBJS = \
+ MemoryURLSeen.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/MemoryURLSeen.cpp b/src/modules/urlseen/memory/MemoryURLSeen.cpp
index 3d14aa2..15149e9 100644
--- a/src/MemoryURLSeen.cpp
+++ b/src/modules/urlseen/memory/MemoryURLSeen.cpp
@@ -20,3 +20,5 @@ bool MemoryURLSeen::seen( const URL url )
return hasSeen;
}
+
+REGISTER_MODULE( "memory", URLSeen, MemoryURLSeen )
diff --git a/src/MemoryURLSeen.hpp b/src/modules/urlseen/memory/MemoryURLSeen.hpp
index 6e6ccbd..35dcc4f 100644
--- a/src/MemoryURLSeen.hpp
+++ b/src/modules/urlseen/memory/MemoryURLSeen.hpp
@@ -2,6 +2,7 @@
#define __MEMORY_URLSEEN_H
#include "URLSeen.hpp"
+#include "ModuleRegistry.hpp"
#include <set>
@@ -15,4 +16,6 @@ class MemoryURLSeen : public URLSeen {
set<URL> m_urls;
};
+DECLARE_MODULE( URLSeen )
+
#endif
diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile
index 6ca1f96..6a9104a 100644
--- a/tests/url/GNUmakefile
+++ b/tests/url/GNUmakefile
@@ -2,8 +2,8 @@ TOPDIR = ../..
SUBDIRS =
-INCLUDE_CXXFLAGS = \
- -DUSE_MODULELOADER
+#INCLUDE_CXXFLAGS = \
+# -DUSE_MODULELOADER
INCLUDE_DIRS = \
-I$(TOPDIR)/src \