summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-08-06 17:16:08 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-08-06 17:16:08 +0200
commit01bcb80ac096de72694135dff37e2ff70c2ab572 (patch)
tree453cd7da8b1cdb67bee7a1eb1d450d94db6239ea
parente59855fd87bea3641846d6b589059230b08043f1 (diff)
downloadcrawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.gz
crawler-01bcb80ac096de72694135dff37e2ff70c2ab572.tar.bz2
first steps to make URL loader loadable
-rw-r--r--docs/LINKS2
-rw-r--r--src/Deduper.cpp5
-rw-r--r--src/Deduper.hpp2
-rw-r--r--src/GNUmakefile14
-rw-r--r--src/ModuleLoader.hpp81
-rw-r--r--src/ModuleRegistry.hpp21
-rw-r--r--src/crawlingwolf.cpp15
-rw-r--r--src/modules/GNUmakefile18
-rw-r--r--src/modules/urlnormalizer/GNUmakefile18
-rw-r--r--src/modules/urlnormalizer/googleurl/GNUmakefile38
-rw-r--r--src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp (renamed from src/GoogleURLNormalizer.cpp)12
-rw-r--r--src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.hpp (renamed from src/GoogleURLNormalizer.hpp)3
-rw-r--r--src/modules/urlnormalizer/simpleurl/GNUmakefile35
-rw-r--r--src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp (renamed from src/SimpleURLNormalizer.cpp)11
-rw-r--r--src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.hpp (renamed from src/SimpleURLNormalizer.hpp)3
15 files changed, 257 insertions, 21 deletions
diff --git a/docs/LINKS b/docs/LINKS
index e7c85bd..bca6daa 100644
--- a/docs/LINKS
+++ b/docs/LINKS
@@ -39,3 +39,5 @@ http://www.ibm.com/developerworks/linux/library/l-embed-lua/
Loadable modules in C++
http://www.isotton.com/devel/docs/C++-dlopen-mini-HOWTO/C++-dlopen-mini-HOWTO.html
+http://www.linuxjournal.com/article/3687?page=0,1
+http://www.artima.com/cppsource/subscription_problem.html
diff --git a/src/Deduper.cpp b/src/Deduper.cpp
deleted file mode 100644
index 451b8ab..0000000
--- a/src/Deduper.cpp
+++ /dev/null
@@ -1,5 +0,0 @@
-#include "Deduper.hpp"
-
-Deduper::~Deduper( )
-{
-}
diff --git a/src/Deduper.hpp b/src/Deduper.hpp
index 36421fa..3cb33c1 100644
--- a/src/Deduper.hpp
+++ b/src/Deduper.hpp
@@ -7,7 +7,7 @@
class Deduper
{
public:
- virtual ~Deduper( ) = 0;
+ virtual ~Deduper( ) { };
virtual bool contentSeen( const URL url, RewindInputStream *s ) = 0;
};
diff --git a/src/GNUmakefile b/src/GNUmakefile
index eaf57c8..5a25794 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ..
-SUBDIRS =
+SUBDIRS = modules
-include $(TOPDIR)/makefiles/gmake/platform.mk
@@ -10,14 +10,11 @@ INCLUDE_LDFLAGS = \
INCLUDE_DIRS = \
-I$(TOPDIR)/libfetch \
- -I$(TOPDIR)/streamhtmlparser \
- -I$(TOPDIR)/googleurl
+ -I$(TOPDIR)/streamhtmlparser
INCLUDE_LIBS = \
$(TOPDIR)/libfetch/libfetch.a \
- $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a \
- $(TOPDIR)/googleurl/libgoogleurl.a \
- -licui18n -licuuc
+ $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a
# openssl
ifeq ($(WITH_SSL),1)
@@ -33,14 +30,11 @@ LOCAL_STATIC_LIB_OBJS = \
URL.o \
LibFetchFetcher.o \
LibFetchRewindInputStream.o \
- Deduper.o \
HTMLLinkExtractProcessor.o \
ProtocolURLFilter.o \
HostURLFilter.o \
ChainURLFilter.o \
- MemoryURLSeen.o \
- SimpleURLNormalizer.o \
- GoogleURLNormalizer.o
+ MemoryURLSeen.o
CPP_OBJS = \
$(LOCAL_STATIC_LIB_OBJS)
diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp
new file mode 100644
index 0000000..c4b73dc
--- /dev/null
+++ b/src/ModuleLoader.hpp
@@ -0,0 +1,81 @@
+#ifndef __MODULELOADER_H
+#define __MODULELOADER_H
+
+#include <vector>
+#include <map>
+#include <string>
+#include <stdexcept>
+
+#include <dlfcn.h>
+
+#include "ModuleRegistry.hpp"
+
+template< typename Interface >
+struct Module { // bookkeeping record for one shared object opened by ModuleLoader
+ void *handle; // value returned by dlopen( ); passed to dlsym( ) and dlclose( )
+ ModuleRegistry<Interface> *registry; // address of the module's "registry" symbol as returned by dlsym( )
+};
+
+/**
+ * Opens a list of shared objects with dlopen( ) and creates/destroys
+ * Interface instances through the ModuleRegistry each shared object
+ * exports under the symbol name "registry".
+ *
+ * Modules are keyed by the name stored in their registry. All handles
+ * are closed with dlclose( ) when the loader is destroyed, so objects
+ * obtained from create( ) should be passed to destroy( ) before that.
+ */
+template< typename Interface >
+class ModuleLoader {
+
+    typedef typename std::map<std::string, Module< Interface > > mapType;
+
+    protected:
+        mapType m_modules;
+
+    private:
+        // Not copyable: a copy would dlclose( ) the very same handles a
+        // second time in its own destructor (declared, never defined).
+        ModuleLoader( const ModuleLoader & );
+        ModuleLoader &operator=( const ModuleLoader & );
+
+        // Closes every handle held in m_modules and empties the map.
+        void unloadAll( )
+        {
+            for( typename mapType::iterator it = m_modules.begin( ); it != m_modules.end( ); ++it ) {
+                dlclose( it->second.handle );
+            }
+            m_modules.clear( );
+        }
+
+    public:
+
+        /** Creates a loader without any modules. */
+        ModuleLoader( )
+        {
+        }
+
+        /**
+         * Loads every shared object listed in 'files'.
+         *
+         * Throws std::runtime_error if a file cannot be opened or does
+         * not export a "registry" symbol. Modules opened before the
+         * failure are closed again before the exception leaves the
+         * constructor (the destructor is not run in that case).
+         */
+        ModuleLoader( const std::vector<std::string> &files )
+        {
+            for( std::vector<std::string>::const_iterator it = files.begin( ); it != files.end( ); ++it ) {
+                Module<Interface> m;
+
+                m.handle = dlopen( it->c_str( ), RTLD_NOW );
+                if( !m.handle ) {
+                    // copy the message first: dlerror( ) may return NULL and
+                    // its buffer may be reused by the dlclose( ) calls below
+                    const char *err = dlerror( );
+                    const std::string msg( err ? err : "dlopen failed" );
+                    unloadAll( );
+                    throw std::runtime_error( msg );
+                }
+
+                m.registry = static_cast<ModuleRegistry<Interface> *>( dlsym( m.handle, "registry" ) );
+                if( !m.registry ) {
+                    dlclose( m.handle );
+                    unloadAll( );
+                    throw std::runtime_error( "missing module registry" );
+                }
+
+                // A later module with an already registered name still wins,
+                // but the handle it replaces is closed instead of leaked.
+                typename mapType::iterator old = m_modules.find( m.registry->name );
+                if( old != m_modules.end( ) ) {
+                    dlclose( old->second.handle );
+                    old->second = m;
+                } else {
+                    m_modules[m.registry->name] = m;
+                }
+            }
+        }
+
+        /** Closes all modules opened by this loader. */
+        ~ModuleLoader( )
+        {
+            unloadAll( );
+        }
+
+        /**
+         * Returns a new object made by the create function of the module
+         * registered as 'subclass'; throws std::runtime_error if no such
+         * module has been loaded.
+         */
+        Interface *create( const std::string &subclass ) const
+        {
+            typename mapType::const_iterator it = m_modules.find( subclass );
+            if( it == m_modules.end( ) ) {
+                throw std::runtime_error( "calling unknown constructor" );
+            }
+
+            return it->second.registry->create( );
+        }
+
+        /**
+         * Hands 'obj' to the destroy function of the module registered as
+         * 'subclass'; throws std::runtime_error if no such module has
+         * been loaded.
+         */
+        void destroy( const std::string &subclass, Interface *obj ) const
+        {
+            typename mapType::const_iterator it = m_modules.find( subclass );
+            if( it == m_modules.end( ) ) {
+                throw std::runtime_error( "calling unknown destructor" );
+            }
+
+            it->second.registry->destroy( obj );
+        }
+};
+
+#endif
diff --git a/src/ModuleRegistry.hpp b/src/ModuleRegistry.hpp
new file mode 100644
index 0000000..fbbdd40
--- /dev/null
+++ b/src/ModuleRegistry.hpp
@@ -0,0 +1,21 @@
+#ifndef __MODULEINTERFACE_H
+#define __MODULEINTERFACE_H
+
+#include <string>
+
+/**
+ * Descriptor through which a module publishes one implementation of
+ * Interface: a name plus the functions that create and destroy an
+ * instance.
+ */
+template< typename Interface >
+struct ModuleRegistry {
+    std::string name;                    // key under which the implementation is known
+    Interface *(*create)( );             // returns a new instance
+    void (*destroy)( Interface *obj );   // disposes of an instance
+
+    // Empty descriptor; the function pointers are zeroed so they never
+    // hold indeterminate values.
+    ModuleRegistry( ) : name( ), create( 0 ), destroy( 0 ) { }
+
+    // Fully initialised descriptor.
+    ModuleRegistry( const std::string &_name, Interface *(*_create)( ),
+        void (*_destroy)( Interface *obj ) )
+        : name( _name ), create( _create ), destroy( _destroy )
+    {
+    }
+};
+
+#endif
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index 080423d..213f9a5 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -6,10 +6,11 @@
#include "ProtocolURLFilter.hpp"
#include "HostURLFilter.hpp"
#include "MemoryURLSeen.hpp"
-#include "SimpleURLNormalizer.hpp"
-#include "GoogleURLNormalizer.hpp"
+#include "URLNormalizer.hpp"
+#include "ModuleLoader.hpp"
#include <set>
+#include <vector>
using namespace std;
@@ -33,8 +34,12 @@ int main( void )
ChainURLFilter filters( &protocolFilter, &hostFilter );
- //URLNormalizer *normalizer = new SimpleURLNormalizer( );
- URLNormalizer *normalizer = new GoogleURLNormalizer( );
+ vector<string> modules;
+ modules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
+ modules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
+ ModuleLoader<URLNormalizer> urlNormalizers( modules );
+ //URLNormalizer *normalizer = urlNormalizers.create( "simple" );
+ URLNormalizer *normalizer = urlNormalizers.create( "google" );
Processor *processor = new HTMLLinkExtractProcessor( normalizer, frontier, &filters, urlSeen );
@@ -59,7 +64,7 @@ int main( void )
}
delete processor;
- delete normalizer;
+ urlNormalizers.destroy( "google", normalizer );
delete urlSeen;
delete deduper;
delete fetcher;
diff --git a/src/modules/GNUmakefile b/src/modules/GNUmakefile
new file mode 100644
index 0000000..ddf5ee4
--- /dev/null
+++ b/src/modules/GNUmakefile
@@ -0,0 +1,18 @@
+# src/modules: lists the module category subdirectories (SUBDIRS) and
+# includes makefiles/gmake/sub.mk from the top directory.
+# NOTE(review): sub.mk is presumably what descends into SUBDIRS - confirm.
+TOPDIR = ../..
+
+SUBDIRS = urlnormalizer
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+# Local hook targets; none of them has a recipe in this directory.
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/urlnormalizer/GNUmakefile b/src/modules/urlnormalizer/GNUmakefile
new file mode 100644
index 0000000..83e369c
--- /dev/null
+++ b/src/modules/urlnormalizer/GNUmakefile
@@ -0,0 +1,18 @@
+# src/modules/urlnormalizer: lists the URL normalizer module directories
+# (simpleurl, googleurl) in SUBDIRS and includes makefiles/gmake/sub.mk.
+# NOTE(review): sub.mk is presumably what descends into SUBDIRS - confirm.
+TOPDIR = ../../..
+
+SUBDIRS = simpleurl googleurl
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+# Local hook targets; none of them has a recipe in this directory.
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/modules/urlnormalizer/googleurl/GNUmakefile b/src/modules/urlnormalizer/googleurl/GNUmakefile
new file mode 100644
index 0000000..cd52be9
--- /dev/null
+++ b/src/modules/urlnormalizer/googleurl/GNUmakefile
@@ -0,0 +1,38 @@
+# googleurl URL normalizer module: names mod_urlnormalizer_googleurl.so as
+# DYNAMIC_MODULE, built from GoogleURLNormalizer.o, with libgoogleurl.a
+# and the ICU libraries (-licui18n -licuuc) listed in INCLUDE_LIBS.
+# NOTE(review): the variables are presumably consumed by sub.mk - confirm.
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src \
+ -I$(TOPDIR)/googleurl
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)/googleurl/libgoogleurl.a \
+ -licui18n -licuuc
+
+DYNAMIC_MODULE = \
+ mod_urlnormalizer_googleurl.so
+
+CPP_OBJS = \
+ GoogleURLNormalizer.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+# Local hook targets; none of them has a recipe in this directory.
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/GoogleURLNormalizer.cpp b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp
index 023a9e4..e5810d6 100644
--- a/src/GoogleURLNormalizer.cpp
+++ b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp
@@ -103,3 +103,15 @@ URL GoogleURLNormalizer::normalize( const URL url, const string s )
componentString( canonical, parsed.path ),
"", "" );
}
+
+// Factory stored in the module registry: hands out a freshly allocated
+// GoogleURLNormalizer through the URLNormalizer interface.
+static URLNormalizer *createGoogleURLNormalizer( )
+{
+    URLNormalizer *normalizer = new GoogleURLNormalizer( );
+    return normalizer;
+}
+
+// Counterpart of the factory: releases a normalizer with delete.
+static void destroyGoogleURLNormalizer( URLNormalizer *normalizer )
+{
+    delete normalizer;
+}
+
+// Descriptor exported by this module under the name "google".
+ModuleRegistry<URLNormalizer> registry( "google", createGoogleURLNormalizer, destroyGoogleURLNormalizer );
diff --git a/src/GoogleURLNormalizer.hpp b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.hpp
index 1aa33bf..7fd3cfb 100644
--- a/src/GoogleURLNormalizer.hpp
+++ b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.hpp
@@ -2,6 +2,7 @@
#define __GOOGLEURLNORMALIZER_H
#include "URLNormalizer.hpp"
+#include "ModuleRegistry.hpp"
class GoogleURLNormalizer : public URLNormalizer {
public:
@@ -14,4 +15,6 @@ class GoogleURLNormalizer : public URLNormalizer {
virtual URL normalize( const URL url, const std::string s );
};
+extern "C" ModuleRegistry<URLNormalizer> registry;
+
#endif
diff --git a/src/modules/urlnormalizer/simpleurl/GNUmakefile b/src/modules/urlnormalizer/simpleurl/GNUmakefile
new file mode 100644
index 0000000..b6fc0a0
--- /dev/null
+++ b/src/modules/urlnormalizer/simpleurl/GNUmakefile
@@ -0,0 +1,35 @@
+# simpleurl URL normalizer module: names mod_urlnormalizer_simple.so as
+# DYNAMIC_MODULE, built from SimpleURLNormalizer.o; INCLUDE_LIBS is empty.
+# NOTE(review): the variables are presumably consumed by sub.mk - confirm.
+TOPDIR = ../../../..
+
+SUBDIRS =
+
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_DIRS = \
+ -I. -I$(TOPDIR)/src
+
+INCLUDE_CXXFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+
+DYNAMIC_MODULE = \
+ mod_urlnormalizer_simple.so
+
+CPP_OBJS = \
+ SimpleURLNormalizer.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+# Local hook targets; none of them has a recipe in this directory.
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
+
diff --git a/src/SimpleURLNormalizer.cpp b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
index 058dd6e..328a82b 100644
--- a/src/SimpleURLNormalizer.cpp
+++ b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp
@@ -146,3 +146,14 @@ void SimpleURLNormalizer::normalizePath( string &path )
}
}
+// Factory stored in the module registry: hands out a freshly allocated
+// SimpleURLNormalizer through the URLNormalizer interface.
+static URLNormalizer *createSimpleURLNormalizer( )
+{
+    URLNormalizer *normalizer = new SimpleURLNormalizer( );
+    return normalizer;
+}
+
+// Counterpart of the factory: releases a normalizer with delete.
+static void destroySimpleURLNormalizer( URLNormalizer *normalizer )
+{
+    delete normalizer;
+}
+
+// Descriptor exported by this module under the name "simple".
+ModuleRegistry<URLNormalizer> registry( "simple", createSimpleURLNormalizer, destroySimpleURLNormalizer );
diff --git a/src/SimpleURLNormalizer.hpp b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.hpp
index de478a4..1badaef 100644
--- a/src/SimpleURLNormalizer.hpp
+++ b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.hpp
@@ -2,6 +2,7 @@
#define __SIMPLEURLNORMALIZER_H
#include "URLNormalizer.hpp"
+#include "ModuleRegistry.hpp"
class SimpleURLNormalizer : public URLNormalizer {
public:
@@ -15,4 +16,6 @@ class SimpleURLNormalizer : public URLNormalizer {
void normalizePath( std::string &path );
};
+extern "C" ModuleRegistry<URLNormalizer> registry;
+
#endif