diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-12 16:30:33 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-12 16:30:33 +0200 |
commit | 561646e33cee7d2437b2732c484d2ea22312e925 (patch) | |
tree | 11e4899f6b413b179a48d57ef0576b25514463de | |
parent | bb12c70e96e7fa3b13335dbfe877dfc68861237f (diff) | |
download | crawler-561646e33cee7d2437b2732c484d2ea22312e925.tar.gz crawler-561646e33cee7d2437b2732c484d2ea22312e925.tar.bz2 |
better naming of modules
23 files changed, 86 insertions, 46 deletions
diff --git a/src/Makefile.W32 b/src/Makefile.W32 index a3dd050..b1fd26f 100755 --- a/src/Makefile.W32 +++ b/src/Makefile.W32 @@ -48,8 +48,8 @@ local_distclean: local_test: copy_prereq: + @-copy "$(ICU_DIR)\bin\icuuc49.dll" . >NUL + @-copy "$(ICU_DIR)\bin\icudt49.dll" . >NUL run: copy_prereq - @-echo echo Running Crawlingwolf... > test.bat - @-echo crawlingwolf.exe >> test.bat - @-test.bat + @-crawlingwolf.exe diff --git a/src/ModuleLoader.hpp b/src/ModuleLoader.hpp index 8b00750..07328e8 100755 --- a/src/ModuleLoader.hpp +++ b/src/ModuleLoader.hpp @@ -6,6 +6,7 @@ #include <string> #include <stdexcept> #include <typeinfo> +#include <sstream> #ifndef _WIN32 #include <dlfcn.h> @@ -125,7 +126,9 @@ class ModuleLoader< Interface, NullType > : public BaseModuleLoader< Interface, { typename BaseModuleLoader< Interface >::mapType::const_iterator it = BaseModuleLoader< Interface >::m_modules.find( subclass ); if( it == BaseModuleLoader< Interface >::m_modules.end( ) ) { - throw std::runtime_error( "calling unknown constructor" ); + std::ostringstream ss; + ss << "calling unknown constructor for class '" << subclass << "'"; + throw std::runtime_error( ss.str( ) ); } Interface *obj = (*it).second.registry->create( ); @@ -152,7 +155,9 @@ class ModuleLoader< Interface, TYPELIST_1( T1 ) > : public BaseModuleLoader< Int { typename BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::m_modules.find( subclass ); if( it == BaseModuleLoader< Interface, TYPELIST_1( T1 ) >::m_modules.end( ) ) { - throw std::runtime_error( "calling unknown constructor" ); + std::ostringstream ss; + ss << "calling unknown constructor for class '" << subclass << "'"; + throw std::runtime_error( ss.str( ) ); } Interface *obj = (*it).second.registry->create( t1 ); @@ -179,7 +184,9 @@ class ModuleLoader< Interface, TYPELIST_2( T1, T2 ) > : public BaseModuleLoader< { typename BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::m_modules.find( subclass ); if( it == BaseModuleLoader< Interface, TYPELIST_2( T1, T2 ) >::m_modules.end( ) ) { - throw std::runtime_error( "calling unknown constructor" ); + std::ostringstream ss; + ss << "calling unknown constructor for class '" << subclass << "'"; + throw std::runtime_error( ss.str( ) ); } Interface *obj = (*it).second.registry->create( t1, t2 ); @@ -206,7 +213,9 @@ class ModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) > : public BaseModuleLoa { typename BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::m_modules.find( subclass ); if( it == BaseModuleLoader< Interface, TYPELIST_3( T1, T2, T3 ) >::m_modules.end( ) ) { - throw std::runtime_error( "calling unknown constructor" ); + std::ostringstream ss; + ss << "calling unknown constructor for class '" << subclass << "'"; + throw std::runtime_error( ss.str( ) ); } Interface *obj = (*it).second.registry->create( t1, t2, t3 ); @@ -233,7 +242,9 @@ class ModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) > : public BaseModul { typename BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::mapType::const_iterator it = BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::m_modules.find( subclass ); if( it == BaseModuleLoader< Interface, TYPELIST_4( T1, T2, T3, T4 ) >::m_modules.end( ) ) { - throw std::runtime_error( "calling unknown constructor" ); + std::ostringstream ss; + ss << "calling unknown constructor for class '" << subclass << "'"; + throw std::runtime_error( ss.str( ) ); } Interface *obj = (*it).second.registry->create( t1, t2, t3, t4 ); diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index 288769b..f96ecbb 100755 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -77,6 +77,7 @@ int main( void ) normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" ); #else normalizerModules.push_back( ".\\modules\\urlnormalizer\\simpleurl\\mod_urlnormalizer_simple.dll" ); + normalizerModules.push_back( ".\\modules\\urlnormalizer\\googleurl\\mod_urlnormalizer_googleurl.dll" ); #endif ModuleLoader<URLNormalizer> urlNormalizers( normalizerModules ); @@ -84,12 +85,17 @@ int main( void ) #ifndef _WIN32 filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" ); filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" ); +#else + normalizerModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" ); + normalizerModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" ); #endif ModuleLoader<URLFilter, TYPELIST_1( const set<string> ) > urlFilters( filterModules ); vector<string> filterChainModules; #ifndef _WIN32 filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" ); +#else + normalizerModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" ); #endif ModuleLoader<URLFilter, TYPELIST_1( const list<URLFilter *> ) > urlChainFilter( filterChainModules ); @@ -129,30 +135,30 @@ int main( void ) #endif ModuleLoader<TypeDetect> typeDetectors( typeDetectModules ); - Frontier *frontier = frontiers.create( "memory" ); - Fetcher *fetcher = fetchers.create( "libfetch" ); - Deduper *deduper = dedupers.create( "null" ); - URLSeen *urlSeen = urlSeens.create( "memory" ); - TypeDetect *typeDetect = typeDetectors.create( "libmagic" ); + Frontier *frontier = frontiers.create( "memory_frontier" ); + Fetcher *fetcher = fetchers.create( "libfetch_fetcher" ); + Deduper *deduper = dedupers.create( "null_deduper" ); + URLSeen *urlSeen = urlSeens.create( "memory_urlseen" ); + TypeDetect *typeDetect = typeDetectors.create( "libmagic_typedetect" ); set<string> protocols; protocols.insert( "http" ); protocols.insert( "https" ); - URLFilter *protocolFilter = urlFilters.create( "protocol", protocols ); + URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols ); set<string> hosts; hosts.insert( "www.andreasbaumann.cc" ); - URLFilter *hostFilter = urlFilters.create( "host", hosts ); + URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts ); list<URLFilter *> filters; filters.push_back( hostFilter ); filters.push_back( protocolFilter ); - URLFilter *chainFilter = urlChainFilter.create( "chain", filters ); + URLFilter *chainFilter = urlChainFilter.create( "chain_urlfilter", filters ); - URLNormalizer *normalizer = urlNormalizers.create( "google" ); -// URLNormalizer *normalizer = urlNormalizers.create( "simple" ); + URLNormalizer *normalizer = urlNormalizers.create( "google_urlnormalizer" ); +// URLNormalizer *normalizer = urlNormalizers.create( "simple_urlnormalizer" ); - Processor *htmlParser = processors.create( "htmllinkextract", + Processor *htmlParser = processors.create( "htmllinkextract_processor", normalizer, frontier, chainFilter, urlSeen ); LOG( logNOTICE ) << "Crawler started.."; diff --git a/src/modules/deduper/null/NullDeduper.cpp b/src/modules/deduper/null/NullDeduper.cpp index 9eca5c4..6d56cb0 100644 --- a/src/modules/deduper/null/NullDeduper.cpp +++ b/src/modules/deduper/null/NullDeduper.cpp @@ -1,3 +1,3 @@ #include "NullDeduper.hpp" -REGISTER_MODULE( "null", Deduper, NullDeduper ) +REGISTER_MODULE( "null_deduper", Deduper, NullDeduper ) diff --git a/src/modules/fetcher/file/FileFetcher.cpp b/src/modules/fetcher/file/FileFetcher.cpp index 8d66e14..b344b3f 100644 --- a/src/modules/fetcher/file/FileFetcher.cpp +++ b/src/modules/fetcher/file/FileFetcher.cpp @@ -9,4 +9,4 @@ RewindInputStream *FileFetcher::fetch( const URL url ) return s; } -REGISTER_MODULE( "file", Fetcher, FileFetcher ) +REGISTER_MODULE( "file_fetcher", Fetcher, FileFetcher ) diff --git a/src/modules/fetcher/libfetch/LibFetchFetcher.cpp b/src/modules/fetcher/libfetch/LibFetchFetcher.cpp index 5b770a7..9cbc926 100644 --- a/src/modules/fetcher/libfetch/LibFetchFetcher.cpp +++ b/src/modules/fetcher/libfetch/LibFetchFetcher.cpp @@ -9,4 +9,4 @@ RewindInputStream *LibFetchFetcher::fetch( const URL url ) return s; } -REGISTER_MODULE( "libfetch", Fetcher, LibFetchFetcher ) +REGISTER_MODULE( "libfetch_fetcher", Fetcher, LibFetchFetcher ) diff --git a/src/modules/fetcher/winhttp/Makefile.W32 b/src/modules/fetcher/winhttp/Makefile.W32 index 49874e0..ddf751a 100755 --- a/src/modules/fetcher/winhttp/Makefile.W32 +++ b/src/modules/fetcher/winhttp/Makefile.W32 @@ -14,7 +14,9 @@ INCLUDE_DIRS = \ INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\src\crawlingwolf.lib + $(TOPDIR)\src\crawlingwolf.lib \ + WinHttp.lib + DYNAMIC_MODULE = \ mod_fetcher_winhttp.dll diff --git a/src/modules/fetcher/winhttp/WinHttpFetcher.cpp b/src/modules/fetcher/winhttp/WinHttpFetcher.cpp index 06ab550..a22ab1a 100755 --- a/src/modules/fetcher/winhttp/WinHttpFetcher.cpp +++ b/src/modules/fetcher/winhttp/WinHttpFetcher.cpp @@ -1,10 +1,24 @@ #include "WinHttpFetcher.hpp" #include "WinHttpRewindInputStream.hpp" +WinHttpFetcher::WinHttpFetcher( ) + : m_session( 0 ) +{ + m_session = WinHttpOpen( L"WinHTTP CrawlingWolf/0.0.1", + WINHTTP_ACCESS_TYPE_DEFAULT_PROXY, + WINHTTP_NO_PROXY_NAME, + WINHTTP_NO_PROXY_BYPASS, 0 ); +} + +WinHttpFetcher::~WinHttpFetcher( ) +{ + WinHttpCloseHandle( m_session ); +} + RewindInputStream *WinHttpFetcher::fetch( const URL url ) { WinHttpRewindInputStream *s = new WinHttpRewindInputStream( url ); return s; } -REGISTER_MODULE( "winhttp", Fetcher, WinHttpFetcher ) +REGISTER_MODULE( "winhttp_fetcher", Fetcher, WinHttpFetcher ) diff --git a/src/modules/fetcher/winhttp/WinHttpFetcher.hpp b/src/modules/fetcher/winhttp/WinHttpFetcher.hpp index a731da6..5854738 100755 --- a/src/modules/fetcher/winhttp/WinHttpFetcher.hpp +++ b/src/modules/fetcher/winhttp/WinHttpFetcher.hpp @@ -4,16 +4,23 @@ #include "Fetcher.hpp" #include "ModuleRegistry.hpp" +#define WIN32_MEAN_AND_LEAN +#include <windows.h> +#include <winhttp.h> + class WinHttpFetcher : public Fetcher { public: - WinHttpFetcher( ) { - } + WinHttpFetcher( ); - virtual ~WinHttpFetcher( ) { - } + virtual ~WinHttpFetcher( ); virtual RewindInputStream *fetch( const URL url ); + + HINTERNET &session( ) { return m_session; } + + private: + HINTERNET m_session; }; DECLARE_MODULE( Fetcher ) diff --git a/src/modules/frontier/memory/MemoryFrontier.cpp b/src/modules/frontier/memory/MemoryFrontier.cpp index ada78dd..2311353 100644 --- a/src/modules/frontier/memory/MemoryFrontier.cpp +++ b/src/modules/frontier/memory/MemoryFrontier.cpp @@ -1,3 +1,3 @@ #include "MemoryFrontier.hpp" -REGISTER_MODULE( "memory", Frontier, MemoryFrontier ) +REGISTER_MODULE( "memory_frontier", Frontier, MemoryFrontier ) diff --git a/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.cpp b/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.cpp index 78e7b31..f575a0c 100644 --- a/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.cpp +++ b/src/modules/processor/htmllinkextract/HTMLLinkExtractProcessor.cpp @@ -68,4 +68,4 @@ void HTMLLinkExtractProcessor::process( RewindInputStream *s ) m_parser.Reset( ); } -REGISTER_MODULE_4( "htmllinkextract", Processor, HTMLLinkExtractProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) +REGISTER_MODULE_4( "htmllinkextract_processor", Processor, HTMLLinkExtractProcessor, URLNormalizer *, Frontier *, URLFilter *, URLSeen * ) diff --git a/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp b/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp index b65d6eb..cdc8926 100644 --- a/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp +++ b/src/modules/typedetect/libmagic/LibMagicTypeDetect.cpp @@ -61,4 +61,4 @@ MIMEType LibMagicTypeDetect::detect( RewindInputStream *s ) return MIMEType( res ); } -REGISTER_MODULE( "libmagic", TypeDetect, LibMagicTypeDetect ) +REGISTER_MODULE( "libmagic_typedetect", TypeDetect, LibMagicTypeDetect ) diff --git a/src/modules/urlfilter/chain/ChainURLFilter.cpp b/src/modules/urlfilter/chain/ChainURLFilter.cpp index fc2de93..4dcf493 100644 --- a/src/modules/urlfilter/chain/ChainURLFilter.cpp +++ b/src/modules/urlfilter/chain/ChainURLFilter.cpp @@ -16,4 +16,4 @@ bool ChainURLFilter::filter( const URL url ) return true; } -REGISTER_MODULE_1( "chain", URLFilter, ChainURLFilter, const std::list<URLFilter *> ) +REGISTER_MODULE_1( "chain_urlfiler", URLFilter, ChainURLFilter, const std::list<URLFilter *> ) diff --git a/src/modules/urlfilter/host/HostURLFilter.cpp b/src/modules/urlfilter/host/HostURLFilter.cpp index 6981a36..dd03910 100644 --- a/src/modules/urlfilter/host/HostURLFilter.cpp +++ b/src/modules/urlfilter/host/HostURLFilter.cpp @@ -18,4 +18,4 @@ bool HostURLFilter::filter( const URL url ) return res; } -REGISTER_MODULE_1( "host", URLFilter, HostURLFilter, const std::set<std::string> ) +REGISTER_MODULE_1( "host_urlfilter", URLFilter, HostURLFilter, const std::set<std::string> ) diff --git a/src/modules/urlfilter/protocol/ProtocolURLFilter.cpp b/src/modules/urlfilter/protocol/ProtocolURLFilter.cpp index e50dcc1..96168b7 100644 --- a/src/modules/urlfilter/protocol/ProtocolURLFilter.cpp +++ b/src/modules/urlfilter/protocol/ProtocolURLFilter.cpp @@ -18,4 +18,4 @@ bool ProtocolURLFilter::filter( const URL url ) return res; } -REGISTER_MODULE_1( "protocol", URLFilter, ProtocolURLFilter, const std::set<std::string> ) +REGISTER_MODULE_1( "protocol_urlfilter", URLFilter, ProtocolURLFilter, const std::set<std::string> ) diff --git a/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp index 7e5edde..3db781c 100755 --- a/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp +++ b/src/modules/urlnormalizer/googleurl/GoogleURLNormalizer.cpp @@ -104,4 +104,4 @@ URL GoogleURLNormalizer::normalize( const URL url, const string s ) "", "" ); } -REGISTER_MODULE( "google", URLNormalizer, GoogleURLNormalizer ) +REGISTER_MODULE( "google_urlnormalizer", URLNormalizer, GoogleURLNormalizer ) diff --git a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp index 87b3794..b2dd34f 100755 --- a/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp +++ b/src/modules/urlnormalizer/simpleurl/SimpleURLNormalizer.cpp @@ -150,4 +150,4 @@ void SimpleURLNormalizer::normalizePath( string &path ) } } -REGISTER_MODULE( "simple", URLNormalizer, SimpleURLNormalizer ) +REGISTER_MODULE( "simple_urlnormalizer", URLNormalizer, SimpleURLNormalizer ) diff --git a/src/modules/urlseen/memory/MemoryURLSeen.cpp b/src/modules/urlseen/memory/MemoryURLSeen.cpp index 15149e9..e7bc9e6 100644 --- a/src/modules/urlseen/memory/MemoryURLSeen.cpp +++ b/src/modules/urlseen/memory/MemoryURLSeen.cpp @@ -21,4 +21,4 @@ bool MemoryURLSeen::seen( const URL url ) return hasSeen; } -REGISTER_MODULE( "memory", URLSeen, MemoryURLSeen ) +REGISTER_MODULE( "memory_urlseen", URLSeen, MemoryURLSeen ) diff --git a/tests/typedetect/GNUmakefile b/tests/typedetect/GNUmakefile index 1e3db6e..e7df7bf 100644 --- a/tests/typedetect/GNUmakefile +++ b/tests/typedetect/GNUmakefile @@ -49,7 +49,7 @@ local_distclean: local_test: @-echo "Using fetcher 'file'.." - @-for METHOD in libmagic; do \ + @-for METHOD in libmagic_typedetect; do \ echo "Using MIME type detector '$$METHOD'.." ; \ ./exec_test test1 test1 "detect a simple C++ file" $$METHOD file file://localhost/`pwd`/test1.cpp ; \ ./exec_test test1 test2 "detect a M$ word file" $$METHOD file file://localhost/`pwd`/test2.doc ; \ @@ -59,7 +59,7 @@ local_test: ./exec_test test1 test6 "detect a Javascript file" $$METHOD file file://localhost/`pwd`/test6.js ; \ done @-echo "Using fetcher 'libfetch'" - @-for METHOD in libmagic; do \ + @-for METHOD in libmagic_typedetect; do \ echo "Using MIME type detector '$$METHOD'.." ; \ ./exec_test test1 test100 "detect a HTML file" $$METHOD libfetch http://www.andreasbaumann.cc ; \ done diff --git a/tests/typedetect/test1.cpp b/tests/typedetect/test1.cpp index 96bed30..cef3742 100644 --- a/tests/typedetect/test1.cpp +++ b/tests/typedetect/test1.cpp @@ -36,7 +36,7 @@ int main( int argc, char *argv[] ) TypeDetect *typeDetect = typeDetectors.create( method ); #else TypeDetect *typeDetect; - if( strcmp( method, "libmagic" ) == 0 ) { + if( strcmp( method, "libmagic_typedetect" ) == 0 ) { typeDetect = new LibMagicTypeDetect( ); } else { cerr << "Unknown type detection method '" << method << "'" << endl; @@ -48,9 +48,9 @@ int main( int argc, char *argv[] ) #error TODO #else Fetcher *fetcher; - if( strcmp( fetcherMethod, "file" ) == 0 ) { + if( strcmp( fetcherMethod, "file_fetcher" ) == 0 ) { fetcher = new FileFetcher( ); - } else if( strcmp( fetcherMethod, "libfetch" ) == 0 ) { + } else if( strcmp( fetcherMethod, "libfetch_fetcher" ) == 0 ) { fetcher = new LibFetchFetcher( ); } else { cerr << "Unknown fetcher method '" << fetcherMethod << "'" << endl; diff --git a/tests/url/GNUmakefile b/tests/url/GNUmakefile index 6a9104a..69d882d 100644 --- a/tests/url/GNUmakefile +++ b/tests/url/GNUmakefile @@ -35,7 +35,7 @@ local_clean: local_distclean: local_test: - @-for METHOD in simple google; do \ + @-for METHOD in simple_urlnormalizer google_urlnormalizer; do \ echo "Using URL normalizer '$$METHOD'.." ; \ ./exec_test test1 test1 "parse illegal protocol" $$METHOD parse www.andreasbaumann.cc ; \ ./exec_test test1 test2 "parse normal start URL without slash" $$METHOD parse http://www.andreasbaumann.cc ; \ diff --git a/tests/url/Makefile.W32 b/tests/url/Makefile.W32 index f2f74ee..27c140e 100755 --- a/tests/url/Makefile.W32 +++ b/tests/url/Makefile.W32 @@ -42,7 +42,7 @@ local_distclean: local_test: @-copy "$(ICU_DIR)\bin\icuuc49.dll" . >NUL @-copy "$(ICU_DIR)\bin\icudt49.dll" . >NUL - @-for %%m in ( simple google ) do \ + @-for %%m in ( simple_urlnormalizer google_urlnormalizer ) do \ @echo Using URL normalizer '%m'.. & \ @exec_test test1 test1 "parse illegal protocol" %m parse www.andreasbaumann.cc & \ @exec_test test1 test2 "parse normal start URL without slash" %m parse http://www.andreasbaumann.cc & \ diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp index 95db9fb..3dd6df2 100755 --- a/tests/url/test1.cpp +++ b/tests/url/test1.cpp @@ -41,9 +41,9 @@ int main( int argc, char *argv[] ) URLNormalizer *normalizer = urlNormalizers.create( method ); #else URLNormalizer *normalizer; - if( strcmp( method, "simple" ) == 0 ) { + if( strcmp( method, "simple_urlnormalizer" ) == 0 ) { normalizer = new SimpleURLNormalizer( ); - } else if( strcmp( method, "google" ) == 0 ) { + } else if( strcmp( method, "google_urlnormalizer" ) == 0 ) { normalizer = new GoogleURLNormalizer( ); } else { cerr << "Unknown normalization method '" << method << "'" << endl; |