From 13fc9a7da5111f4ddba942d3c6b6b8654ce395d6 Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Thu, 6 Sep 2012 22:18:23 +0200 Subject: more splitting into libcrawl, crawl binary moved more public header to 'include' changed approach for dynamic linking on Windows --- .gitignore | 2 +- include/crawler/CrawlerExportable.hpp | 26 +++ include/crawler/DNSResolver.hpp | 7 + include/crawler/Deduper.hpp | 15 ++ include/crawler/Fetcher.hpp | 15 ++ include/crawler/Frontier.hpp | 16 ++ include/crawler/MIMEType.hpp | 100 ++++++++++ include/crawler/Processor.hpp | 13 ++ include/crawler/RewindInputStream.hpp | 32 +++ include/crawler/SpoolRewindInputStream.hpp | 51 +++++ include/crawler/URL.hpp | 140 +++++++++++++ include/crawler/URLFilter.hpp | 14 ++ include/crawler/URLNormalizer.hpp | 17 ++ include/crawler/URLSeen.hpp | 12 ++ include/crawler/win32/errormsg.hpp | 10 + include/crawler/win32/stringutils.hpp | 10 + include/logger/ConsoleLogSink.hpp | 2 +- include/logger/FileLogSink.hpp | 2 +- include/logger/LogSink.hpp | 4 +- include/logger/Logger.hpp | 32 ++- include/logger/LoggerExportable.hpp | 26 +++ include/logger/SyslogLogSink.hpp | 6 +- include/logger/WinDbgLogSink.hpp | 2 +- include/util/Exportable.hpp | 26 --- include/util/NonCopyable.hpp | 4 +- include/util/ScopedPtr.hpp | 3 +- include/util/Singleton.hpp | 13 +- include/util/UtilExportable.hpp | 26 +++ src/DNSResolver.hpp | 7 - src/Deduper.hpp | 15 -- src/Fetcher.hpp | 15 -- src/Frontier.hpp | 16 -- src/GNUmakefile | 54 +---- src/MIMEType.cpp | 5 - src/MIMEType.hpp | 100 ---------- src/Makefile.W32 | 53 +---- src/Processor.hpp | 13 -- src/RewindInputStream.hpp | 31 --- src/SpoolRewindInputStream.cpp | 181 ----------------- src/SpoolRewindInputStream.hpp | 51 ----- src/URL.cpp | 5 - src/URL.hpp | 138 ------------- src/URLFilter.hpp | 14 -- src/URLNormalizer.hpp | 17 -- src/URLSeen.hpp | 12 -- src/crawl.cpp | 238 ----------------------- src/crawl/GNUmakefile | 55 ++++++ src/crawl/Makefile.W32 | 39 ++++ src/crawl/crawl.cpp | 238 +++++++++++++++++++++++ src/libcrawler/GNUmakefile | 42 ++++ src/libcrawler/MIMEType.cpp | 5 + src/libcrawler/Makefile.W32 | 45 +++++ src/libcrawler/SpoolRewindInputStream.cpp | 181 +++++++++++++++++ src/libcrawler/URL.cpp | 5 + src/libcrawler/win32/errormsg.cpp | 27 +++ src/libcrawler/win32/stringutils.cpp | 21 ++ src/logger/Makefile.W32 | 2 +- src/modules/deduper/null/Makefile.W32 | 13 +- src/modules/fetcher/file/Makefile.W32 | 5 +- src/modules/fetcher/winhttp/Makefile.W32 | 5 +- src/modules/frontier/memory/Makefile.W32 | 5 +- src/modules/urlfilter/chain/Makefile.W32 | 4 +- src/modules/urlfilter/host/Makefile.W32 | 5 +- src/modules/urlfilter/protocol/Makefile.W32 | 5 +- src/modules/urlnormalizer/googleurl/Makefile.W32 | 3 +- src/modules/urlnormalizer/simpleurl/Makefile.W32 | 5 +- src/win32/errormsg.cpp | 27 --- src/win32/errormsg.hpp | 8 - src/win32/stringutils.cpp | 21 -- src/win32/stringutils.hpp | 8 - tests/logger/Makefile.W32 | 4 +- tests/utils/Makefile.W32 | 5 +- tests/utils/test3.cpp | 2 - 73 files changed, 1261 insertions(+), 1115 deletions(-) create mode 100755 include/crawler/CrawlerExportable.hpp create mode 100644 include/crawler/DNSResolver.hpp create mode 100644 include/crawler/Deduper.hpp create mode 100755 include/crawler/Fetcher.hpp create mode 100644 include/crawler/Frontier.hpp create mode 100644 include/crawler/MIMEType.hpp create mode 100644 include/crawler/Processor.hpp create mode 100755 include/crawler/RewindInputStream.hpp create mode 100755 include/crawler/SpoolRewindInputStream.hpp create mode 100755 include/crawler/URL.hpp create mode 100644 include/crawler/URLFilter.hpp create mode 100644 include/crawler/URLNormalizer.hpp create mode 100644 include/crawler/URLSeen.hpp create mode 100755 include/crawler/win32/errormsg.hpp create mode 100755 include/crawler/win32/stringutils.hpp create mode 100755 include/logger/LoggerExportable.hpp delete mode 100755 include/util/Exportable.hpp create mode 100755 include/util/UtilExportable.hpp delete mode 100644 src/DNSResolver.hpp delete mode 100644 src/Deduper.hpp delete mode 100755 src/Fetcher.hpp delete mode 100644 src/Frontier.hpp delete mode 100644 src/MIMEType.cpp delete mode 100644 src/MIMEType.hpp delete mode 100644 src/Processor.hpp delete mode 100755 src/RewindInputStream.hpp delete mode 100644 src/SpoolRewindInputStream.cpp delete mode 100755 src/SpoolRewindInputStream.hpp delete mode 100644 src/URL.cpp delete mode 100644 src/URL.hpp delete mode 100644 src/URLFilter.hpp delete mode 100644 src/URLNormalizer.hpp delete mode 100644 src/URLSeen.hpp delete mode 100755 src/crawl.cpp create mode 100755 src/crawl/GNUmakefile create mode 100755 src/crawl/Makefile.W32 create mode 100755 src/crawl/crawl.cpp create mode 100755 src/libcrawler/GNUmakefile create mode 100644 src/libcrawler/MIMEType.cpp create mode 100755 src/libcrawler/Makefile.W32 create mode 100644 src/libcrawler/SpoolRewindInputStream.cpp create mode 100644 src/libcrawler/URL.cpp create mode 100755 src/libcrawler/win32/errormsg.cpp create mode 100755 src/libcrawler/win32/stringutils.cpp delete mode 100755 src/win32/errormsg.cpp delete mode 100755 src/win32/errormsg.hpp delete mode 100755 src/win32/stringutils.cpp delete mode 100755 src/win32/stringutils.hpp diff --git a/.gitignore b/.gitignore index 8d86f6e..72d32b9 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,6 @@ tests/*/test1 tests/*/test2 tests/*/test3 -src/crawl +src/crawl/crawl makefiles/gmake/platform.mk.vars makefiles/gmake/platform.vars diff --git a/include/crawler/CrawlerExportable.hpp b/include/crawler/CrawlerExportable.hpp new file mode 100755 index 0000000..5b89108 --- /dev/null +++ b/include/crawler/CrawlerExportable.hpp @@ -0,0 +1,26 @@ +#ifndef __CRAWLER_EXPORTABLE_H +#define __CRAWLER_EXPORTABLE_H + +#ifndef _WIN32 + +#define CRAWLER_DLL_VISIBLE + +#else + +#ifdef SHARED + +#ifdef BUILDING_CRAWLER +#define CRAWLER_DLL_VISIBLE __declspec(dllexport) +#else +#define CRAWLER_DLL_VISIBLE __declspec(dllimport) +#endif + +#else + +#define CRAWLER_DLL_VISIBLE + +#endif // BUILDING_CRAWLER + +#endif // _WIN32 + +#endif diff --git a/include/crawler/DNSResolver.hpp b/include/crawler/DNSResolver.hpp new file mode 100644 index 0000000..8f79734 --- /dev/null +++ b/include/crawler/DNSResolver.hpp @@ -0,0 +1,7 @@ +#ifndef __DNSRESOLVER_H +#define __DNSRESOLVER_H + +class DNSResolver { +}; + +#endif diff --git a/include/crawler/Deduper.hpp b/include/crawler/Deduper.hpp new file mode 100644 index 0000000..3cb33c1 --- /dev/null +++ b/include/crawler/Deduper.hpp @@ -0,0 +1,15 @@ +#ifndef __DEDUPER_H +#define __DEDUPER_H + +#include "URL.hpp" +#include "RewindInputStream.hpp" + +class Deduper +{ + public: + virtual ~Deduper( ) { }; + + virtual bool contentSeen( const URL url, RewindInputStream *s ) = 0; +}; + +#endif diff --git a/include/crawler/Fetcher.hpp b/include/crawler/Fetcher.hpp new file mode 100755 index 0000000..40f1c7a --- /dev/null +++ b/include/crawler/Fetcher.hpp @@ -0,0 +1,15 @@ +#ifndef __FETCHER_H +#define __FETCHER_H + +#include "URL.hpp" +#include "RewindInputStream.hpp" + +class Fetcher +{ + public: + virtual ~Fetcher( ) { }; + + virtual RewindInputStream *fetch( const URL url ) = 0; +}; + +#endif diff --git a/include/crawler/Frontier.hpp b/include/crawler/Frontier.hpp new file mode 100644 index 0000000..54c0dd6 --- /dev/null +++ b/include/crawler/Frontier.hpp @@ -0,0 +1,16 @@ +#ifndef __FRONTIER_H +#define __FRONTIER_H + +#include "URL.hpp" + +class Frontier +{ + public: + virtual ~Frontier( ) { }; + + virtual URL getNextUrl( ) = 0; + + virtual void addUrl( const URL url ) = 0; +}; + +#endif diff --git a/include/crawler/MIMEType.hpp b/include/crawler/MIMEType.hpp new file mode 100644 index 0000000..3a628ca --- /dev/null +++ b/include/crawler/MIMEType.hpp @@ -0,0 +1,100 @@ +#ifndef __MIMETYPE_H +#define __MIMETYPE_H + +#include +#include +#include +#include + +class MIMEType { + protected: + std::string m_type; + std::string m_subtype; + + public: + MIMEType( ) + : m_type( "" ), m_subtype( "" ) + { + } + + MIMEType( const std::string _type, const std::string _subtype ) + : m_type( _type ), m_subtype( _subtype ) + { + } + + MIMEType( const MIMEType &m ) + : m_type( m.m_type ), m_subtype( m.m_subtype ) + { + } + + MIMEType( const char *s ) + { + const char *pos; + if( ( pos = strchr( s, '/' ) ) == NULL ) { + *this = Null; + } else { + m_type = std::string( s, 0, pos - s ); + m_subtype = std::string( s, pos - s + 1, strlen( s ) - ( pos - s + 1 ) ); + } + } + + MIMEType& operator=( const MIMEType &m ) + { + if( this != &m ) { + this->m_type = m.m_type; + this->m_subtype = m.m_subtype; + } + return *this; + } + + const std::string type( ) const + { + return m_type; + } + + const std::string subtype( ) const + { + return m_subtype; + } + + std::string str( ) const + { + std::ostringstream os; + os << *this; + return os.str( ); + } + + static MIMEType Null; + + bool operator!=( const MIMEType &other ) const + { + return( str( ) != other.str( ) ); + } + + bool operator==( const MIMEType &other ) const + { + return( str( ) == other.str( ) ); + } + + bool operator<( const MIMEType &other ) const + { + return( str( ) < other.str( ) ); + } + + template< typename CharT, typename TraitsT > friend + std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream&s, const MIMEType& m ); +}; + +template< typename CharT, typename TraitsT > +inline std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream< CharT, TraitsT > &s, const MIMEType &m ) +{ + if( m.type( ).empty( ) ) { + return s; + } + + s << m.type( ) << "/" << m.subtype( ); + + return s; +} + +#endif diff --git a/include/crawler/Processor.hpp b/include/crawler/Processor.hpp new file mode 100644 index 0000000..bc17ec0 --- /dev/null +++ b/include/crawler/Processor.hpp @@ -0,0 +1,13 @@ +#ifndef __PROCESSOR_H +#define __PROCESSOR_H + +#include "RewindInputStream.hpp" + +class Processor { + public: + virtual ~Processor( ) { } + + virtual void process( RewindInputStream *s ) = 0; +}; + +#endif diff --git a/include/crawler/RewindInputStream.hpp b/include/crawler/RewindInputStream.hpp new file mode 100755 index 0000000..6bbe80c --- /dev/null +++ b/include/crawler/RewindInputStream.hpp @@ -0,0 +1,32 @@ +#ifndef __REWIND_INPUT_STREAM_H +#define __REWIND_INPUT_STREAM_H + +#include "CrawlerExportable.hpp" +#include "URL.hpp" + +#include +#include + +class RewindInputStream : public std::istream { + public: + const URL getBaseUrl( ) const + { + return m_baseUrl; + } + + CRAWLER_DLL_VISIBLE virtual void rewind( ) = 0; + + virtual std::string lastErrMsg( ) const = 0; + + protected: + + RewindInputStream( const URL &url ) + : std::istream( 0 ), m_baseUrl( url ) + { + } + + private: + URL m_baseUrl; +}; + +#endif diff --git a/include/crawler/SpoolRewindInputStream.hpp b/include/crawler/SpoolRewindInputStream.hpp new file mode 100755 index 0000000..f065271 --- /dev/null +++ b/include/crawler/SpoolRewindInputStream.hpp @@ -0,0 +1,51 @@ +#ifndef __SPOOLREWINDINPUTSTREAM_H +#define __SPOOLREWINDINPUTSTREAM_H + +#include "RewindInputStream.hpp" + +#include +#include +#include + +class spool_streambuf : public std::streambuf +{ + public: + explicit CRAWLER_DLL_VISIBLE spool_streambuf( size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 ); + + CRAWLER_DLL_VISIBLE ~spool_streambuf( ); + + CRAWLER_DLL_VISIBLE void rewind( ); + + protected: + CRAWLER_DLL_VISIBLE virtual std::streambuf::int_type readFromSource( ) = 0; + + private: + CRAWLER_DLL_VISIBLE int_type underflow( ); + + private: + const size_t m_putBack; + std::vector m_spoolBuf; + size_t m_spoolBufPos; + size_t m_spoolBufSize; + std::fstream m_spoolFile; + enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state; + + protected: + std::vector m_buf; + char *m_base; + char *m_start; +}; + +class SpoolRewindInputStream : public RewindInputStream +{ + public: + CRAWLER_DLL_VISIBLE SpoolRewindInputStream( const URL &url ); + CRAWLER_DLL_VISIBLE virtual ~SpoolRewindInputStream( ); + + CRAWLER_DLL_VISIBLE virtual void rewind( ); + + protected: + spool_streambuf *m_buf; +}; + +#endif diff --git a/include/crawler/URL.hpp b/include/crawler/URL.hpp new file mode 100755 index 0000000..255a2db --- /dev/null +++ b/include/crawler/URL.hpp @@ -0,0 +1,140 @@ +#ifndef __URL_H +#define __URL_H + +#include "CrawlerExportable.hpp" + +#include +#include +#include + +using namespace std; + +class URL { + protected: + string m_protocol; + string m_host; + unsigned short m_port; + string m_path; + string m_query; + string m_fragment; + + public: + URL( ) + : m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_query( "" ), m_fragment( "" ) + { + } + + URL( const URL& url ) + : m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_query( url.m_query ), m_fragment( url.m_fragment ) + { + } + + URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _query, const std::string _fragment ) + : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_query( _query ), m_fragment( _fragment ) + { + } + + URL& operator=( const URL& u ) { + if( this != &u ) { + this->m_protocol = u.m_protocol; + this->m_port = u.m_port; + this->m_host = u.m_host; + this->m_path = u.m_path; + this->m_query = u.m_query; + this->m_fragment = u.m_fragment; + } + return *this; + } + + const string protocol( ) const + { + return m_protocol; + } + + const string host( ) const + { + return m_host; + } + + unsigned short port( ) const + { + return m_port; + } + + const string path( ) const + { + return m_path; + } + + const string query( ) const + { + return m_query; + } + + std::string fragment( ) const + { + return m_fragment; + } + + std::string str( ) const + { + std::ostringstream os; + os << *this; + return os.str( ); + } + + static URL CRAWLER_DLL_VISIBLE Null; + + bool operator!=( const URL &other ) const + { + return( str( ) != other.str( ) ); + } + + bool operator==( const URL &other ) const + { + return( str( ) == other.str( ) ); + } + + bool operator<( const URL &other ) const + { + return( str( ) < other.str( ) ); + } + + template< typename CharT, typename TraitsT > friend + basic_ostream& operator<<( basic_ostream&s, const URL& u ); + + static unsigned short defaultPort( const std::string p ) + { + if( p == "http" ) return 80; + else if( p == "https" ) return 443; + else if( p == "ftp" ) return 21; + else return 0; + } +}; + +template< typename CharT, typename TraitsT > +inline basic_ostream& operator<<( basic_ostream&s, const URL& u ) { + if( u.protocol( ).empty( ) ) { + return s; + } + + s << u.protocol( ) << "://" << u.host( ); + + if( u.port( ) != URL::defaultPort( u.protocol( ) ) ) { + s << ":" << u.port( ); + } + + s << u.path( ); + + if( !u.query( ).empty( ) ) { + s << "?" << u.query( ); + } + + if( !u.fragment( ).empty( ) ) { + s << "#" << u.fragment( ); + } + + return s; +} + +#endif diff --git a/include/crawler/URLFilter.hpp b/include/crawler/URLFilter.hpp new file mode 100644 index 0000000..2136009 --- /dev/null +++ b/include/crawler/URLFilter.hpp @@ -0,0 +1,14 @@ +#ifndef __URLFILTER_H +#define __URLFILTER_H + +#include "URL.hpp" + +class URLFilter +{ + public: + virtual ~URLFilter( ) { }; + + virtual bool filter( const URL url ) = 0; +}; + +#endif diff --git a/include/crawler/URLNormalizer.hpp b/include/crawler/URLNormalizer.hpp new file mode 100644 index 0000000..af1781a --- /dev/null +++ b/include/crawler/URLNormalizer.hpp @@ -0,0 +1,17 @@ +#ifndef __URLNORMALIZER_H +#define __URLNORMALIZER_H + +#include + +#include "URL.hpp" + +class URLNormalizer { + public: + virtual ~URLNormalizer( ) { }; + + virtual URL parseUrl( const std::string s ) = 0; + + virtual URL normalize( const URL url, const std::string s ) = 0; +}; + +#endif diff --git a/include/crawler/URLSeen.hpp b/include/crawler/URLSeen.hpp new file mode 100644 index 0000000..742c863 --- /dev/null +++ b/include/crawler/URLSeen.hpp @@ -0,0 +1,12 @@ +#ifndef __URLSEEN_H +#define __URLSEEN_H + +#include "URL.hpp" + +class URLSeen { + public: + virtual ~URLSeen( ) { }; + virtual bool seen( const URL url ) = 0; +}; + +#endif diff --git a/include/crawler/win32/errormsg.hpp b/include/crawler/win32/errormsg.hpp new file mode 100755 index 0000000..f1ceb93 --- /dev/null +++ b/include/crawler/win32/errormsg.hpp @@ -0,0 +1,10 @@ +#ifndef __ERRORMSG_H +#define __ERRORMSG_H + +#include + +#include "CrawlerExportable.hpp" + +CRAWLER_DLL_VISIBLE std::string getLastError( ); + +#endif diff --git a/include/crawler/win32/stringutils.hpp b/include/crawler/win32/stringutils.hpp new file mode 100755 index 0000000..b0404cd --- /dev/null +++ b/include/crawler/win32/stringutils.hpp @@ -0,0 +1,10 @@ +#ifndef __STRINGUTILS_H +#define __STRINGUTILS_H + +#include + +#include "CrawlerExportable.hpp" + +CRAWLER_DLL_VISIBLE std::wstring s2ws( const std::string &s ); + +#endif diff --git a/include/logger/ConsoleLogSink.hpp b/include/logger/ConsoleLogSink.hpp index 4380df8..75dea72 100755 --- a/include/logger/ConsoleLogSink.hpp +++ b/include/logger/ConsoleLogSink.hpp @@ -8,7 +8,7 @@ class ConsoleLogSink : public LogSink public: ConsoleLogSink( const LogLevel level ) : LogSink( level ) { } - DLL_VISIBLE virtual void log( const LogLevel level, const std::string &msg ); + LOGGER_DLL_VISIBLE virtual void log( const LogLevel level, const std::string &msg ); }; #endif diff --git a/include/logger/FileLogSink.hpp b/include/logger/FileLogSink.hpp index 0acb6b9..5ab9b86 100755 --- a/include/logger/FileLogSink.hpp +++ b/include/logger/FileLogSink.hpp @@ -13,7 +13,7 @@ class FileLogSink : public LogSink ~FileLogSink( ); - DLL_VISIBLE virtual void log( const LogLevel level, const std::string &msg ); + LOGGER_DLL_VISIBLE virtual void log( const LogLevel level, const std::string &msg ); private: std::string m_filename; diff --git a/include/logger/LogSink.hpp b/include/logger/LogSink.hpp index 4d07df0..06113f9 100755 --- a/include/logger/LogSink.hpp +++ b/include/logger/LogSink.hpp @@ -1,7 +1,7 @@ #ifndef __LOGSINK_H #define __LOGSINK_H -#include "Exportable.hpp" +#include "LoggerExportable.hpp" #include "Logger.hpp" class LogSink @@ -11,7 +11,7 @@ class LogSink virtual ~LogSink( ) { } - DLL_VISIBLE virtual void log( const LogLevel level, const std::string &msg ) = 0; + LOGGER_DLL_VISIBLE virtual void log( const LogLevel level, const std::string &msg ) = 0; void setReportingLevel( const LogLevel level ) { m_level = level; } diff --git a/include/logger/Logger.hpp b/include/logger/Logger.hpp index f0c0012..451ca45 100755 --- a/include/logger/Logger.hpp +++ b/include/logger/Logger.hpp @@ -1,7 +1,7 @@ #ifndef __LOGGER_H #define __LOGGER_H -#include "Exportable.hpp" +#include "LoggerExportable.hpp" #include "Singleton.hpp" #include "ScopedPtr.hpp" @@ -30,34 +30,32 @@ class Logger : public Singleton< Logger > public: DECLARE_SINGLETON( Logger ) - DLL_VISIBLE void addSink( LogSink *sink ); - DLL_VISIBLE void removeSink( LogSink *sink ); - DLL_VISIBLE void log( const LogLevel level, const std::string &msg ); + LOGGER_DLL_VISIBLE void addSink( LogSink *sink ); + LOGGER_DLL_VISIBLE void removeSink( LogSink *sink ); + LOGGER_DLL_VISIBLE void log( const LogLevel level, const std::string &msg ); - DLL_VISIBLE static std::string toString( const LogLevel level ); - DLL_VISIBLE static LogLevel fromString( const std::string &s ); + LOGGER_DLL_VISIBLE static std::string toString( const LogLevel level ); + LOGGER_DLL_VISIBLE static LogLevel fromString( const std::string &s ); - DLL_VISIBLE void openConsoleLog( const LogLevel level ); - DLL_VISIBLE void openFileLog( const LogLevel level, const std::string &filename ); - DLL_VISIBLE void openSyslog( const LogLevel level, const std::string &ident, const std::string &facility ); - DLL_VISIBLE void openWinDbgLog( const LogLevel level ); + LOGGER_DLL_VISIBLE void openConsoleLog( const LogLevel level ); + LOGGER_DLL_VISIBLE void openFileLog( const LogLevel level, const std::string &filename ); + LOGGER_DLL_VISIBLE void openSyslog( const LogLevel level, const std::string &ident, const std::string &facility ); + LOGGER_DLL_VISIBLE void openWinDbgLog( const LogLevel level ); protected: - DLL_VISIBLE Logger( ); - DLL_VISIBLE virtual ~Logger( ); + LOGGER_DLL_VISIBLE Logger( ); + LOGGER_DLL_VISIBLE virtual ~Logger( ); private: scopedPtr< LoggerImpl > m_impl; }; -DEFINE_SINGLETON( Logger ) - class LogStream : private noncopyable, public std::ostringstream { public: - DLL_VISIBLE LogStream( Logger &logger, const LogLevel level ); - DLL_VISIBLE ~LogStream( ); - DLL_VISIBLE std::ostream &get( ); + LOGGER_DLL_VISIBLE LogStream( Logger &logger, const LogLevel level ); + LOGGER_DLL_VISIBLE ~LogStream( ); + LOGGER_DLL_VISIBLE std::ostream &get( ); private: LogStream( ); diff --git a/include/logger/LoggerExportable.hpp b/include/logger/LoggerExportable.hpp new file mode 100755 index 0000000..0445213 --- /dev/null +++ b/include/logger/LoggerExportable.hpp @@ -0,0 +1,26 @@ +#ifndef __LOGGER_EXPORTABLE_H +#define __LOGGER_EXPORTABLE_H + +#ifndef _WIN32 + +#define LOGGER_DLL_VISIBLE + +#else + +#ifdef SHARED + +#ifdef BUILDING_LOGGER +#define LOGGER_DLL_VISIBLE __declspec(dllexport) +#else +#define LOGGER_DLL_VISIBLE __declspec(dllimport) +#endif + +#else + +#define LOGGER_DLL_VISIBLE + +#endif // BUILDING_LOGGER + +#endif // _WIN32 + +#endif diff --git a/include/logger/SyslogLogSink.hpp b/include/logger/SyslogLogSink.hpp index f5824a6..058e126 100755 --- a/include/logger/SyslogLogSink.hpp +++ b/include/logger/SyslogLogSink.hpp @@ -12,10 +12,10 @@ class SyslogLogSink : public LogSink ~SyslogLogSink( ); - DLL_VISIBLE virtual void log( const LogLevel level, const std::string &msg ); + LOGGER_DLL_VISIBLE virtual void log( const LogLevel level, const std::string &msg ); - static int levelToSyslogLevel( const LogLevel level ); - static int facilityFromString( const std::string &facility ); + LOGGER_DLL_VISIBLE static int levelToSyslogLevel( const LogLevel level ); + LOGGER_DLL_VISIBLE static int facilityFromString( const std::string &facility ); private: std::string m_ident; diff --git a/include/logger/WinDbgLogSink.hpp b/include/logger/WinDbgLogSink.hpp index d073cdc..2fe8a43 100755 --- a/include/logger/WinDbgLogSink.hpp +++ b/include/logger/WinDbgLogSink.hpp @@ -8,7 +8,7 @@ class WinDbgLogSink : public LogSink public: WinDbgLogSink( const LogLevel level ) : LogSink( level ) { } - DLL_VISIBLE virtual void log( const LogLevel level, const std::string &msg ); + LOGGER_DLL_VISIBLE virtual void log( const LogLevel level, const std::string &msg ); }; #endif diff --git a/include/util/Exportable.hpp b/include/util/Exportable.hpp deleted file mode 100755 index fc8269f..0000000 --- a/include/util/Exportable.hpp +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef __EXPORTABLE_H -#define __EXPORTABLE_H - -#ifndef _WIN32 - -#define DLL_VISIBLE -#define DLL_EXTERN - -#else - -#ifdef NODLL -#define DLL_VISIBLE -#define DLL_EXTERN -#else -#ifdef SHARED -#define DLL_VISIBLE __declspec(dllexport) -#define DLL_EXTERN -#else -#define DLL_VISIBLE __declspec(dllimport) -#define DLL_EXTERN -#endif -#endif - -#endif // _WIN32 - -#endif diff --git a/include/util/NonCopyable.hpp b/include/util/NonCopyable.hpp index 99804d2..22fdaf9 100755 --- a/include/util/NonCopyable.hpp +++ b/include/util/NonCopyable.hpp @@ -1,12 +1,12 @@ #ifndef __NONCOPYABLE_H #define __NONCOPYABLE_H -#include "Exportable.hpp" +#include "UtilExportable.hpp" namespace __dont_touch { -class DLL_VISIBLE noncopyable +class UTIL_DLL_VISIBLE noncopyable { protected: noncopyable( ) { } diff --git a/include/util/ScopedPtr.hpp b/include/util/ScopedPtr.hpp index 991409d..5fdd1f2 100755 --- a/include/util/ScopedPtr.hpp +++ b/include/util/ScopedPtr.hpp @@ -2,9 +2,10 @@ #define __SCOPEDPTR_H #include "NonCopyable.hpp" +#include "UtilExportable.hpp" template< typename T > -class DLL_VISIBLE scopedPtr : private noncopyable +class UTIL_DLL_VISIBLE scopedPtr : private noncopyable { public: explicit scopedPtr( T *p = 0 ) : m_p( p ) { } diff --git a/include/util/Singleton.hpp b/include/util/Singleton.hpp index f291435..80ab5e8 100755 --- a/include/util/Singleton.hpp +++ b/include/util/Singleton.hpp @@ -3,7 +3,7 @@ #include "ScopedPtr.hpp" #include "NonCopyable.hpp" -#include "Exportable.hpp" +#include "UtilExportable.hpp" #include "Noreturn.hpp" #include @@ -13,20 +13,11 @@ friend class Singleton< T >; \ friend class scopedPtr< T >; -#define DEFINE_SINGLETON( T ) - -#if 0 -#define DECLARE_SINGLETON( T ) \ - -#define DEFINE_SINGLETON( T ) \ - DLL_EXTERN template class DLL_VISIBLE Singleton< T >; -#endif - template< class T > class Singleton : private noncopyable { public: - DLL_VISIBLE static T& instance( ) + UTIL_DLL_VISIBLE static T& instance( ) { if( destroyed ) { onDeadReference( ); diff --git a/include/util/UtilExportable.hpp b/include/util/UtilExportable.hpp new file mode 100755 index 0000000..f9598f1 --- /dev/null +++ b/include/util/UtilExportable.hpp @@ -0,0 +1,26 @@ +#ifndef __UTIL_EXPORTABLE_H +#define __UTIL_EXPORTABLE_H + +#ifndef _WIN32 + +#define UTIL_DLL_VISIBLE + +#else + +#ifdef SHARED + +#ifdef BUILDING_UTIL +#define UTIL_DLL_VISIBLE __declspec(dllexport) +#else +#define UTIL_DLL_VISIBLE __declspec(dllimport) +#endif + +#else + +#define UTIL_DLL_VISIBLE + +#endif + +#endif + +#endif diff --git a/src/DNSResolver.hpp b/src/DNSResolver.hpp deleted file mode 100644 index 8f79734..0000000 --- a/src/DNSResolver.hpp +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __DNSRESOLVER_H -#define __DNSRESOLVER_H - -class DNSResolver { -}; - -#endif diff --git a/src/Deduper.hpp b/src/Deduper.hpp deleted file mode 100644 index 3cb33c1..0000000 --- a/src/Deduper.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __DEDUPER_H -#define __DEDUPER_H - -#include "URL.hpp" -#include "RewindInputStream.hpp" - -class Deduper -{ - public: - virtual ~Deduper( ) { }; - - virtual bool contentSeen( const URL url, RewindInputStream *s ) = 0; -}; - -#endif diff --git a/src/Fetcher.hpp b/src/Fetcher.hpp deleted file mode 100755 index 40f1c7a..0000000 --- a/src/Fetcher.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __FETCHER_H -#define __FETCHER_H - -#include "URL.hpp" -#include "RewindInputStream.hpp" - -class Fetcher -{ - public: - virtual ~Fetcher( ) { }; - - virtual RewindInputStream *fetch( const URL url ) = 0; -}; - -#endif diff --git a/src/Frontier.hpp b/src/Frontier.hpp deleted file mode 100644 index 54c0dd6..0000000 --- a/src/Frontier.hpp +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __FRONTIER_H -#define __FRONTIER_H - -#include "URL.hpp" - -class Frontier -{ - public: - virtual ~Frontier( ) { }; - - virtual URL getNextUrl( ) = 0; - - virtual void addUrl( const URL url ) = 0; -}; - -#endif diff --git a/src/GNUmakefile b/src/GNUmakefile index 97b934a..7367ce2 100755 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -1,69 +1,17 @@ TOPDIR = .. -SUBDIRS = logger modules - --include $(TOPDIR)/makefiles/gmake/platform.mk - -INCLUDE_CPPFLAGS = \ - - -INCLUDE_DIRS = \ - -I. \ - -I$(TOPDIR)/include/logger \ - -I$(TOPDIR)/include/util \ - -I$(TOPDIR)/include/module - -INCLUDE_LDFLAGS = \ - -L$(TOPDIR)/src/logger - -INCLUDE_LIBS = \ - -llogger - -# openssl -ifeq ($(WITH_SSL),1) - -INCLUDE_CFLAGS += \ - -DWITH_SSL - -INCLUDE_LIBS += \ - $(OPENSSL_LIBS) -endif - -STATIC_LIB = libcrawler.a - -DYNAMIC_LIB = libcrawler.so -DYNAMIC_LIB_MAJOR = 0 -DYNAMIC_LIB_MINOR = 0 -DYNAMIC_LIB_PATCH = 0 - -CPP_OBJS = \ - URL.o \ - MIMEType.o \ - SpoolRewindInputStream.o - -CPP_BINS = \ - crawl$(EXE) +SUBDIRS = logger libcrawler modules crawl -include $(TOPDIR)/makefiles/gmake/sub.mk local_all: -modules: $(DYNAMIC_LIB) - local_clean: - @-rm -f $(LOCAL_STATIC_LIB) local_distclean: local_install: - $(INSTALL) -d -m 0755 $(DESTDIR)$(bindir) - $(INSTALL) -m 0775 crawl$(EXE) $(DESTDIR)$(bindir) local_uninstall: - @-rm -f $(DESTDIR)$(bindir)/crawl - @-rmdir $(DESTDIR)$(bindir) local_test: - -run: - @LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/logger:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser ./crawl diff --git a/src/MIMEType.cpp b/src/MIMEType.cpp deleted file mode 100644 index 25dc20c..0000000 --- a/src/MIMEType.cpp +++ /dev/null @@ -1,5 +0,0 @@ -#include "MIMEType.hpp" - -MIMEType MIMEType::Null; - - diff --git a/src/MIMEType.hpp b/src/MIMEType.hpp deleted file mode 100644 index 3a628ca..0000000 --- a/src/MIMEType.hpp +++ /dev/null @@ -1,100 +0,0 @@ -#ifndef __MIMETYPE_H -#define __MIMETYPE_H - -#include -#include -#include -#include - -class MIMEType { - protected: - std::string m_type; - std::string m_subtype; - - public: - MIMEType( ) - : m_type( "" ), m_subtype( "" ) - { - } - - MIMEType( const std::string _type, const std::string _subtype ) - : m_type( _type ), m_subtype( _subtype ) - { - } - - MIMEType( const MIMEType &m ) - : m_type( m.m_type ), m_subtype( m.m_subtype ) - { - } - - MIMEType( const char *s ) - { - const char *pos; - if( ( pos = strchr( s, '/' ) ) == NULL ) { - *this = Null; - } else { - m_type = std::string( s, 0, pos - s ); - m_subtype = std::string( s, pos - s + 1, strlen( s ) - ( pos - s + 1 ) ); - } - } - - MIMEType& operator=( const MIMEType &m ) - { - if( this != &m ) { - this->m_type = m.m_type; - this->m_subtype = m.m_subtype; - } - return *this; - } - - const std::string type( ) const - { - return m_type; - } - - const std::string subtype( ) const - { - return m_subtype; - } - - std::string str( ) const - { - std::ostringstream os; - os << *this; - return os.str( ); - } - - static MIMEType Null; - - bool operator!=( const MIMEType &other ) const - { - return( str( ) != other.str( ) ); - } - - bool operator==( const MIMEType &other ) const - { - return( str( ) == other.str( ) ); - } - - bool operator<( const MIMEType &other ) const - { - return( str( ) < other.str( ) ); - } - - template< typename CharT, typename TraitsT > friend - std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream&s, const MIMEType& m ); -}; - -template< typename CharT, typename TraitsT > -inline std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream< CharT, TraitsT > &s, const MIMEType &m ) -{ - if( m.type( ).empty( ) ) { - return s; - } - - s << m.type( ) << "/" << m.subtype( ); - - return s; -} - -#endif diff --git a/src/Makefile.W32 b/src/Makefile.W32 index ffcbe27..f3702cf 100755 --- a/src/Makefile.W32 +++ b/src/Makefile.W32 @@ -1,58 +1,12 @@ TOPDIR = .. -SUBDIRS = logger modules - -!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk - -INCLUDE_CXXFLAGS = \ - /D_WIN32_WINNT=0x504 - -INCLUDE_DIRS = \ - /I. \ - /I$(TOPDIR)\include\logger \ - /I$(TOPDIR)\include\module \ - /I$(TOPDIR)\include\util - -INCLUDE_LDFLAGS = \ - -INCLUDE_LIBS = \ - $(TOPDIR)\src\logger\logger.lib \ - Ws2_32.lib - -LOCAL_STATIC_LIB_OBJS = \ - win32\errormsg.obj \ - win32\stringutils.obj \ - URL.obj \ - MIMEType.obj \ - SpoolRewindInputStream.obj - -LOCAL_STATIC_LIB = \ - crawler.lib - -CPP_OBJS = \ - $(LOCAL_STATIC_LIB_OBJS) - -CPP_BINS = \ - crawl.exe +SUBDIRS = logger libcrawler modules crawl !INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk -all: dependencies $(CPP_OBJS) $(CPP_BINS) - -dependencies: - cd logger & $(MAKE) /nologo /f Makefile.w32 all - -crawl.exe: crawl.obj $(CPP_OBJS) - -$(LOCAL_STATIC_LIB): $(LOCAL_STATIC_LIB_OBJS) - $(LINK) /lib /nologo /out:$@ $(STATIC_LDFLAGS) $? - -local_all: $(LOCAL_STATIC_LIB) $(CPP_BINS) +local_all: local_clean: - @-erase $(LOCAL_STATIC_LIB) 2>NUL - @-erase $(CPP_OBJS) win32\*.obj 2>NUL - @-erase test.bat 2>NUL local_distclean: @@ -62,6 +16,7 @@ copy_prereq: @-copy "$(ICU_DIR)\bin\icuuc49.dll" . >NUL @-copy "$(ICU_DIR)\bin\icudt49.dll" . >NUL @-copy "$(TOPDIR)\src\logger\logger.dll" . >NUL + @-copy "$(TOPDIR)\src\libcrawler\crawler.dll" . >NUL run: copy_prereq - @-crawl.exe + @-crawl\crawl.exe diff --git a/src/Processor.hpp b/src/Processor.hpp deleted file mode 100644 index bc17ec0..0000000 --- a/src/Processor.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __PROCESSOR_H -#define __PROCESSOR_H - -#include "RewindInputStream.hpp" - -class Processor { - public: - virtual ~Processor( ) { } - - virtual void process( RewindInputStream *s ) = 0; -}; - -#endif diff --git a/src/RewindInputStream.hpp b/src/RewindInputStream.hpp deleted file mode 100755 index 39d7b6e..0000000 --- a/src/RewindInputStream.hpp +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __REWIND_INPUT_STREAM_H -#define __REWIND_INPUT_STREAM_H - -#include "URL.hpp" - -#include -#include - -class RewindInputStream : public std::istream { - public: - const URL getBaseUrl( ) const - { - return m_baseUrl; - } - - virtual void rewind( ) = 0; - - virtual std::string lastErrMsg( ) const = 0; - - protected: - - RewindInputStream( const URL &url ) - : std::istream( 0 ), m_baseUrl( url ) - { - } - - private: - URL m_baseUrl; -}; - -#endif diff --git a/src/SpoolRewindInputStream.cpp b/src/SpoolRewindInputStream.cpp deleted file mode 100644 index 9135741..0000000 --- a/src/SpoolRewindInputStream.cpp +++ /dev/null @@ -1,181 +0,0 @@ -#include "SpoolRewindInputStream.hpp" -#include "Logger.hpp" - -#include -#include -#include - -using namespace std; - -spool_streambuf::spool_streambuf( size_t bufSize, size_t putBack, size_t spoolBufSize ) - : m_putBack( max( putBack, size_t( 1 ) ) ), - m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ), - m_spoolBufSize( spoolBufSize ), m_state( TO_SPOOL_MEMORY ), - m_buf( max( bufSize, putBack ) + putBack ), - m_base( 0 ), m_start( 0 ) -{ - char *end = &m_buf.front( ) + m_buf.size( ); - setg( end, end, end ); -} - -spool_streambuf::~spool_streambuf( ) -{ - switch( m_state ) { - case TO_SPOOL_MEMORY: - case FROM_SPOOL_MEMORY: - // memory only, nothing to clean up - break; - - case TO_SPOOL_FILE: - case FROM_SPOOL_FILE: - m_spoolFile.close( ); - (void)remove( "/tmp/spool.tmp" ); - break; - } -} - -streambuf::int_type spool_streambuf::underflow( ) -{ - // check if buffer is exhausted, if not, return current character - if( gptr( ) < egptr( ) ) - return traits_type::to_int_type( *gptr( ) ); - - m_base = &m_buf.front( ); - m_start = m_base; - - // move put back away - if( eback( ) == m_base ) { - memmove( m_base, egptr( ) - m_putBack, m_putBack ); - m_start += m_putBack; - } - - // read from source or spool (depends on calling rewind) - streambuf::int_type n; - switch( m_state ) { - case TO_SPOOL_MEMORY: - case TO_SPOOL_FILE: - n = readFromSource( ); - if( n == 0 ) { - return traits_type::eof( ); - } else if( n < 0 ) { - // TODO handle error - return traits_type::eof( ); - } - - if( m_state == TO_SPOOL_MEMORY ) { - // as long we can "spool" to memory, do so.. - if( m_spoolBufPos + n <= m_spoolBufSize ) { - m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, m_start, m_start + n ); - m_spoolBufPos += n; - } else { - // ..otherwise start spooling to disk, write - // current memory spool buffer first.. - LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBufSize << ")"; - m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc ); - assert( m_spoolFile.good( ) ); - m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize ); - assert( m_spoolFile.good( ) ); - m_state = TO_SPOOL_FILE; - m_spoolFile.write( m_start, n ); - assert( m_spoolFile.good( ) ); - } - } else { - // we are appending to the spool file - assert( m_spoolFile.good( ) ); - m_spoolFile.write( m_start, n ); - assert( m_spoolFile.good( ) ); - } - - break; - - case FROM_SPOOL_MEMORY: - n = min( m_buf.size( ) - ( m_start - m_base ), m_spoolBufSize - m_spoolBufPos ); - if( n == 0 ) { - return traits_type::eof( ); - } - - copy( m_spoolBuf.begin( ) + m_spoolBufPos, - m_spoolBuf.begin( ) + m_spoolBufPos + n, - m_buf.begin( ) + ( m_start - m_base ) ); - - m_spoolBufPos += n; - - break; - - case FROM_SPOOL_FILE: - - n = min( m_buf.size( ) - ( m_start - m_base ), m_spoolBufSize - m_spoolBufPos ); - m_spoolFile.read( m_start, n ); - m_spoolBufPos += n; - if( m_spoolBufPos > m_spoolBufSize ) { - return traits_type::eof( ); - } - if( n == 0 || m_spoolFile.eof( ) ) { - return traits_type::eof( ); - } - - break; - } - - // set pointers - setg( m_base, m_start, m_start + n ); - - return traits_type::to_int_type( *gptr( ) ); -} - -void spool_streambuf::rewind( ) -{ - switch( m_state ) { - case TO_SPOOL_MEMORY: - m_spoolBufPos = 0; - m_state = FROM_SPOOL_MEMORY; - break; - - case TO_SPOOL_FILE: - m_spoolFile.close( ); - m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::in ); - m_spoolFile.seekg( 0, ios::end ); - m_spoolBufSize = m_spoolFile.tellg( ); - m_spoolFile.seekg( 0, ios::beg ); - m_spoolBufPos = 0; - m_state = FROM_SPOOL_FILE; - break; - - case FROM_SPOOL_MEMORY: - m_spoolBufPos = 0; - break; - - case FROM_SPOOL_FILE: - m_spoolBufPos = 0; - m_spoolFile.seekg( 0, ios::beg ); - break; - } - - char *end = &m_buf.front( ) + m_buf.size( ); - setg( end, end, end ); - pubseekpos( 0, ios_base::in ); -} - -SpoolRewindInputStream::SpoolRewindInputStream( const URL &url ) - : RewindInputStream( url ), m_buf( 0 ) -{ -} - -SpoolRewindInputStream::~SpoolRewindInputStream( ) -{ -} - -void SpoolRewindInputStream::rewind( ) -{ - // consume rest of web request, force spooling in streambuf - enum { CHUNKSIZE = 1024 }; - char buf[CHUNKSIZE]; - - while( good( ) && !eof( ) ) { - read( buf, CHUNKSIZE ); - } - - ios::clear( ); - assert( m_buf != 0 ); - m_buf->rewind( ); -} diff --git a/src/SpoolRewindInputStream.hpp b/src/SpoolRewindInputStream.hpp deleted file mode 100755 index aff593d..0000000 --- a/src/SpoolRewindInputStream.hpp +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __SPOOLREWINDINPUTSTREAM_H -#define __SPOOLREWINDINPUTSTREAM_H - -#include "RewindInputStream.hpp" - -#include -#include -#include - -class spool_streambuf : public std::streambuf -{ - public: - explicit spool_streambuf( size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 ); - - ~spool_streambuf( ); - - void rewind( ); - - protected: - virtual std::streambuf::int_type readFromSource( ) = 0; - - private: - int_type underflow( ); - - private: - const size_t m_putBack; - std::vector m_spoolBuf; - size_t m_spoolBufPos; - size_t m_spoolBufSize; - std::fstream m_spoolFile; - enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state; - - protected: - std::vector m_buf; - char *m_base; - char *m_start; -}; - -class SpoolRewindInputStream : public RewindInputStream -{ - public: - SpoolRewindInputStream( const URL &url ); - virtual ~SpoolRewindInputStream( ); - - virtual void rewind( ); - - protected: - spool_streambuf *m_buf; -}; - -#endif diff --git a/src/URL.cpp b/src/URL.cpp deleted file mode 100644 index f208500..0000000 --- a/src/URL.cpp +++ /dev/null @@ -1,5 +0,0 @@ -#include "URL.hpp" - -URL URL::Null; - - diff --git a/src/URL.hpp b/src/URL.hpp deleted file mode 100644 index 5cbd733..0000000 --- a/src/URL.hpp +++ /dev/null @@ -1,138 +0,0 @@ -#ifndef __URL_H -#define __URL_H - -#include -#include -#include - -using namespace std; - -class URL { - protected: - string m_protocol; - string m_host; - unsigned short m_port; - string m_path; - string m_query; - string m_fragment; - - public: - URL( ) - : m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_query( "" ), m_fragment( "" ) - { - } - - URL( const URL& url ) - : m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_query( url.m_query ), m_fragment( url.m_fragment ) - { - } - - URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _query, const std::string _fragment ) - : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_query( _query ), m_fragment( _fragment ) - { - } - - URL& operator=( const URL& u ) { - if( this != &u ) { - this->m_protocol = u.m_protocol; - this->m_port = u.m_port; - this->m_host = u.m_host; - this->m_path = u.m_path; - this->m_query = u.m_query; - this->m_fragment = u.m_fragment; - } - return *this; - } - - const string protocol( ) const - { - return m_protocol; - } - - const string host( ) const - { - return m_host; - } - - unsigned short port( ) const - { - return m_port; - } - - const string path( ) const - { - return m_path; - } - - const string query( ) const - { - return m_query; - } - - std::string fragment( ) const - { - return m_fragment; - } - - std::string str( ) const - { - std::ostringstream os; - os << *this; - return os.str( ); - } - - static URL Null; - - bool operator!=( const URL &other ) const - { - return( str( ) != other.str( ) ); - } - - bool operator==( const URL &other ) const - { - return( str( ) == other.str( ) ); - } - - bool operator<( const URL &other ) const - { - return( str( ) < other.str( ) ); - } - - template< typename CharT, typename TraitsT > friend - basic_ostream& operator<<( basic_ostream&s, const URL& u ); - - static unsigned short defaultPort( const std::string p ) - { - if( p == "http" ) return 80; - else if( p == "https" ) return 443; - else if( p == "ftp" ) return 21; - else return 0; - } -}; - -template< typename CharT, typename TraitsT > -inline basic_ostream& operator<<( basic_ostream&s, const URL& u ) { - if( u.protocol( ).empty( ) ) { - return s; - } - - s << u.protocol( ) << "://" << u.host( ); - - if( u.port( ) != URL::defaultPort( u.protocol( ) ) ) { - s << ":" << u.port( ); - } - - s << u.path( ); - - if( !u.query( ).empty( ) ) { - s << "?" << u.query( ); - } - - if( !u.fragment( ).empty( ) ) { - s << "#" << u.fragment( ); - } - - return s; -} - -#endif diff --git a/src/URLFilter.hpp b/src/URLFilter.hpp deleted file mode 100644 index 2136009..0000000 --- a/src/URLFilter.hpp +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __URLFILTER_H -#define __URLFILTER_H - -#include "URL.hpp" - -class URLFilter -{ - public: - virtual ~URLFilter( ) { }; - - virtual bool filter( const URL url ) = 0; -}; - -#endif diff --git a/src/URLNormalizer.hpp b/src/URLNormalizer.hpp deleted file mode 100644 index af1781a..0000000 --- a/src/URLNormalizer.hpp +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __URLNORMALIZER_H -#define __URLNORMALIZER_H - -#include - -#include "URL.hpp" - -class URLNormalizer { - public: - virtual ~URLNormalizer( ) { }; - - virtual URL parseUrl( const std::string s ) = 0; - - virtual URL normalize( const URL url, const std::string s ) = 0; -}; - -#endif diff --git a/src/URLSeen.hpp b/src/URLSeen.hpp deleted file mode 100644 index 742c863..0000000 --- a/src/URLSeen.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __URLSEEN_H -#define __URLSEEN_H - -#include "URL.hpp" - -class URLSeen { - public: - virtual ~URLSeen( ) { }; - virtual bool seen( const URL url ) = 0; -}; - -#endif diff --git a/src/crawl.cpp b/src/crawl.cpp deleted file mode 100755 index 823ed02..0000000 --- a/src/crawl.cpp +++ /dev/null @@ -1,238 +0,0 @@ -#include "Fetcher.hpp" -#include "Frontier.hpp" -#include "Deduper.hpp" -#include "Processor.hpp" -#include "URLSeen.hpp" -#include "URLNormalizer.hpp" -#include "URLFilter.hpp" -#include "TypeDetect.hpp" - -#include "ModuleLoader.hpp" - -#include "Logger.hpp" - -#include -#include -#include - -#ifndef _WIN32 -#include -#else -#define WIN32_MEAN_AND_LEAN -#endif - -using namespace std; - -static bool term = false; - -#ifndef _WIN32 - -static void terminate_func( int sig ) -{ - (void)sig; - term = true; -} - -#else - -BOOL WINAPI termHandler( DWORD ctrlType ) -{ - switch( ctrlType ){ - case CTRL_C_EVENT: - case CTRL_BREAK_EVENT: - case CTRL_CLOSE_EVENT: - case CTRL_LOGOFF_EVENT: - case CTRL_SHUTDOWN_EVENT: - term = true; - return TRUE; - default: - return FALSE; - } -} - -#endif - -int main( void ) -{ - try { - Logger::instance( ).openConsoleLog( logINFO ); - -#ifndef _WIN32 - struct sigaction sa; - memset( &sa, 0, sizeof( struct sigaction ) ); - sa.sa_handler = terminate_func; - sa.sa_flags = SA_RESTART; - if( sigaction( SIGINT, &sa, NULL ) < 0 ) { - cerr << "Unable to install termianation signal handler" << endl; - } -#else - SetConsoleCtrlHandler( termHandler, TRUE ); -#endif - - LOG( logNOTICE ) << "Loading modules"; - - vector normalizerModules; -#ifndef _WIN32 - normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" ); - normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" ); -#else - normalizerModules.push_back( ".\\modules\\urlnormalizer\\simpleurl\\mod_urlnormalizer_simple.dll" ); - normalizerModules.push_back( ".\\modules\\urlnormalizer\\googleurl\\mod_urlnormalizer_googleurl.dll" ); -#endif - ModuleLoader urlNormalizers( normalizerModules ); - - vector filterModules; -#ifndef _WIN32 - filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" ); - filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" ); -#else - filterModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" ); - filterModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" ); -#endif - ModuleLoader ) > urlFilters( filterModules ); - - vector filterChainModules; -#ifndef _WIN32 - filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" ); -#else - filterChainModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" ); -#endif - ModuleLoader ) > urlChainFilter( filterChainModules ); - - vector frontierModules; -#ifndef _WIN32 - frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" ); -#else - frontierModules.push_back( ".\\modules\\frontier\\memory\\mod_frontier_memory.dll" ); -#endif - ModuleLoader frontiers( frontierModules ); - - vector fetcherModules; -#ifndef _WIN32 - fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" ); -#else - fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" ); -#endif - ModuleLoader fetchers( fetcherModules ); - - vector urlseenModules; -#ifndef _WIN32 - urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" ); -#else - urlseenModules.push_back( ".\\modules\\urlseen\\memory\\mod_urlseen_memory.dll" ); -#endif - ModuleLoader urlSeens( urlseenModules ); - - vector deduperModules; -#ifndef _WIN32 - deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" ); -#else - deduperModules.push_back( ".\\modules\\deduper\\null\\mod_deduper_null.dll" ); -#endif - ModuleLoader dedupers( deduperModules ); - - vector processorModules; -#ifndef _WIN32 - processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" ); -#else - processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" ); -#endif - ModuleLoader processors( processorModules ); - - vector typeDetectModules; -#ifndef _WIN32 - typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic.so" ); -#endif - ModuleLoader typeDetectors( typeDetectModules ); - - Frontier *frontier = frontiers.create( "memory_frontier" ); -#ifndef _WIN32 - Fetcher *fetcher = fetchers.create( "libfetch_fetcher" ); -#else - Fetcher *fetcher = fetchers.create( "winhttp_fetcher" ); -#endif - Deduper *deduper = dedupers.create( "null_deduper" ); - URLSeen *urlSeen = urlSeens.create( "memory_urlseen" ); -#ifndef _WIN32 - TypeDetect *typeDetect = typeDetectors.create( "libmagic_typedetect" ); -#endif - - set protocols; - protocols.insert( "http" ); - protocols.insert( "https" ); - URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols ); - - set hosts; - hosts.insert( "www.andreasbaumann.cc" ); - URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts ); - - list filters; - filters.push_back( hostFilter ); - filters.push_back( protocolFilter ); - URLFilter *chainFilter = urlChainFilter.create( "chain_urlfilter", filters ); - - URLNormalizer *normalizer = urlNormalizers.create( "google_urlnormalizer" ); -// URLNormalizer *normalizer = urlNormalizers.create( "simple_urlnormalizer" ); - - Processor *htmlParser = processors.create( "htmllinkextract_processor", - normalizer, frontier, chainFilter, urlSeen ); - - LOG( logNOTICE ) << "Crawler started.."; - - frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) ); - - URL url; - while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) { - LOG( logINFO ) << "Got URL " << url; - RewindInputStream *s = fetcher->fetch( url ); - if( !s->good( ) ) { - LOG( logERROR ) << "Fetching URL '" << url << "' failed!"; - continue; - } - - if( deduper->contentSeen( url, s ) ) { - LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen"; - delete s; - continue; - } - -#ifndef _WIN32 - MIMEType mimeType = typeDetect->detect( s ); - - if( mimeType != MIMEType::Null ) { - if( mimeType == "text/html" ) { - s->rewind( ); - htmlParser->process( s ); - } else if( mimeType == "application/x-gzip" ) { - s->rewind( ); - LOG( logINFO ) << "Storing archive " << url; - } - } -#else - htmlParser->process( s ); -#endif - - delete s; - } - - processors.destroy( htmlParser ); - urlNormalizers.destroy( normalizer ); - urlChainFilter.destroy( chainFilter ); - urlFilters.destroy( protocolFilter ); - urlFilters.destroy( hostFilter ); -#ifndef _WIN32 - typeDetectors.destroy( typeDetect ); -#endif - urlSeens.destroy( urlSeen ); - dedupers.destroy( deduper ); - fetchers.destroy( fetcher ); - frontiers.destroy( frontier ); - - LOG( logNOTICE ) << "Crawler stopped.. normal shutdown.."; - - return 0; - } catch( exception &e ) { - LOG( logFATAL ) << "Crawler stopped: " << e.what( ); - return 1; - } -} diff --git a/src/crawl/GNUmakefile b/src/crawl/GNUmakefile new file mode 100755 index 0000000..6899fde --- /dev/null +++ b/src/crawl/GNUmakefile @@ -0,0 +1,55 @@ +TOPDIR = ../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_CPPFLAGS = \ + +INCLUDE_DIRS = \ + -I. \ + -I$(TOPDIR)/include/logger \ + -I$(TOPDIR)/include/util \ + -I$(TOPDIR)/include/module + +INCLUDE_LDFLAGS = \ + -L$(TOPDIR)/src/logger + +INCLUDE_LIBS = \ + -llogger + +# openssl +ifeq ($(WITH_SSL),1) + +INCLUDE_CFLAGS += \ + -DWITH_SSL + +INCLUDE_LIBS += \ + $(OPENSSL_LIBS) +endif + +CPP_OBJS = \ + +CPP_BINS = \ + crawl$(EXE) + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + $(INSTALL) -d -m 0755 $(DESTDIR)$(bindir) + $(INSTALL) -m 0775 crawl$(EXE) $(DESTDIR)$(bindir) + +local_uninstall: + @-rm -f $(DESTDIR)$(bindir)/crawl + @-rmdir $(DESTDIR)$(bindir) + +local_test: + +run: + @LD_LIBRARY_PATH=$(TOPDIR)/src:$(TOPDIR)/src/logger:$(TOPDIR)/googleurl:$(TOPDIR)/libfetch:$(TOPDIR)/streamhtmlparser ./crawl diff --git a/src/crawl/Makefile.W32 b/src/crawl/Makefile.W32 new file mode 100755 index 0000000..74442dc --- /dev/null +++ b/src/crawl/Makefile.W32 @@ -0,0 +1,39 @@ +TOPDIR = ..\.. + +SUBDIRS = + +!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk + +INCLUDE_CXXFLAGS = \ + /D_WIN32_WINNT=0x504 \ + /DSHARED + +INCLUDE_DIRS = \ + /I. \ + /I$(TOPDIR)\include\logger \ + /I$(TOPDIR)\include\module \ + /I$(TOPDIR)\include\util \ + /I$(TOPDIR)\include\crawler + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)\src\logger\logger.lib \ + $(TOPDIR)\src\libcrawler\crawler.lib + +CPP_OBJS = \ + +CPP_BINS = \ + crawl.exe + +!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk + +crawl.exe: crawl.obj + +local_all: $(CPP_BINS) + +local_clean: + +local_distclean: + +local_test: diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp new file mode 100755 index 0000000..823ed02 --- /dev/null +++ b/src/crawl/crawl.cpp @@ -0,0 +1,238 @@ +#include "Fetcher.hpp" +#include "Frontier.hpp" +#include "Deduper.hpp" +#include "Processor.hpp" +#include "URLSeen.hpp" +#include "URLNormalizer.hpp" +#include "URLFilter.hpp" +#include "TypeDetect.hpp" + +#include "ModuleLoader.hpp" + +#include "Logger.hpp" + +#include +#include +#include + +#ifndef _WIN32 +#include +#else +#define WIN32_MEAN_AND_LEAN +#endif + +using namespace std; + +static bool term = false; + +#ifndef _WIN32 + +static void terminate_func( int sig ) +{ + (void)sig; + term = true; +} + +#else + +BOOL WINAPI termHandler( DWORD ctrlType ) +{ + switch( ctrlType ){ + case CTRL_C_EVENT: + case CTRL_BREAK_EVENT: + case CTRL_CLOSE_EVENT: + case CTRL_LOGOFF_EVENT: + case CTRL_SHUTDOWN_EVENT: + term = true; + return TRUE; + default: + return FALSE; + } +} + +#endif + +int main( void ) +{ + try { + Logger::instance( ).openConsoleLog( logINFO ); + +#ifndef _WIN32 + struct sigaction sa; + memset( &sa, 0, sizeof( struct sigaction ) ); + sa.sa_handler = terminate_func; + sa.sa_flags = SA_RESTART; + if( sigaction( SIGINT, &sa, NULL ) < 0 ) { + cerr << "Unable to install termianation signal handler" << endl; + } +#else + SetConsoleCtrlHandler( termHandler, TRUE ); +#endif + + LOG( logNOTICE ) << "Loading modules"; + + vector normalizerModules; +#ifndef _WIN32 + normalizerModules.push_back( "./modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" ); + normalizerModules.push_back( "./modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" ); +#else + normalizerModules.push_back( ".\\modules\\urlnormalizer\\simpleurl\\mod_urlnormalizer_simple.dll" ); + normalizerModules.push_back( ".\\modules\\urlnormalizer\\googleurl\\mod_urlnormalizer_googleurl.dll" ); +#endif + ModuleLoader urlNormalizers( normalizerModules ); + + vector filterModules; +#ifndef _WIN32 + filterModules.push_back( "./modules/urlfilter/protocol/mod_urlfilter_protocol.so" ); + filterModules.push_back( "./modules/urlfilter/host/mod_urlfilter_host.so" ); +#else + filterModules.push_back( ".\\modules\\urlfilter\\protocol\\mod_urlfilter_protocol.dll" ); + filterModules.push_back( ".\\modules\\urlfilter\\host\\mod_urlfilter_host.dll" ); +#endif + ModuleLoader ) > urlFilters( filterModules ); + + vector filterChainModules; +#ifndef _WIN32 + filterChainModules.push_back( "./modules/urlfilter/chain/mod_urlfilter_chain.so" ); +#else + filterChainModules.push_back( ".\\modules\\urlfilter\\chain\\mod_urlfilter_chain.dll" ); +#endif + ModuleLoader ) > urlChainFilter( filterChainModules ); + + vector frontierModules; +#ifndef _WIN32 + frontierModules.push_back( "./modules/frontier/memory/mod_frontier_memory.so" ); +#else + frontierModules.push_back( ".\\modules\\frontier\\memory\\mod_frontier_memory.dll" ); +#endif + ModuleLoader frontiers( frontierModules ); + + vector fetcherModules; +#ifndef _WIN32 + fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" ); +#else + fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" ); +#endif + ModuleLoader fetchers( fetcherModules ); + + vector urlseenModules; +#ifndef _WIN32 + urlseenModules.push_back( "./modules/urlseen/memory/mod_urlseen_memory.so" ); +#else + urlseenModules.push_back( ".\\modules\\urlseen\\memory\\mod_urlseen_memory.dll" ); +#endif + ModuleLoader urlSeens( urlseenModules ); + + vector deduperModules; +#ifndef _WIN32 + deduperModules.push_back( "./modules/deduper/null/mod_deduper_null.so" ); +#else + deduperModules.push_back( ".\\modules\\deduper\\null\\mod_deduper_null.dll" ); +#endif + ModuleLoader dedupers( deduperModules ); + + vector processorModules; +#ifndef _WIN32 + processorModules.push_back( "./modules/processor/htmllinkextract/mod_processor_htmllinkextract.so" ); +#else + processorModules.push_back( ".\\modules\\processor\\htmllinkextract\\mod_processor_htmllinkextract.dll" ); +#endif + ModuleLoader processors( processorModules ); + + vector typeDetectModules; +#ifndef _WIN32 + typeDetectModules.push_back( "./modules/typedetect/libmagic/mod_typedetect_libmagic.so" ); +#endif + ModuleLoader typeDetectors( typeDetectModules ); + + Frontier *frontier = frontiers.create( "memory_frontier" ); +#ifndef _WIN32 + Fetcher *fetcher = fetchers.create( "libfetch_fetcher" ); +#else + Fetcher *fetcher = fetchers.create( "winhttp_fetcher" ); +#endif + Deduper *deduper = dedupers.create( "null_deduper" ); + URLSeen *urlSeen = urlSeens.create( "memory_urlseen" ); +#ifndef _WIN32 + TypeDetect *typeDetect = typeDetectors.create( "libmagic_typedetect" ); +#endif + + set protocols; + protocols.insert( "http" ); + protocols.insert( "https" ); + URLFilter *protocolFilter = urlFilters.create( "protocol_urlfilter", protocols ); + + set hosts; + hosts.insert( "www.andreasbaumann.cc" ); + URLFilter *hostFilter = urlFilters.create( "host_urlfilter", hosts ); + + list filters; + filters.push_back( hostFilter ); + filters.push_back( protocolFilter ); + URLFilter *chainFilter = urlChainFilter.create( "chain_urlfilter", filters ); + + URLNormalizer *normalizer = urlNormalizers.create( "google_urlnormalizer" ); +// URLNormalizer *normalizer = urlNormalizers.create( "simple_urlnormalizer" ); + + Processor *htmlParser = processors.create( "htmllinkextract_processor", + normalizer, frontier, chainFilter, urlSeen ); + + LOG( logNOTICE ) << "Crawler started.."; + + frontier->addUrl( normalizer->parseUrl( "http://www.andreasbaumann.cc" ) ); + + URL url; + while( !term && ( url = frontier->getNextUrl( ) ) != URL::Null ) { + LOG( logINFO ) << "Got URL " << url; + RewindInputStream *s = fetcher->fetch( url ); + if( !s->good( ) ) { + LOG( logERROR ) << "Fetching URL '" << url << "' failed!"; + continue; + } + + if( deduper->contentSeen( url, s ) ) { + LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen"; + delete s; + continue; + } + +#ifndef _WIN32 + MIMEType mimeType = typeDetect->detect( s ); + + if( mimeType != MIMEType::Null ) { + if( mimeType == "text/html" ) { + s->rewind( ); + htmlParser->process( s ); + } else if( mimeType == "application/x-gzip" ) { + s->rewind( ); + LOG( logINFO ) << "Storing archive " << url; + } + } +#else + htmlParser->process( s ); +#endif + + delete s; + } + + processors.destroy( htmlParser ); + urlNormalizers.destroy( normalizer ); + urlChainFilter.destroy( chainFilter ); + urlFilters.destroy( protocolFilter ); + urlFilters.destroy( hostFilter ); +#ifndef _WIN32 + typeDetectors.destroy( typeDetect ); +#endif + urlSeens.destroy( urlSeen ); + dedupers.destroy( deduper ); + fetchers.destroy( fetcher ); + frontiers.destroy( frontier ); + + LOG( logNOTICE ) << "Crawler stopped.. normal shutdown.."; + + return 0; + } catch( exception &e ) { + LOG( logFATAL ) << "Crawler stopped: " << e.what( ); + return 1; + } +} diff --git a/src/libcrawler/GNUmakefile b/src/libcrawler/GNUmakefile new file mode 100755 index 0000000..c1e7a7f --- /dev/null +++ b/src/libcrawler/GNUmakefile @@ -0,0 +1,42 @@ +TOPDIR = ../.. + +SUBDIRS = + +-include $(TOPDIR)/makefiles/gmake/platform.mk + +INCLUDE_CPPFLAGS = \ + +INCLUDE_LDFLAGS = \ + +INCLUDE_DIRS = \ + -I. \ + -I$(TOPDIR)/include/logger \ + -I$(TOPDIR)/include/util + +INCLUDE_LIBS = \ + +STATIC_LIB = libcrawler.a + +DYNAMIC_LIB = libcrawler.so +DYNAMIC_LIB_MAJOR = 0 +DYNAMIC_LIB_MINOR = 0 +DYNAMIC_LIB_PATCH = 0 + +CPP_OBJS = \ + URL.o \ + MIMEType.o \ + SpoolRewindInputStream.o + +-include $(TOPDIR)/makefiles/gmake/sub.mk + +local_all: + +local_clean: + +local_distclean: + +local_install: + +local_uninstall: + +local_test: diff --git a/src/libcrawler/MIMEType.cpp b/src/libcrawler/MIMEType.cpp new file mode 100644 index 0000000..25dc20c --- /dev/null +++ b/src/libcrawler/MIMEType.cpp @@ -0,0 +1,5 @@ +#include "MIMEType.hpp" + +MIMEType MIMEType::Null; + + diff --git a/src/libcrawler/Makefile.W32 b/src/libcrawler/Makefile.W32 new file mode 100755 index 0000000..ab18d2c --- /dev/null +++ b/src/libcrawler/Makefile.W32 @@ -0,0 +1,45 @@ +TOPDIR = ..\.. + +SUBDIRS = + +!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk + +INCLUDE_CXXFLAGS = \ + /D_WIN32_WINNT=0x504 \ + /DBUILDING_CRAWLER + +INCLUDE_DIRS = \ + /I. \ + /I$(TOPDIR)\include\crawler \ + /I$(TOPDIR)\include\logger \ + /I$(TOPDIR)\include\util + +INCLUDE_LDFLAGS = \ + +INCLUDE_LIBS = \ + $(TOPDIR)\src\logger\logger.lib + +CPP_OBJS = \ + win32\errormsg.dllobj \ + win32\stringutils.dllobj \ + URL.dllobj \ + MIMEType.dllobj \ + SpoolRewindInputStream.dllobj + +DYNAMIC_LIB = \ + crawler.dll + +!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk + +$(DYNAMIC_LIB): $(CPP_OBJS) + $(LINK) /nologo /dll /out:$@ $(LDFLAGS) $(LIBS) $? + +local_all: $(DYNAMIC_LIB) + +local_clean: + @-erase $(DYNAMIC_LIB) 2>NUL + @-erase win32\*.obj 2>NUL + +local_distclean: + +local_test: diff --git a/src/libcrawler/SpoolRewindInputStream.cpp b/src/libcrawler/SpoolRewindInputStream.cpp new file mode 100644 index 0000000..9135741 --- /dev/null +++ b/src/libcrawler/SpoolRewindInputStream.cpp @@ -0,0 +1,181 @@ +#include "SpoolRewindInputStream.hpp" +#include "Logger.hpp" + +#include +#include +#include + +using namespace std; + +spool_streambuf::spool_streambuf( size_t bufSize, size_t putBack, size_t spoolBufSize ) + : m_putBack( max( putBack, size_t( 1 ) ) ), + m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ), + m_spoolBufSize( spoolBufSize ), m_state( TO_SPOOL_MEMORY ), + m_buf( max( bufSize, putBack ) + putBack ), + m_base( 0 ), m_start( 0 ) +{ + char *end = &m_buf.front( ) + m_buf.size( ); + setg( end, end, end ); +} + +spool_streambuf::~spool_streambuf( ) +{ + switch( m_state ) { + case TO_SPOOL_MEMORY: + case FROM_SPOOL_MEMORY: + // memory only, nothing to clean up + break; + + case TO_SPOOL_FILE: + case FROM_SPOOL_FILE: + m_spoolFile.close( ); + (void)remove( "/tmp/spool.tmp" ); + break; + } +} + +streambuf::int_type spool_streambuf::underflow( ) +{ + // check if buffer is exhausted, if not, return current character + if( gptr( ) < egptr( ) ) + return traits_type::to_int_type( *gptr( ) ); + + m_base = &m_buf.front( ); + m_start = m_base; + + // move put back away + if( eback( ) == m_base ) { + memmove( m_base, egptr( ) - m_putBack, m_putBack ); + m_start += m_putBack; + } + + // read from source or spool (depends on calling rewind) + streambuf::int_type n; + switch( m_state ) { + case TO_SPOOL_MEMORY: + case TO_SPOOL_FILE: + n = readFromSource( ); + if( n == 0 ) { + return traits_type::eof( ); + } else if( n < 0 ) { + // TODO handle error + return traits_type::eof( ); + } + + if( m_state == TO_SPOOL_MEMORY ) { + // as long we can "spool" to memory, do so.. + if( m_spoolBufPos + n <= m_spoolBufSize ) { + m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, m_start, m_start + n ); + m_spoolBufPos += n; + } else { + // ..otherwise start spooling to disk, write + // current memory spool buffer first.. + LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBufSize << ")"; + m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc ); + assert( m_spoolFile.good( ) ); + m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize ); + assert( m_spoolFile.good( ) ); + m_state = TO_SPOOL_FILE; + m_spoolFile.write( m_start, n ); + assert( m_spoolFile.good( ) ); + } + } else { + // we are appending to the spool file + assert( m_spoolFile.good( ) ); + m_spoolFile.write( m_start, n ); + assert( m_spoolFile.good( ) ); + } + + break; + + case FROM_SPOOL_MEMORY: + n = min( m_buf.size( ) - ( m_start - m_base ), m_spoolBufSize - m_spoolBufPos ); + if( n == 0 ) { + return traits_type::eof( ); + } + + copy( m_spoolBuf.begin( ) + m_spoolBufPos, + m_spoolBuf.begin( ) + m_spoolBufPos + n, + m_buf.begin( ) + ( m_start - m_base ) ); + + m_spoolBufPos += n; + + break; + + case FROM_SPOOL_FILE: + + n = min( m_buf.size( ) - ( m_start - m_base ), m_spoolBufSize - m_spoolBufPos ); + m_spoolFile.read( m_start, n ); + m_spoolBufPos += n; + if( m_spoolBufPos > m_spoolBufSize ) { + return traits_type::eof( ); + } + if( n == 0 || m_spoolFile.eof( ) ) { + return traits_type::eof( ); + } + + break; + } + + // set pointers + setg( m_base, m_start, m_start + n ); + + return traits_type::to_int_type( *gptr( ) ); +} + +void spool_streambuf::rewind( ) +{ + switch( m_state ) { + case TO_SPOOL_MEMORY: + m_spoolBufPos = 0; + m_state = FROM_SPOOL_MEMORY; + break; + + case TO_SPOOL_FILE: + m_spoolFile.close( ); + m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::in ); + m_spoolFile.seekg( 0, ios::end ); + m_spoolBufSize = m_spoolFile.tellg( ); + m_spoolFile.seekg( 0, ios::beg ); + m_spoolBufPos = 0; + m_state = FROM_SPOOL_FILE; + break; + + case FROM_SPOOL_MEMORY: + m_spoolBufPos = 0; + break; + + case FROM_SPOOL_FILE: + m_spoolBufPos = 0; + m_spoolFile.seekg( 0, ios::beg ); + break; + } + + char *end = &m_buf.front( ) + m_buf.size( ); + setg( end, end, end ); + pubseekpos( 0, ios_base::in ); +} + +SpoolRewindInputStream::SpoolRewindInputStream( const URL &url ) + : RewindInputStream( url ), m_buf( 0 ) +{ +} + +SpoolRewindInputStream::~SpoolRewindInputStream( ) +{ +} + +void SpoolRewindInputStream::rewind( ) +{ + // consume rest of web request, force spooling in streambuf + enum { CHUNKSIZE = 1024 }; + char buf[CHUNKSIZE]; + + while( good( ) && !eof( ) ) { + read( buf, CHUNKSIZE ); + } + + ios::clear( ); + assert( m_buf != 0 ); + m_buf->rewind( ); +} diff --git a/src/libcrawler/URL.cpp b/src/libcrawler/URL.cpp new file mode 100644 index 0000000..f208500 --- /dev/null +++ b/src/libcrawler/URL.cpp @@ -0,0 +1,5 @@ +#include "URL.hpp" + +URL URL::Null; + + diff --git a/src/libcrawler/win32/errormsg.cpp b/src/libcrawler/win32/errormsg.cpp new file mode 100755 index 0000000..c0a65d8 --- /dev/null +++ b/src/libcrawler/win32/errormsg.cpp @@ -0,0 +1,27 @@ +#include "win32/errormsg.hpp" + +using namespace std; + +#define WIN32_MEAN_AND_LEAN +#include + +string getLastError( ) +{ + LPTSTR buf; + DWORD size; + + DWORD lastErr = GetLastError( ); + + if( !FormatMessage( + FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS | + FORMAT_MESSAGE_MAX_WIDTH_MASK, + NULL, lastErr, 0, (LPTSTR)&buf, + 0, NULL ) ) { + return ""; + } + + return string( buf ); +} + diff --git a/src/libcrawler/win32/stringutils.cpp b/src/libcrawler/win32/stringutils.cpp new file mode 100755 index 0000000..607735c --- /dev/null +++ b/src/libcrawler/win32/stringutils.cpp @@ -0,0 +1,21 @@ +#include "win32/stringutils.hpp" + +using namespace std; + +#define WIN32_MEAN_AND_LEAN +#include + +std::wstring s2ws( const std::string &s ) +{ + // get size for buffer and allocate it + int len; + int slength = (int)s.length( )+1; + len = MultiByteToWideChar( CP_ACP, 0, s.c_str( ), slength, 0, 0 ); + wchar_t *buf = new wchar_t[len]; + + // convert + MultiByteToWideChar( CP_ACP, 0, s.c_str( ), slength, buf, len ); + std::wstring res( buf ); + delete[] buf; + return res; +} diff --git a/src/logger/Makefile.W32 b/src/logger/Makefile.W32 index 3f9352d..d5516ca 100755 --- a/src/logger/Makefile.W32 +++ b/src/logger/Makefile.W32 @@ -6,7 +6,7 @@ SUBDIRS = INCLUDE_CXXFLAGS = \ /D_WIN32_WINNT=0x504 \ - /DSHARED + /DSHARED /DBUILDING_LOGGER /DBUILDING_UTIL INCLUDE_DIRS = \ /I. \ diff --git a/src/modules/deduper/null/Makefile.W32 b/src/modules/deduper/null/Makefile.W32 index e01235f..e0d6c60 100755 --- a/src/modules/deduper/null/Makefile.W32 +++ b/src/modules/deduper/null/Makefile.W32 @@ -11,12 +11,21 @@ INCLUDE_DIRS = \ /I. \ /I$(TOPDIR)\src \ /I$(TOPDIR)\include\module \ - /I$(TOPDIR)\include\util + /I$(TOPDIR)\include\util \ + /I$(TOPDIR)\include\crawler + +copy_prereq: + @-copy "$(ICU_DIR)\bin\icuuc49.dll" . >NUL + @-copy "$(ICU_DIR)\bin\icudt49.dll" . >NUL + @-copy "$(TOPDIR)\src\logger\logger.dll" . >NUL + +run: copy_prereq + @-crawl.exe INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\src\crawler.lib + $(TOPDIR)\src\libcrawler\crawler.lib DYNAMIC_MODULE = \ mod_deduper_null.dll diff --git a/src/modules/fetcher/file/Makefile.W32 b/src/modules/fetcher/file/Makefile.W32 index 3203d6d..a77daa6 100755 --- a/src/modules/fetcher/file/Makefile.W32 +++ b/src/modules/fetcher/file/Makefile.W32 @@ -11,12 +11,13 @@ INCLUDE_DIRS = \ /I. \ /I$(TOPDIR)\src \ /I$(TOPDIR)\include\module \ - /I$(TOPDIR)\include\util + /I$(TOPDIR)\include\util \ + /I$(TOPDIR)\include\crawler INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\src\crawler.lib + $(TOPDIR)\src\libcrawler\crawler.lib DYNAMIC_MODULE = \ mod_fetcher_file.dll diff --git a/src/modules/fetcher/winhttp/Makefile.W32 b/src/modules/fetcher/winhttp/Makefile.W32 index b46aa88..4cedb9c 100755 --- a/src/modules/fetcher/winhttp/Makefile.W32 +++ b/src/modules/fetcher/winhttp/Makefile.W32 @@ -12,12 +12,13 @@ INCLUDE_DIRS = \ /I$(TOPDIR)\src \ /I$(TOPDIR)\include\module \ /I$(TOPDIR)\include\util \ - /I$(TOPDIR)\include\logger + /I$(TOPDIR)\include\logger \ + /I$(TOPDIR)\include\crawler INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\src\crawler.lib \ + $(TOPDIR)\src\libcrawler\crawler.lib \ $(TOPDIR)\src\logger\logger.lib \ WinHttp.lib diff --git a/src/modules/frontier/memory/Makefile.W32 b/src/modules/frontier/memory/Makefile.W32 index b44d95f..b12e4ca 100755 --- a/src/modules/frontier/memory/Makefile.W32 +++ b/src/modules/frontier/memory/Makefile.W32 @@ -12,12 +12,13 @@ INCLUDE_DIRS = \ /I$(TOPDIR)\src \ /I$(TOPDIR)\include\module \ /I$(TOPDIR)\include\util \ - /I$(TOPDIR)\include\logger + /I$(TOPDIR)\include\logger \ + /I$(TOPDIR)\include\crawler INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\src\crawler.lib \ + $(TOPDIR)\src\libcrawler\crawler.lib \ $(TOPDIR)\src\logger\logger.lib DYNAMIC_MODULE = \ diff --git a/src/modules/urlfilter/chain/Makefile.W32 b/src/modules/urlfilter/chain/Makefile.W32 index 5a766ab..d3ad373 100755 --- a/src/modules/urlfilter/chain/Makefile.W32 +++ b/src/modules/urlfilter/chain/Makefile.W32 @@ -12,12 +12,12 @@ INCLUDE_DIRS = \ /I$(TOPDIR)\src \ /I$(TOPDIR)\include\module \ /I$(TOPDIR)\include\util \ - /I$(TOPDIR)\include\logger + /I$(TOPDIR)\include\crawler INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\src\crawler.lib + $(TOPDIR)\src\libcrawler\crawler.lib DYNAMIC_MODULE = \ mod_urlfilter_chain.dll diff --git a/src/modules/urlfilter/host/Makefile.W32 b/src/modules/urlfilter/host/Makefile.W32 index 3b99125..024dc67 100755 --- a/src/modules/urlfilter/host/Makefile.W32 +++ b/src/modules/urlfilter/host/Makefile.W32 @@ -12,12 +12,13 @@ INCLUDE_DIRS = \ /I$(TOPDIR)\src \ /I$(TOPDIR)\include\module \ /I$(TOPDIR)\include\util \ - /I$(TOPDIR)\include\logger + /I$(TOPDIR)\include\logger \ + /I$(TOPDIR)\include\crawler INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\src\crawler.lib \ + $(TOPDIR)\src\libcrawler\crawler.lib \ $(TOPDIR)\src\logger\logger.lib DYNAMIC_MODULE = \ diff --git a/src/modules/urlfilter/protocol/Makefile.W32 b/src/modules/urlfilter/protocol/Makefile.W32 index 747d714..e859829 100755 --- a/src/modules/urlfilter/protocol/Makefile.W32 +++ b/src/modules/urlfilter/protocol/Makefile.W32 @@ -12,12 +12,13 @@ INCLUDE_DIRS = \ /I$(TOPDIR)\src \ /I$(TOPDIR)\include\module \ /I$(TOPDIR)\include\util \ - /I$(TOPDIR)\include\logger + /I$(TOPDIR)\include\logger \ + /I$(TOPDIR)\include\crawler INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\src\crawler.lib \ + $(TOPDIR)\src\libcrawler\crawler.lib \ $(TOPDIR)\src\logger\logger.lib DYNAMIC_MODULE = \ diff --git a/src/modules/urlnormalizer/googleurl/Makefile.W32 b/src/modules/urlnormalizer/googleurl/Makefile.W32 index 4cc09c1..a906404 100755 --- a/src/modules/urlnormalizer/googleurl/Makefile.W32 +++ b/src/modules/urlnormalizer/googleurl/Makefile.W32 @@ -12,13 +12,14 @@ INCLUDE_DIRS = \ /I$(TOPDIR)\src \ /I$(TOPDIR)\include\module \ /I$(TOPDIR)\include\util \ + /I$(TOPDIR)\include\crawler \ /I$(TOPDIR)\googleurl INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ $(TOPDIR)\googleurl\googleurl.lib \ - $(TOPDIR)\src\crawler.lib \ + $(TOPDIR)\src\libcrawler\crawler.lib \ "$(ICU_DIR)\lib\icuuc.lib" DYNAMIC_MODULE = \ diff --git a/src/modules/urlnormalizer/simpleurl/Makefile.W32 b/src/modules/urlnormalizer/simpleurl/Makefile.W32 index 69e732a..2a26e9c 100755 --- a/src/modules/urlnormalizer/simpleurl/Makefile.W32 +++ b/src/modules/urlnormalizer/simpleurl/Makefile.W32 @@ -11,12 +11,13 @@ INCLUDE_DIRS = \ /I. \ /I$(TOPDIR)\src \ /I$(TOPDIR)\include\module \ - /I$(TOPDIR)\include\util + /I$(TOPDIR)\include\util \ + /I$(TOPDIR)\include\crawler INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\src\crawler.lib + $(TOPDIR)\src\libcrawler\crawler.lib DYNAMIC_MODULE = \ mod_urlnormalizer_simple.dll diff --git a/src/win32/errormsg.cpp b/src/win32/errormsg.cpp deleted file mode 100755 index 1b58ea3..0000000 --- a/src/win32/errormsg.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#include "errormsg.hpp" - -using namespace std; - -#define WIN32_MEAN_AND_LEAN -#include - -string getLastError( ) -{ - LPTSTR buf; - DWORD size; - - DWORD lastErr = GetLastError( ); - - if( !FormatMessage( - FORMAT_MESSAGE_ALLOCATE_BUFFER | - FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS | - FORMAT_MESSAGE_MAX_WIDTH_MASK, - NULL, lastErr, 0, (LPTSTR)&buf, - 0, NULL ) ) { - return ""; - } - - return string( buf ); -} - diff --git a/src/win32/errormsg.hpp b/src/win32/errormsg.hpp deleted file mode 100755 index 443e27d..0000000 --- a/src/win32/errormsg.hpp +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __ERRORMSG_H -#define __ERRORMSG_H - -#include - -std::string getLastError( ); - -#endif diff --git a/src/win32/stringutils.cpp b/src/win32/stringutils.cpp deleted file mode 100755 index a82dd7a..0000000 --- a/src/win32/stringutils.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include "errormsg.hpp" - -using namespace std; - -#define WIN32_MEAN_AND_LEAN -#include - -std::wstring s2ws( const std::string &s ) -{ - // get size for buffer and allocate it - int len; - int slength = (int)s.length( )+1; - len = MultiByteToWideChar( CP_ACP, 0, s.c_str( ), slength, 0, 0 ); - wchar_t *buf = new wchar_t[len]; - - // convert - MultiByteToWideChar( CP_ACP, 0, s.c_str( ), slength, buf, len ); - std::wstring res( buf ); - delete[] buf; - return res; -} diff --git a/src/win32/stringutils.hpp b/src/win32/stringutils.hpp deleted file mode 100755 index 6d4bd80..0000000 --- a/src/win32/stringutils.hpp +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __STRINGUTILS_H -#define __STRINGUTILS_H - -#include - -std::wstring s2ws( const std::string &s ); - -#endif diff --git a/tests/logger/Makefile.W32 b/tests/logger/Makefile.W32 index e499e28..783add0 100755 --- a/tests/logger/Makefile.W32 +++ b/tests/logger/Makefile.W32 @@ -5,7 +5,8 @@ SUBDIRS = !INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk INCLUDE_CXXFLAGS = \ - /D_WIN32_WINNT=0x504 + /D_WIN32_WINNT=0x504 \ + /DSHARED INCLUDE_DIRS = \ /I. \ @@ -16,7 +17,6 @@ INCLUDE_DIRS = \ INCLUDE_LDFLAGS = \ INCLUDE_LIBS = \ - $(TOPDIR)\src\crawler.lib \ $(TOPDIR)\src\logger\logger.lib \ Ws2_32.lib diff --git a/tests/utils/Makefile.W32 b/tests/utils/Makefile.W32 index 03ea7ae..ec6f9e7 100755 --- a/tests/utils/Makefile.W32 +++ b/tests/utils/Makefile.W32 @@ -5,9 +5,8 @@ SUBDIRS = !INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk INCLUDE_CXXFLAGS = \ - /D_WIN32_WINNT=0x504 \ - /DNODLL - + /D_WIN32_WINNT=0x504 + INCLUDE_DIRS = \ /I. \ /I$(TOPDIR)\src \ diff --git a/tests/utils/test3.cpp b/tests/utils/test3.cpp index 2a9a37f..a07365a 100755 --- a/tests/utils/test3.cpp +++ b/tests/utils/test3.cpp @@ -30,8 +30,6 @@ class Logger : public Singleton< Logger > } }; -DEFINE_SINGLETON( Logger ) - /* this works, and two loggers can coexist, but they have different type class DerivedLogger : public Logger { -- cgit v1.2.3-54-g00ecf