From 13fc9a7da5111f4ddba942d3c6b6b8654ce395d6 Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Thu, 6 Sep 2012 22:18:23 +0200 Subject: more splitting into libcrawl, crawl binary moved more public header to 'include' changed approach for dynamic linking on Windows --- include/crawler/CrawlerExportable.hpp | 26 ++++++ include/crawler/DNSResolver.hpp | 7 ++ include/crawler/Deduper.hpp | 15 ++++ include/crawler/Fetcher.hpp | 15 ++++ include/crawler/Frontier.hpp | 16 ++++ include/crawler/MIMEType.hpp | 100 +++++++++++++++++++++ include/crawler/Processor.hpp | 13 +++ include/crawler/RewindInputStream.hpp | 32 +++++++ include/crawler/SpoolRewindInputStream.hpp | 51 +++++++++++ include/crawler/URL.hpp | 140 +++++++++++++++++++++++++++++ include/crawler/URLFilter.hpp | 14 +++ include/crawler/URLNormalizer.hpp | 17 ++++ include/crawler/URLSeen.hpp | 12 +++ include/crawler/win32/errormsg.hpp | 10 +++ include/crawler/win32/stringutils.hpp | 10 +++ 15 files changed, 478 insertions(+) create mode 100755 include/crawler/CrawlerExportable.hpp create mode 100644 include/crawler/DNSResolver.hpp create mode 100644 include/crawler/Deduper.hpp create mode 100755 include/crawler/Fetcher.hpp create mode 100644 include/crawler/Frontier.hpp create mode 100644 include/crawler/MIMEType.hpp create mode 100644 include/crawler/Processor.hpp create mode 100755 include/crawler/RewindInputStream.hpp create mode 100755 include/crawler/SpoolRewindInputStream.hpp create mode 100755 include/crawler/URL.hpp create mode 100644 include/crawler/URLFilter.hpp create mode 100644 include/crawler/URLNormalizer.hpp create mode 100644 include/crawler/URLSeen.hpp create mode 100755 include/crawler/win32/errormsg.hpp create mode 100755 include/crawler/win32/stringutils.hpp (limited to 'include/crawler') diff --git a/include/crawler/CrawlerExportable.hpp b/include/crawler/CrawlerExportable.hpp new file mode 100755 index 0000000..5b89108 --- /dev/null +++ b/include/crawler/CrawlerExportable.hpp @@ -0,0 +1,26 @@ 
+#ifndef __CRAWLER_EXPORTABLE_H +#define __CRAWLER_EXPORTABLE_H + +#ifndef _WIN32 + +#define CRAWLER_DLL_VISIBLE + +#else + +#ifdef SHARED + +#ifdef BUILDING_CRAWLER +#define CRAWLER_DLL_VISIBLE __declspec(dllexport) +#else +#define CRAWLER_DLL_VISIBLE __declspec(dllimport) +#endif + +#else + +#define CRAWLER_DLL_VISIBLE + +#endif // BUILDING_CRAWLER + +#endif // _WIN32 + +#endif diff --git a/include/crawler/DNSResolver.hpp b/include/crawler/DNSResolver.hpp new file mode 100644 index 0000000..8f79734 --- /dev/null +++ b/include/crawler/DNSResolver.hpp @@ -0,0 +1,7 @@ +#ifndef __DNSRESOLVER_H +#define __DNSRESOLVER_H + +class DNSResolver { +}; + +#endif diff --git a/include/crawler/Deduper.hpp b/include/crawler/Deduper.hpp new file mode 100644 index 0000000..3cb33c1 --- /dev/null +++ b/include/crawler/Deduper.hpp @@ -0,0 +1,15 @@ +#ifndef __DEDUPER_H +#define __DEDUPER_H + +#include "URL.hpp" +#include "RewindInputStream.hpp" + +class Deduper +{ + public: + virtual ~Deduper( ) { }; + + virtual bool contentSeen( const URL url, RewindInputStream *s ) = 0; +}; + +#endif diff --git a/include/crawler/Fetcher.hpp b/include/crawler/Fetcher.hpp new file mode 100755 index 0000000..40f1c7a --- /dev/null +++ b/include/crawler/Fetcher.hpp @@ -0,0 +1,15 @@ +#ifndef __FETCHER_H +#define __FETCHER_H + +#include "URL.hpp" +#include "RewindInputStream.hpp" + +class Fetcher +{ + public: + virtual ~Fetcher( ) { }; + + virtual RewindInputStream *fetch( const URL url ) = 0; +}; + +#endif diff --git a/include/crawler/Frontier.hpp b/include/crawler/Frontier.hpp new file mode 100644 index 0000000..54c0dd6 --- /dev/null +++ b/include/crawler/Frontier.hpp @@ -0,0 +1,16 @@ +#ifndef __FRONTIER_H +#define __FRONTIER_H + +#include "URL.hpp" + +class Frontier +{ + public: + virtual ~Frontier( ) { }; + + virtual URL getNextUrl( ) = 0; + + virtual void addUrl( const URL url ) = 0; +}; + +#endif diff --git a/include/crawler/MIMEType.hpp b/include/crawler/MIMEType.hpp new file mode 100644 index 
// 0000000..3a628ca
// --- /dev/null
// +++ b/include/crawler/MIMEType.hpp
// @@ -0,0 +1,100 @@
#ifndef __MIMETYPE_H
#define __MIMETYPE_H

// NOTE(review): the targets of these four includes were lost when this text
// was extracted; they are restored from the names used below -- confirm
// against the repository.
#include <string>
#include <cstring>
#include <sstream>
#include <iostream>

// Value type holding the two halves of a "type/subtype" MIME string.
class MIMEType {
	protected:
		std::string m_type;
		std::string m_subtype;

	public:
		// Creates a MIME type whose type and subtype are both empty.
		MIMEType( )
			: m_type( "" ), m_subtype( "" )
		{
		}

		// Stores the two parts verbatim; no validation is performed.
		MIMEType( const std::string &_type, const std::string &_subtype )
			: m_type( _type ), m_subtype( _subtype )
		{
		}

		MIMEType( const MIMEType &m )
			: m_type( m.m_type ), m_subtype( m.m_subtype )
		{
		}

		// Splits s at its first '/': the text before it becomes the type,
		// everything after it the subtype. When s holds no '/' (or is a
		// null pointer) the object becomes a copy of MIMEType::Null.
		MIMEType( const char *s )
		{
			// Guard the lookup so a null pointer is never dereferenced.
			const char *slash = ( s != NULL ) ? std::strchr( s, '/' ) : NULL;
			if( slash == NULL ) {
				*this = Null;
			} else {
				m_type.assign( s, slash - s );
				m_subtype.assign( slash + 1 );
			}
		}

		MIMEType& operator=( const MIMEType &m )
		{
			if( this != &m ) {
				this->m_type = m.m_type;
				this->m_subtype = m.m_subtype;
			}
			return *this;
		}

		// Returns a copy of the part before the '/'.
		const std::string type( ) const
		{
			return m_type;
		}

		// Returns a copy of the part after the '/'.
		const std::string subtype( ) const
		{
			return m_subtype;
		}

		// Returns "type/subtype", or an empty string when the type is
		// empty. This is the same text operator<< writes; it is built
		// directly so the comparisons below need no temporary stream.
		std::string str( ) const
		{
			if( m_type.empty( ) ) {
				return std::string( );
			}
			return m_type + "/" + m_subtype;
		}

		// Only declared here; the definition is not part of this header.
		static MIMEType Null;

		// All comparisons work on str(). As a consequence two objects with
		// an empty type compare equal whatever their subtypes are.
		bool operator!=( const MIMEType &other ) const
		{
			return( str( ) != other.str( ) );
		}

		bool operator==( const MIMEType &other ) const
		{
			return( str( ) == other.str( ) );
		}

		bool operator<( const MIMEType &other ) const
		{
			return( str( ) < other.str( ) );
		}

		template< typename CharT, typename TraitsT > friend
		std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream< CharT, TraitsT > &s, const MIMEType& m );
};

// Writes "type/subtype" to s; writes nothing when the type is empty.
template< typename CharT, typename TraitsT >
inline std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream< CharT, TraitsT > &s, const MIMEType &m )
{
	if( m.type( ).empty( ) ) {
		return s;
	}

	s << m.type( ) << "/" << m.subtype( );

	return s;
}

#endif
// diff --git a/include/crawler/Processor.hpp b/include/crawler/Processor.hpp
// new file mode
100644 index 0000000..bc17ec0 --- /dev/null +++ b/include/crawler/Processor.hpp @@ -0,0 +1,13 @@ +#ifndef __PROCESSOR_H +#define __PROCESSOR_H + +#include "RewindInputStream.hpp" + +class Processor { + public: + virtual ~Processor( ) { } + + virtual void process( RewindInputStream *s ) = 0; +}; + +#endif diff --git a/include/crawler/RewindInputStream.hpp b/include/crawler/RewindInputStream.hpp new file mode 100755 index 0000000..6bbe80c --- /dev/null +++ b/include/crawler/RewindInputStream.hpp @@ -0,0 +1,32 @@ +#ifndef __REWIND_INPUT_STREAM_H +#define __REWIND_INPUT_STREAM_H + +#include "CrawlerExportable.hpp" +#include "URL.hpp" + +#include +#include + +class RewindInputStream : public std::istream { + public: + const URL getBaseUrl( ) const + { + return m_baseUrl; + } + + CRAWLER_DLL_VISIBLE virtual void rewind( ) = 0; + + virtual std::string lastErrMsg( ) const = 0; + + protected: + + RewindInputStream( const URL &url ) + : std::istream( 0 ), m_baseUrl( url ) + { + } + + private: + URL m_baseUrl; +}; + +#endif diff --git a/include/crawler/SpoolRewindInputStream.hpp b/include/crawler/SpoolRewindInputStream.hpp new file mode 100755 index 0000000..f065271 --- /dev/null +++ b/include/crawler/SpoolRewindInputStream.hpp @@ -0,0 +1,51 @@ +#ifndef __SPOOLREWINDINPUTSTREAM_H +#define __SPOOLREWINDINPUTSTREAM_H + +#include "RewindInputStream.hpp" + +#include +#include +#include + +class spool_streambuf : public std::streambuf +{ + public: + explicit CRAWLER_DLL_VISIBLE spool_streambuf( size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 ); + + CRAWLER_DLL_VISIBLE ~spool_streambuf( ); + + CRAWLER_DLL_VISIBLE void rewind( ); + + protected: + CRAWLER_DLL_VISIBLE virtual std::streambuf::int_type readFromSource( ) = 0; + + private: + CRAWLER_DLL_VISIBLE int_type underflow( ); + + private: + const size_t m_putBack; + std::vector m_spoolBuf; + size_t m_spoolBufPos; + size_t m_spoolBufSize; + std::fstream m_spoolFile; + enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 
2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state; + + protected: + std::vector m_buf; + char *m_base; + char *m_start; +}; + +class SpoolRewindInputStream : public RewindInputStream +{ + public: + CRAWLER_DLL_VISIBLE SpoolRewindInputStream( const URL &url ); + CRAWLER_DLL_VISIBLE virtual ~SpoolRewindInputStream( ); + + CRAWLER_DLL_VISIBLE virtual void rewind( ); + + protected: + spool_streambuf *m_buf; +}; + +#endif diff --git a/include/crawler/URL.hpp b/include/crawler/URL.hpp new file mode 100755 index 0000000..255a2db --- /dev/null +++ b/include/crawler/URL.hpp @@ -0,0 +1,140 @@ +#ifndef __URL_H +#define __URL_H + +#include "CrawlerExportable.hpp" + +#include +#include +#include + +using namespace std; + +class URL { + protected: + string m_protocol; + string m_host; + unsigned short m_port; + string m_path; + string m_query; + string m_fragment; + + public: + URL( ) + : m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_query( "" ), m_fragment( "" ) + { + } + + URL( const URL& url ) + : m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_query( url.m_query ), m_fragment( url.m_fragment ) + { + } + + URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _query, const std::string _fragment ) + : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_query( _query ), m_fragment( _fragment ) + { + } + + URL& operator=( const URL& u ) { + if( this != &u ) { + this->m_protocol = u.m_protocol; + this->m_port = u.m_port; + this->m_host = u.m_host; + this->m_path = u.m_path; + this->m_query = u.m_query; + this->m_fragment = u.m_fragment; + } + return *this; + } + + const string protocol( ) const + { + return m_protocol; + } + + const string host( ) const + { + return m_host; + } + + unsigned short port( ) const + { + return m_port; + } + + const string path( ) const + { + return m_path; + } + + const string 
query( ) const + { + return m_query; + } + + std::string fragment( ) const + { + return m_fragment; + } + + std::string str( ) const + { + std::ostringstream os; + os << *this; + return os.str( ); + } + + static URL CRAWLER_DLL_VISIBLE Null; + + bool operator!=( const URL &other ) const + { + return( str( ) != other.str( ) ); + } + + bool operator==( const URL &other ) const + { + return( str( ) == other.str( ) ); + } + + bool operator<( const URL &other ) const + { + return( str( ) < other.str( ) ); + } + + template< typename CharT, typename TraitsT > friend + basic_ostream& operator<<( basic_ostream&s, const URL& u ); + + static unsigned short defaultPort( const std::string p ) + { + if( p == "http" ) return 80; + else if( p == "https" ) return 443; + else if( p == "ftp" ) return 21; + else return 0; + } +}; + +template< typename CharT, typename TraitsT > +inline basic_ostream& operator<<( basic_ostream&s, const URL& u ) { + if( u.protocol( ).empty( ) ) { + return s; + } + + s << u.protocol( ) << "://" << u.host( ); + + if( u.port( ) != URL::defaultPort( u.protocol( ) ) ) { + s << ":" << u.port( ); + } + + s << u.path( ); + + if( !u.query( ).empty( ) ) { + s << "?" 
<< u.query( ); + } + + if( !u.fragment( ).empty( ) ) { + s << "#" << u.fragment( ); + } + + return s; +} + +#endif diff --git a/include/crawler/URLFilter.hpp b/include/crawler/URLFilter.hpp new file mode 100644 index 0000000..2136009 --- /dev/null +++ b/include/crawler/URLFilter.hpp @@ -0,0 +1,14 @@ +#ifndef __URLFILTER_H +#define __URLFILTER_H + +#include "URL.hpp" + +class URLFilter +{ + public: + virtual ~URLFilter( ) { }; + + virtual bool filter( const URL url ) = 0; +}; + +#endif diff --git a/include/crawler/URLNormalizer.hpp b/include/crawler/URLNormalizer.hpp new file mode 100644 index 0000000..af1781a --- /dev/null +++ b/include/crawler/URLNormalizer.hpp @@ -0,0 +1,17 @@ +#ifndef __URLNORMALIZER_H +#define __URLNORMALIZER_H + +#include + +#include "URL.hpp" + +class URLNormalizer { + public: + virtual ~URLNormalizer( ) { }; + + virtual URL parseUrl( const std::string s ) = 0; + + virtual URL normalize( const URL url, const std::string s ) = 0; +}; + +#endif diff --git a/include/crawler/URLSeen.hpp b/include/crawler/URLSeen.hpp new file mode 100644 index 0000000..742c863 --- /dev/null +++ b/include/crawler/URLSeen.hpp @@ -0,0 +1,12 @@ +#ifndef __URLSEEN_H +#define __URLSEEN_H + +#include "URL.hpp" + +class URLSeen { + public: + virtual ~URLSeen( ) { }; + virtual bool seen( const URL url ) = 0; +}; + +#endif diff --git a/include/crawler/win32/errormsg.hpp b/include/crawler/win32/errormsg.hpp new file mode 100755 index 0000000..f1ceb93 --- /dev/null +++ b/include/crawler/win32/errormsg.hpp @@ -0,0 +1,10 @@ +#ifndef __ERRORMSG_H +#define __ERRORMSG_H + +#include + +#include "CrawlerExportable.hpp" + +CRAWLER_DLL_VISIBLE std::string getLastError( ); + +#endif diff --git a/include/crawler/win32/stringutils.hpp b/include/crawler/win32/stringutils.hpp new file mode 100755 index 0000000..b0404cd --- /dev/null +++ b/include/crawler/win32/stringutils.hpp @@ -0,0 +1,10 @@ +#ifndef __STRINGUTILS_H +#define __STRINGUTILS_H + +#include + +#include "CrawlerExportable.hpp" 
+ +CRAWLER_DLL_VISIBLE std::wstring s2ws( const std::string &s ); + +#endif -- cgit v1.2.3-54-g00ecf