summaryrefslogtreecommitdiff
path: root/include/crawler
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-09-06 22:18:23 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-09-06 22:18:23 +0200
commit13fc9a7da5111f4ddba942d3c6b6b8654ce395d6 (patch)
treee86210e3d939911e35f930a6dc73c3ebb591243b /include/crawler
parentf5c586f7231f7e033c5528bcefea357e4e64441c (diff)
downloadcrawler-13fc9a7da5111f4ddba942d3c6b6b8654ce395d6.tar.gz
crawler-13fc9a7da5111f4ddba942d3c6b6b8654ce395d6.tar.bz2
more splitting into libcrawl, crawl binary
moved more public headers to 'include'; changed approach for dynamic linking on Windows
Diffstat (limited to 'include/crawler')
-rwxr-xr-xinclude/crawler/CrawlerExportable.hpp26
-rw-r--r--include/crawler/DNSResolver.hpp7
-rw-r--r--include/crawler/Deduper.hpp15
-rwxr-xr-xinclude/crawler/Fetcher.hpp15
-rw-r--r--include/crawler/Frontier.hpp16
-rw-r--r--include/crawler/MIMEType.hpp100
-rw-r--r--include/crawler/Processor.hpp13
-rwxr-xr-xinclude/crawler/RewindInputStream.hpp32
-rwxr-xr-xinclude/crawler/SpoolRewindInputStream.hpp51
-rwxr-xr-xinclude/crawler/URL.hpp140
-rw-r--r--include/crawler/URLFilter.hpp14
-rw-r--r--include/crawler/URLNormalizer.hpp17
-rw-r--r--include/crawler/URLSeen.hpp12
-rwxr-xr-xinclude/crawler/win32/errormsg.hpp10
-rwxr-xr-xinclude/crawler/win32/stringutils.hpp10
15 files changed, 478 insertions, 0 deletions
diff --git a/include/crawler/CrawlerExportable.hpp b/include/crawler/CrawlerExportable.hpp
new file mode 100755
index 0000000..5b89108
--- /dev/null
+++ b/include/crawler/CrawlerExportable.hpp
@@ -0,0 +1,26 @@
+#ifndef __CRAWLER_EXPORTABLE_H
+#define __CRAWLER_EXPORTABLE_H
+
+#ifndef _WIN32
+
+#define CRAWLER_DLL_VISIBLE // _WIN32 not defined: the macro expands to nothing
+
+#else
+
+#ifdef SHARED
+
+#ifdef BUILDING_CRAWLER
+#define CRAWLER_DLL_VISIBLE __declspec(dllexport) // SHARED and BUILDING_CRAWLER defined: mark symbols as exported
+#else
+#define CRAWLER_DLL_VISIBLE __declspec(dllimport) // SHARED without BUILDING_CRAWLER: mark symbols as imported
+#endif // BUILDING_CRAWLER
+
+#else
+
+#define CRAWLER_DLL_VISIBLE // _WIN32 without SHARED: the macro expands to nothing
+
+#endif // SHARED
+
+#endif // _WIN32
+
+#endif // __CRAWLER_EXPORTABLE_H
diff --git a/include/crawler/DNSResolver.hpp b/include/crawler/DNSResolver.hpp
new file mode 100644
index 0000000..8f79734
--- /dev/null
+++ b/include/crawler/DNSResolver.hpp
@@ -0,0 +1,7 @@
+#ifndef __DNSRESOLVER_H
+#define __DNSRESOLVER_H
+
+class DNSResolver { // empty class: declares no members in this header
+};
+
+#endif
diff --git a/include/crawler/Deduper.hpp b/include/crawler/Deduper.hpp
new file mode 100644
index 0000000..3cb33c1
--- /dev/null
+++ b/include/crawler/Deduper.hpp
@@ -0,0 +1,15 @@
+#ifndef __DEDUPER_H
+#define __DEDUPER_H
+
+#include "URL.hpp"
+#include "RewindInputStream.hpp"
+
+class Deduper // abstract interface: its only operation, contentSeen(), is pure virtual
+{
+ public:
+ virtual ~Deduper( ) { }; // virtual destructor with an empty body
+
+ virtual bool contentSeen( const URL url, RewindInputStream *s ) = 0; // takes the URL by value and the stream by pointer; presumably true when the content was already seen - confirm with implementations
+};
+
+#endif
diff --git a/include/crawler/Fetcher.hpp b/include/crawler/Fetcher.hpp
new file mode 100755
index 0000000..40f1c7a
--- /dev/null
+++ b/include/crawler/Fetcher.hpp
@@ -0,0 +1,15 @@
+#ifndef __FETCHER_H
+#define __FETCHER_H
+
+#include "URL.hpp"
+#include "RewindInputStream.hpp"
+
+class Fetcher // abstract interface: its only operation, fetch(), is pure virtual
+{
+ public:
+ virtual ~Fetcher( ) { }; // virtual destructor with an empty body
+
+ virtual RewindInputStream *fetch( const URL url ) = 0; // takes the URL by value, returns a stream pointer; NOTE(review): ownership of the returned stream is not stated here - confirm
+};
+
+#endif
diff --git a/include/crawler/Frontier.hpp b/include/crawler/Frontier.hpp
new file mode 100644
index 0000000..54c0dd6
--- /dev/null
+++ b/include/crawler/Frontier.hpp
@@ -0,0 +1,16 @@
+#ifndef __FRONTIER_H
+#define __FRONTIER_H
+
+#include "URL.hpp"
+
+class Frontier // abstract interface with two pure virtual operations: getNextUrl() and addUrl()
+{
+ public:
+ virtual ~Frontier( ) { }; // virtual destructor with an empty body
+
+ virtual URL getNextUrl( ) = 0; // returns a URL by value; NOTE(review): behavior when no URL is available is not visible here
+
+ virtual void addUrl( const URL url ) = 0; // takes the URL by value
+};
+
+#endif
diff --git a/include/crawler/MIMEType.hpp b/include/crawler/MIMEType.hpp
new file mode 100644
index 0000000..3a628ca
--- /dev/null
+++ b/include/crawler/MIMEType.hpp
@@ -0,0 +1,100 @@
+#ifndef __MIMETYPE_H
+#define __MIMETYPE_H
+
+#include <string>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+
+class MIMEType { // value type storing a MIME type as two strings: type and subtype
+ protected:
+ std::string m_type; // the part written before the '/'
+ std::string m_subtype; // the part written after the '/'
+
+ public:
+ MIMEType( ) // default constructor: both parts are empty strings
+ : m_type( "" ), m_subtype( "" )
+ {
+ }
+
+ MIMEType( const std::string _type, const std::string _subtype ) // builds from explicit type and subtype strings
+ : m_type( _type ), m_subtype( _subtype )
+ {
+ }
+
+ MIMEType( const MIMEType &m ) // copy constructor: member-wise copy
+ : m_type( m.m_type ), m_subtype( m.m_subtype )
+ {
+ }
+
+ MIMEType( const char *s ) // parses a C string by splitting at the first '/'; not explicit, so it also acts as an implicit conversion
+ {
+ const char *pos;
+ if( ( pos = strchr( s, '/' ) ) == NULL ) {
+ *this = Null; // no '/' in s: take the value of the static Null object
+ } else {
+ m_type = std::string( s, 0, pos - s ); // characters before the '/'
+ m_subtype = std::string( s, pos - s + 1, strlen( s ) - ( pos - s + 1 ) ); // characters after the '/' up to the end of s
+ }
+ }
+
+ MIMEType& operator=( const MIMEType &m ) // copy assignment; skips the copy on self-assignment
+ {
+ if( this != &m ) {
+ this->m_type = m.m_type;
+ this->m_subtype = m.m_subtype;
+ }
+ return *this;
+ }
+
+ const std::string type( ) const // returns a copy of the type part
+ {
+ return m_type;
+ }
+
+ const std::string subtype( ) const // returns a copy of the subtype part
+ {
+ return m_subtype;
+ }
+
+ std::string str( ) const // string form, produced by streaming *this through operator<<
+ {
+ std::ostringstream os;
+ os << *this;
+ return os.str( );
+ }
+
+ static MIMEType Null; // declaration only; the definition is not in this header
+
+ bool operator!=( const MIMEType &other ) const // compares the str( ) forms
+ {
+ return( str( ) != other.str( ) );
+ }
+
+ bool operator==( const MIMEType &other ) const // compares the str( ) forms
+ {
+ return( str( ) == other.str( ) );
+ }
+
+ bool operator<( const MIMEType &other ) const // std::string ordering of the str( ) forms
+ {
+ return( str( ) < other.str( ) );
+ }
+
+ template< typename CharT, typename TraitsT > friend
+ std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream<CharT, TraitsT>&s, const MIMEType& m ); // friend declaration of the stream inserter
+};
+
+template< typename CharT, typename TraitsT >
+inline std::basic_ostream< CharT, TraitsT >& operator<<( std::basic_ostream< CharT, TraitsT > &s, const MIMEType &m ) // writes m as "type/subtype" to s and returns s
+{
+ if( m.type( ).empty( ) ) {
+ return s; // empty type: nothing is written
+ }
+
+ s << m.type( ) << "/" << m.subtype( );
+
+ return s;
+}
+
+#endif
diff --git a/include/crawler/Processor.hpp b/include/crawler/Processor.hpp
new file mode 100644
index 0000000..bc17ec0
--- /dev/null
+++ b/include/crawler/Processor.hpp
@@ -0,0 +1,13 @@
+#ifndef __PROCESSOR_H
+#define __PROCESSOR_H
+
+#include "RewindInputStream.hpp"
+
+class Processor { // abstract interface: its only operation, process(), is pure virtual
+ public:
+ virtual ~Processor( ) { } // virtual destructor with an empty body
+
+ virtual void process( RewindInputStream *s ) = 0; // receives the stream by pointer; what is done with it is up to implementations
+};
+
+#endif
diff --git a/include/crawler/RewindInputStream.hpp b/include/crawler/RewindInputStream.hpp
new file mode 100755
index 0000000..6bbe80c
--- /dev/null
+++ b/include/crawler/RewindInputStream.hpp
@@ -0,0 +1,32 @@
+#ifndef __REWIND_INPUT_STREAM_H
+#define __REWIND_INPUT_STREAM_H
+
+#include "CrawlerExportable.hpp"
+#include "URL.hpp"
+
+#include <iostream>
+#include <string>
+
+class RewindInputStream : public std::istream { // abstract std::istream that also stores a base URL and declares rewind()
+ public:
+ const URL getBaseUrl( ) const // returns a copy of the URL given to the constructor
+ {
+ return m_baseUrl;
+ }
+
+ CRAWLER_DLL_VISIBLE virtual void rewind( ) = 0; // pure virtual; provided by derived classes
+
+ virtual std::string lastErrMsg( ) const = 0; // pure virtual; presumably a description of the most recent error - confirm with implementations
+
+ protected:
+
+ RewindInputStream( const URL &url ) // protected: only derived classes can construct; the std::istream base is given a null stream buffer
+ : std::istream( 0 ), m_baseUrl( url )
+ {
+ }
+
+ private:
+ URL m_baseUrl; // copy of the constructor argument, returned by getBaseUrl( )
+};
+
+#endif
diff --git a/include/crawler/SpoolRewindInputStream.hpp b/include/crawler/SpoolRewindInputStream.hpp
new file mode 100755
index 0000000..f065271
--- /dev/null
+++ b/include/crawler/SpoolRewindInputStream.hpp
@@ -0,0 +1,51 @@
+#ifndef __SPOOLREWINDINPUTSTREAM_H
+#define __SPOOLREWINDINPUTSTREAM_H
+
+#include "RewindInputStream.hpp"
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+class spool_streambuf : public std::streambuf // abstract stream buffer; derived classes supply readFromSource(); member bodies are not in this header
+{
+ public:
+ explicit CRAWLER_DLL_VISIBLE spool_streambuf( size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 ); // all three sizes have defaults; NOTE(review): units are presumably bytes - confirm in the implementation
+
+ CRAWLER_DLL_VISIBLE ~spool_streambuf( );
+
+ CRAWLER_DLL_VISIBLE void rewind( ); // defined elsewhere; presumably restarts reading from the spooled data - confirm
+
+ protected:
+ CRAWLER_DLL_VISIBLE virtual std::streambuf::int_type readFromSource( ) = 0; // pure virtual hook implemented by derived classes
+
+ private:
+ CRAWLER_DLL_VISIBLE int_type underflow( ); // same name and signature as the std::streambuf virtual; defined elsewhere
+
+ private:
+ const size_t m_putBack; // presumably set from the putBack constructor argument - confirm
+ std::vector<char> m_spoolBuf; // in-memory spool storage, judging by the name - confirm
+ size_t m_spoolBufPos;
+ size_t m_spoolBufSize;
+ std::fstream m_spoolFile; // file stream, presumably used once spooling goes to disk - confirm
+ enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state; // one of four states; transitions are defined in the implementation
+
+ protected:
+ std::vector<char> m_buf; // protected, so derived classes can access it
+ char *m_base;
+ char *m_start;
+};
+
+class SpoolRewindInputStream : public RewindInputStream // RewindInputStream subclass holding a spool_streambuf pointer; member bodies are not in this header
+{
+ public:
+ CRAWLER_DLL_VISIBLE SpoolRewindInputStream( const URL &url ); // url is presumably forwarded to RewindInputStream as the base URL - confirm
+ CRAWLER_DLL_VISIBLE virtual ~SpoolRewindInputStream( );
+
+ CRAWLER_DLL_VISIBLE virtual void rewind( ); // overrides the pure virtual RewindInputStream::rewind( )
+
+ protected:
+ spool_streambuf *m_buf; // NOTE(review): who allocates and frees this buffer is not visible here - confirm
+};
+
+#endif
diff --git a/include/crawler/URL.hpp b/include/crawler/URL.hpp
new file mode 100755
index 0000000..255a2db
--- /dev/null
+++ b/include/crawler/URL.hpp
@@ -0,0 +1,140 @@
+#ifndef __URL_H
+#define __URL_H
+
+#include "CrawlerExportable.hpp"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+
+using namespace std;
+
+class URL { // value type storing a URL as separate components: protocol, host, port, path, query, fragment
+ protected:
+ string m_protocol;
+ string m_host;
+ unsigned short m_port;
+ string m_path;
+ string m_query;
+ string m_fragment;
+
+ public:
+ URL( ) // default constructor: all strings empty, port 0
+ : m_protocol( "" ), m_host( "" ), m_port( 0 ), m_path( "" ), m_query( "" ), m_fragment( "" )
+ {
+ }
+
+ URL( const URL& url ) // copy constructor: member-wise copy
+ : m_protocol( url.m_protocol ), m_host( url.m_host ), m_port( url.m_port ), m_path( url.m_path ), m_query( url.m_query ), m_fragment( url.m_fragment )
+ {
+ }
+
+ URL( const std::string _protocol, const std::string _host, const unsigned short _port, const std::string _path, const std::string _query, const std::string _fragment ) // builds from explicit components; values are stored as given
+ : m_protocol( _protocol ), m_host( _host ), m_port( _port ), m_path( _path ), m_query( _query ), m_fragment( _fragment )
+ {
+ }
+
+ URL& operator=( const URL& u ) { // copy assignment; skips the copy on self-assignment
+ if( this != &u ) {
+ this->m_protocol = u.m_protocol;
+ this->m_port = u.m_port;
+ this->m_host = u.m_host;
+ this->m_path = u.m_path;
+ this->m_query = u.m_query;
+ this->m_fragment = u.m_fragment;
+ }
+ return *this;
+ }
+
+ const string protocol( ) const // returns a copy of the protocol
+ {
+ return m_protocol;
+ }
+
+ const string host( ) const // returns a copy of the host
+ {
+ return m_host;
+ }
+
+ unsigned short port( ) const // returns the stored port number
+ {
+ return m_port;
+ }
+
+ const string path( ) const // returns a copy of the path
+ {
+ return m_path;
+ }
+
+ const string query( ) const // returns a copy of the query, stored without the '?'
+ {
+ return m_query;
+ }
+
+ std::string fragment( ) const // returns a copy of the fragment, stored without the '#'
+ {
+ return m_fragment;
+ }
+
+ std::string str( ) const // string form, produced by streaming *this through operator<<
+ {
+ std::ostringstream os;
+ os << *this;
+ return os.str( );
+ }
+
+ static URL CRAWLER_DLL_VISIBLE Null; // declaration only; the definition is not in this header
+
+ bool operator!=( const URL &other ) const // compares the str( ) forms
+ {
+ return( str( ) != other.str( ) );
+ }
+
+ bool operator==( const URL &other ) const // compares the str( ) forms
+ {
+ return( str( ) == other.str( ) );
+ }
+
+ bool operator<( const URL &other ) const // std::string ordering of the str( ) forms
+ {
+ return( str( ) < other.str( ) );
+ }
+
+ template< typename CharT, typename TraitsT > friend
+ basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u ); // friend declaration of the stream inserter
+
+ static unsigned short defaultPort( const std::string p ) // maps a protocol name to its default port; exact, case-sensitive string match
+ {
+ if( p == "http" ) return 80;
+ else if( p == "https" ) return 443;
+ else if( p == "ftp" ) return 21;
+ else return 0; // any other protocol: 0
+ }
+};
+
+template< typename CharT, typename TraitsT >
+inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u ) { // writes u in textual form to s and returns s
+ if( u.protocol( ).empty( ) ) {
+ return s; // empty protocol: nothing is written
+ }
+
+ s << u.protocol( ) << "://" << u.host( );
+
+ if( u.port( ) != URL::defaultPort( u.protocol( ) ) ) { // the port is written only when it differs from the protocol's default
+ s << ":" << u.port( );
+ }
+
+ s << u.path( );
+
+ if( !u.query( ).empty( ) ) { // '?' and the query appear only for a non-empty query
+ s << "?" << u.query( );
+ }
+
+ if( !u.fragment( ).empty( ) ) { // '#' and the fragment appear only for a non-empty fragment
+ s << "#" << u.fragment( );
+ }
+
+ return s;
+}
+
+#endif
diff --git a/include/crawler/URLFilter.hpp b/include/crawler/URLFilter.hpp
new file mode 100644
index 0000000..2136009
--- /dev/null
+++ b/include/crawler/URLFilter.hpp
@@ -0,0 +1,14 @@
+#ifndef __URLFILTER_H
+#define __URLFILTER_H
+
+#include "URL.hpp"
+
+class URLFilter // abstract interface: its only operation, filter(), is pure virtual
+{
+ public:
+ virtual ~URLFilter( ) { }; // virtual destructor with an empty body
+
+ virtual bool filter( const URL url ) = 0; // takes the URL by value; NOTE(review): whether true means "accept" or "reject" is not visible here - confirm with implementations
+};
+
+#endif
diff --git a/include/crawler/URLNormalizer.hpp b/include/crawler/URLNormalizer.hpp
new file mode 100644
index 0000000..af1781a
--- /dev/null
+++ b/include/crawler/URLNormalizer.hpp
@@ -0,0 +1,17 @@
+#ifndef __URLNORMALIZER_H
+#define __URLNORMALIZER_H
+
+#include <string>
+
+#include "URL.hpp"
+
+class URLNormalizer { // abstract interface with two pure virtual operations: parseUrl() and normalize()
+ public:
+ virtual ~URLNormalizer( ) { }; // virtual destructor with an empty body
+
+ virtual URL parseUrl( const std::string s ) = 0; // takes a string by value and returns a URL; NOTE(review): the result for unparsable input is not visible here
+
+ virtual URL normalize( const URL url, const std::string s ) = 0; // takes a URL and a string, returns a URL; presumably resolves s against url - confirm with implementations
+};
+
+#endif
diff --git a/include/crawler/URLSeen.hpp b/include/crawler/URLSeen.hpp
new file mode 100644
index 0000000..742c863
--- /dev/null
+++ b/include/crawler/URLSeen.hpp
@@ -0,0 +1,12 @@
+#ifndef __URLSEEN_H
+#define __URLSEEN_H
+
+#include "URL.hpp"
+
+class URLSeen { // abstract interface: its only operation, seen(), is pure virtual
+ public:
+ virtual ~URLSeen( ) { }; // virtual destructor with an empty body
+ virtual bool seen( const URL url ) = 0; // takes the URL by value; presumably true when the URL was encountered before - confirm with implementations
+};
+
+#endif
diff --git a/include/crawler/win32/errormsg.hpp b/include/crawler/win32/errormsg.hpp
new file mode 100755
index 0000000..f1ceb93
--- /dev/null
+++ b/include/crawler/win32/errormsg.hpp
@@ -0,0 +1,10 @@
+#ifndef __ERRORMSG_H
+#define __ERRORMSG_H
+
+#include <string>
+
+#include "CrawlerExportable.hpp"
+
+CRAWLER_DLL_VISIBLE std::string getLastError( ); // declaration only; presumably returns text for the most recent Win32 error - confirm in the implementation
+
+#endif
diff --git a/include/crawler/win32/stringutils.hpp b/include/crawler/win32/stringutils.hpp
new file mode 100755
index 0000000..b0404cd
--- /dev/null
+++ b/include/crawler/win32/stringutils.hpp
@@ -0,0 +1,10 @@
+#ifndef __STRINGUTILS_H
+#define __STRINGUTILS_H
+
+#include <string>
+
+#include "CrawlerExportable.hpp"
+
+CRAWLER_DLL_VISIBLE std::wstring s2ws( const std::string &s ); // declaration only; presumably converts s to a wide string - encoding handling must be confirmed in the implementation
+
+#endif