diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-17 16:32:44 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-17 16:32:44 +0200 |
commit | c0b159e9f992c70921eb5ca4c0f0f6d448cb9a65 (patch) | |
tree | c3795373d68b48d1bd6f334289e131dd3731d80f | |
parent | 9de77404b9fea20dc6a43b6514fa6d3f534c5070 (diff) | |
download | crawler-c0b159e9f992c70921eb5ca4c0f0f6d448cb9a65.tar.gz crawler-c0b159e9f992c70921eb5ca4c0f0f6d448cb9a65.tar.bz2 |
added a common base class for spooling rewind input stream, adapted
libfetch rewind input stream to use that one
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | TODOS | 2 | ||||
-rw-r--r-- | docs/LINKS | 1 | ||||
-rwxr-xr-x | src/GNUmakefile | 3 | ||||
-rwxr-xr-x | src/Makefile.W32 | 3 | ||||
-rw-r--r-- | src/SpoolRewindInputStream.cpp | 181 | ||||
-rwxr-xr-x | src/SpoolRewindInputStream.hpp | 51 | ||||
-rw-r--r-- | src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp | 189 | ||||
-rwxr-xr-x | src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp | 10 |
9 files changed, 257 insertions, 185 deletions
@@ -21,6 +21,6 @@ tests/*/test1 tests/*/test2 tests/*/test3 -src/crawlingwolf +src/crawler makefiles/gmake/platform.mk.vars makefiles/gmake/platform.vars @@ -2,4 +2,6 @@ loadable modules on Windows, example is the logger singleton currently - common spooling code in RewindInputStream must be extracted and used in a cascade of streams (or streambufs?) +- use traits in rewindinputstream, alternative wrappers for char/string + traits depending on underlying io stream @@ -67,4 +67,5 @@ network access libraries libCurl libfetch WinHTTP +http://www.pcs.cnu.edu/~dgame/sockets/socketsC++/sockets.html diff --git a/src/GNUmakefile b/src/GNUmakefile index 0fad567..9cc0f48 100755 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -25,7 +25,8 @@ endif LOCAL_STATIC_LIB_OBJS = \ URL.o \ - MIMEType.o + MIMEType.o \ + SpoolRewindInputStream.o CPP_OBJS = \ $(LOCAL_STATIC_LIB_OBJS) diff --git a/src/Makefile.W32 b/src/Makefile.W32 index 6231f11..5ce3b11 100755 --- a/src/Makefile.W32 +++ b/src/Makefile.W32 @@ -18,7 +18,8 @@ LOCAL_STATIC_LIB_OBJS = \ win32\errormsg.obj \ win32\stringutils.obj \ URL.obj \ - MIMEType.obj + MIMEType.obj \ + SpoolRewindInputStream.obj LOCAL_STATIC_LIB = \ crawler.lib diff --git a/src/SpoolRewindInputStream.cpp b/src/SpoolRewindInputStream.cpp new file mode 100644 index 0000000..92edc4b --- /dev/null +++ b/src/SpoolRewindInputStream.cpp @@ -0,0 +1,181 @@ +#include "SpoolRewindInputStream.hpp" +#include "Logger.hpp" + +#include <algorithm> +#include <cstring> +#include <cassert> + +using namespace std; + +spool_streambuf::spool_streambuf( size_t bufSize, size_t putBack, size_t spoolBufSize ) + : m_putBack( max( putBack, size_t( 1 ) ) ), + m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ), + m_spoolBufSize( 0 ), m_state( TO_SPOOL_MEMORY ), + m_buf( max( bufSize, putBack ) + putBack ), + m_base( 0 ), m_start( 0 ) +{ + char *end = &m_buf.front( ) + m_buf.size( ); + setg( end, end, end ); +} + +spool_streambuf::~spool_streambuf( ) +{ + switch( m_state ) { + case TO_SPOOL_MEMORY: + case FROM_SPOOL_MEMORY: + // memory only, nothing to clean up + break; + + case TO_SPOOL_FILE: + case FROM_SPOOL_FILE: + m_spoolFile.close( ); + (void)remove( "/tmp/spool.tmp" ); + break; + } +} + +streambuf::int_type spool_streambuf::underflow( ) +{ + // check if buffer is exhausted, if not, return current character + if( gptr( ) < egptr( ) ) + return traits_type::to_int_type( *gptr( ) ); + + m_base = &m_buf.front( ); + m_start = m_base; + + // move put back away + if( eback( ) == m_base ) { + memmove( m_base, egptr( ) - m_putBack, m_putBack ); + m_start += m_putBack; + } + + // read from source or spool (depends on calling rewind) + streambuf::int_type n; + switch( m_state ) { + case TO_SPOOL_MEMORY: + case TO_SPOOL_FILE: + n = readFromSource( ); + if( n == 0 ) { + return traits_type::eof( ); + } else if( n < 0 ) { + // TODO handle error + } + + if( m_state == TO_SPOOL_MEMORY ) { + // as long we can "spool" to memory, do so.. + if( m_spoolBufSize + n <= m_spoolBuf.size( ) ) { + m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, m_start, m_start + n ); + m_spoolBufPos += n; + m_spoolBufSize += n; + } else { + // ..otherwise start spooling to disk, write + // current memory spool buffer first.. + LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBuf.size( ) << ")"; + m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc ); + assert( m_spoolFile.good( ) ); + m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize ); + assert( m_spoolFile.good( ) ); + m_state = TO_SPOOL_FILE; + m_spoolFile.write( m_start, n ); + assert( m_spoolFile.good( ) ); + } + } else { + // we are appending to the spool file + assert( m_spoolFile.good( ) ); + m_spoolFile.write( m_start, n ); + assert( m_spoolFile.good( ) ); + } + + break; + + case FROM_SPOOL_MEMORY: + n = min( m_buf.size( ) - ( m_start - m_base ), m_spoolBufSize - m_spoolBufPos ); + if( n == 0 ) { + return traits_type::eof( ); + } + + copy( m_spoolBuf.begin( ) + m_spoolBufPos, + m_spoolBuf.begin( ) + m_spoolBufPos + n, + m_buf.begin( ) + ( m_start - m_base ) ); + + m_spoolBufPos += n; + + break; + + case FROM_SPOOL_FILE: + + n = min( m_buf.size( ) - ( m_start - m_base ), m_spoolBufSize - m_spoolBufPos ); + m_spoolFile.read( m_start, n ); + m_spoolBufPos += n; + if( m_spoolBufPos > m_spoolBufSize ) { + return traits_type::eof( ); + } + if( n == 0 || m_spoolFile.eof( ) ) { + return traits_type::eof( ); + } + + break; + } + + // set pointers + setg( m_base, m_start, m_start + n ); + + return traits_type::to_int_type( *gptr( ) ); +} + +void spool_streambuf::rewind( ) +{ + switch( m_state ) { + case TO_SPOOL_MEMORY: + m_spoolBufPos = 0; + m_state = FROM_SPOOL_MEMORY; + break; + + case TO_SPOOL_FILE: + m_spoolFile.close( ); + m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::in ); + m_spoolFile.seekg( 0, ios::end ); + m_spoolBufSize = m_spoolFile.tellg( ); + m_spoolFile.seekg( 0, ios::beg ); + m_spoolBufPos = 0; + m_state = FROM_SPOOL_FILE; + break; + + case FROM_SPOOL_MEMORY: + m_spoolBufPos = 0; + break; + + case FROM_SPOOL_FILE: + m_spoolBufPos = 0; + m_spoolFile.seekg( 0, ios::beg ); + break; + } + + char *end = &m_buf.front( ) + m_buf.size( ); + setg( end, end, end ); + pubseekpos( 0, ios_base::in ); +} + +SpoolRewindInputStream::SpoolRewindInputStream( const URL &url ) + : RewindInputStream( url ), m_buf( 0 ) +{ +} + +SpoolRewindInputStream::~SpoolRewindInputStream( ) +{ +} + +void SpoolRewindInputStream::rewind( ) +{ + // consume rest of web request, force spooling in streambuf + enum { CHUNKSIZE = 1024 }; + char buf[CHUNKSIZE]; + + while( good( ) && !eof( ) ) { + read( buf, CHUNKSIZE ); + } + + ios::clear( ); + assert( m_buf != 0 ); + m_buf->rewind( ); +} diff --git a/src/SpoolRewindInputStream.hpp b/src/SpoolRewindInputStream.hpp new file mode 100755 index 0000000..aff593d --- /dev/null +++ b/src/SpoolRewindInputStream.hpp @@ -0,0 +1,51 @@ +#ifndef __SPOOLREWINDINPUTSTREAM_H +#define __SPOOLREWINDINPUTSTREAM_H + +#include "RewindInputStream.hpp" + +#include <iostream> +#include <fstream> +#include <vector> + +class spool_streambuf : public std::streambuf +{ + public: + explicit spool_streambuf( size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 ); + + ~spool_streambuf( ); + + void rewind( ); + + protected: + virtual std::streambuf::int_type readFromSource( ) = 0; + + private: + int_type underflow( ); + + private: + const size_t m_putBack; + std::vector<char> m_spoolBuf; + size_t m_spoolBufPos; + size_t m_spoolBufSize; + std::fstream m_spoolFile; + enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state; + + protected: + std::vector<char> m_buf; + char *m_base; + char *m_start; +}; + +class SpoolRewindInputStream : public RewindInputStream +{ + public: + SpoolRewindInputStream( const URL &url ); + virtual ~SpoolRewindInputStream( ); + + virtual void rewind( ); + + protected: + spool_streambuf *m_buf; +}; + +#endif diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp index 19c0f60..ee606fe 100644 --- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp +++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp @@ -1,192 +1,48 @@ #include "LibFetchRewindInputStream.hpp" #include "Logger.hpp" +#include "SpoolRewindInputStream.hpp" -#include <streambuf> -#include <vector> -#include <algorithm> #include <string> -#include <cstring> #include <stdexcept> -#include <cassert> -#include <fstream> #include <cstring> #include <errno.h> using namespace std; -class libfetch_buffer : public streambuf +class libfetch_buffer : public spool_streambuf { public: + explicit libfetch_buffer( fetchIO *io, size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 ); - - ~libfetch_buffer( ); - - void rewind( ); - private: - int_type underflow( ); + protected: + virtual streambuf::int_type readFromSource( ); + private: fetchIO *m_io; - const size_t m_putBack; - vector<char> m_buf; - vector<char> m_spoolBuf; - size_t m_spoolBufPos; - size_t m_spoolBufSize; - fstream m_spoolFile; - enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state; }; libfetch_buffer::libfetch_buffer( fetchIO *io, size_t bufSize, size_t putBack, size_t spoolBufSize ) - : m_io( io ), m_putBack( max( putBack, size_t( 1 ) ) ), - m_buf( max( bufSize, putBack ) + putBack ), - m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ), - m_spoolBufSize( 0 ), m_state( TO_SPOOL_MEMORY ) + : spool_streambuf( bufSize, putBack, spoolBufSize ), m_io( io ) { - char *end = &m_buf.front( ) + m_buf.size( ); - setg( end, end, end ); -} - -libfetch_buffer::~libfetch_buffer( ) -{ - switch( m_state ) { - case TO_SPOOL_MEMORY: - case FROM_SPOOL_MEMORY: - // memory only, nothing to clean up - break; - - case TO_SPOOL_FILE: - case FROM_SPOOL_FILE: - m_spoolFile.close( ); - (void)remove( "/tmp/spool.tmp" ); - break; - } } -streambuf::int_type libfetch_buffer::underflow( ) +streambuf::int_type libfetch_buffer::readFromSource( ) { - // check if buffer is exhausted, if not, return current character - if( gptr( ) < egptr( ) ) - return traits_type::to_int_type( *gptr( ) ); - - char *base = &m_buf.front( ); - char *start = base; - - // move put back away - if( eback( ) == base ) { - memmove( base, egptr( ) - m_putBack, m_putBack ); - start += m_putBack; - } - - // read from source or spool (depends on calling rewind) ssize_t n; - switch( m_state ) { - case TO_SPOOL_MEMORY: - case TO_SPOOL_FILE: - n = fetchIO_read( m_io, start, m_buf.size( ) - ( start - base ) ); - if( n == 0 ) { - return traits_type::eof( ); - } else if( n < 0 ) { - // TODO handle error - } - - if( m_state == TO_SPOOL_MEMORY ) { - // as long we can "spool" to memory, do so.. - if( m_spoolBufSize + n <= m_spoolBuf.size( ) ) { - m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, start, start + n ); - m_spoolBufPos += n; - m_spoolBufSize += n; - } else { - // ..otherwise start spooling to disk, write - // current memory spool buffer first.. - LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBuf.size( ) << ")"; - m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc ); - assert( m_spoolFile.good( ) ); - m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize ); - assert( m_spoolFile.good( ) ); - m_state = TO_SPOOL_FILE; - m_spoolFile.write( start, n ); - assert( m_spoolFile.good( ) ); - } - } else { - // we are appending to the spool file - assert( m_spoolFile.good( ) ); - m_spoolFile.write( start, n ); - assert( m_spoolFile.good( ) ); - } - - break; - - case FROM_SPOOL_MEMORY: - n = min( m_buf.size( ) - ( start - base ), m_spoolBufSize - m_spoolBufPos ); - if( n == 0 ) { - return traits_type::eof( ); - } - - copy( m_spoolBuf.begin( ) + m_spoolBufPos, - m_spoolBuf.begin( ) + m_spoolBufPos + n, - m_buf.begin( ) + ( start - base ) ); - - m_spoolBufPos += n; - - break; - case FROM_SPOOL_FILE: - - n = min( m_buf.size( ) - ( start - base ), m_spoolBufSize - m_spoolBufPos ); - m_spoolFile.read( start, n ); - m_spoolBufPos += n; - if( m_spoolBufPos > m_spoolBufSize ) { - return traits_type::eof( ); - } - if( n == 0 || m_spoolFile.eof( ) ) { - return traits_type::eof( ); - } - - break; - } - - // set pointers - setg( base, start, start + n ); - - return traits_type::to_int_type( *gptr( ) ); -} - -void libfetch_buffer::rewind( ) -{ - switch( m_state ) { - case TO_SPOOL_MEMORY: - m_spoolBufPos = 0; - m_state = FROM_SPOOL_MEMORY; - break; - - case TO_SPOOL_FILE: - m_spoolFile.close( ); - m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::in ); - m_spoolFile.seekg( 0, ios::end ); - m_spoolBufSize = m_spoolFile.tellg( ); - m_spoolFile.seekg( 0, ios::beg ); - m_spoolBufPos = 0; - m_state = FROM_SPOOL_FILE; - break; - - case FROM_SPOOL_MEMORY: - m_spoolBufPos = 0; - break; - - case FROM_SPOOL_FILE: - m_spoolBufPos = 0; - m_spoolFile.seekg( 0, ios::beg ); - break; + n = fetchIO_read( m_io, m_start, m_buf.size( ) - ( m_start - m_base ) ); + if( n == 0 ) { + return traits_type::eof( ); + } else if( n < 0 ) { + // TODO handle error } - - char *end = &m_buf.front( ) + m_buf.size( ); - setg( end, end, end ); - pubseekpos( 0, ios_base::in ); + return n; } LibFetchRewindInputStream::LibFetchRewindInputStream( const URL &url ) - : RewindInputStream( url ), m_io( 0 ), m_buf( 0 ) + : SpoolRewindInputStream( url ), m_io( 0 ) { m_io = fetchGetURL( url.str( ).c_str( ), "" ); if( m_io == NULL ) { @@ -203,21 +59,6 @@ LibFetchRewindInputStream::~LibFetchRewindInputStream( ) if( m_io ) fetchIO_close( m_io ); } -void LibFetchRewindInputStream::rewind( ) -{ - // consume rest of web request, force spooling in streambuf - enum { CHUNKSIZE = 1024 }; - char buf[CHUNKSIZE]; - - while( good( ) && !eof( ) ) { - read( buf, CHUNKSIZE ); - } - - ios::clear( ); - assert( m_buf != 0 ); - m_buf->rewind( ); -} - string LibFetchRewindInputStream::lastErrMsg( ) const { return fetchLastErrString; diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp index 099c0ae..9347b77 100755 --- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp +++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp @@ -1,26 +1,20 @@ #ifndef __LIBFETCH_REWIND_INPUT_STREAM_H #define __LIBFETCH_REWIND_INPUT_STREAM_H -#include "RewindInputStream.hpp" -#include "URL.hpp" +#include "SpoolRewindInputStream.hpp" #include "fetch.h" -class libfetch_buffer; - -class LibFetchRewindInputStream : public RewindInputStream +class LibFetchRewindInputStream : public SpoolRewindInputStream { public: LibFetchRewindInputStream( const URL &url ); virtual ~LibFetchRewindInputStream( ); - virtual void rewind( ); - virtual std::string lastErrMsg( ) const; private: fetchIO *m_io; - libfetch_buffer *m_buf; }; #endif |