diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-09 20:42:02 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-09 20:42:02 +0200 |
commit | 4553fe0c3c40eb30648c0e7349f94f1458e5f4b1 (patch) | |
tree | ceb153d7b59891123d714ce9588ec1f2013cd7af /src/modules/fetcher/libfetch | |
parent | 3d8e384820f30633361987301896a0a6a72ce4b0 (diff) | |
download | crawler-4553fe0c3c40eb30648c0e7349f94f1458e5f4b1.tar.gz crawler-4553fe0c3c40eb30648c0e7349f94f1458e5f4b1.tar.bz2 |
added spooling to LibFetchRewindInputStream in order to support rewind
(currently memory spooling only; disk spooling will follow a bit later)
added MIME detection to crawler, parse links only in HTML
Diffstat (limited to 'src/modules/fetcher/libfetch')
-rw-r--r-- | src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp | 85 | ||||
-rw-r--r-- | src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp | 2 |
2 files changed, 78 insertions, 9 deletions
diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp index 4e837c8..22d1434 100644 --- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp +++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp @@ -5,14 +5,17 @@ #include <algorithm> #include <string> #include <cstring> +#include <stdexcept> using namespace std; class libfetch_buffer : public streambuf { public: - explicit libfetch_buffer( fetchIO *io, size_t bufSize = 256, size_t putBack = 1 ); + explicit libfetch_buffer( fetchIO *io, size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 ); + void rewind( ); + private: int_type underflow( ); @@ -20,11 +23,17 @@ class libfetch_buffer : public streambuf fetchIO *m_io; const size_t m_putBack; vector<char> m_buf; + vector<char> m_spoolBuf; + size_t m_spoolBufPos; + size_t m_spoolBufSize; + enum { FROM_SOURCE = 1, FROM_SPOOL_MEMORY = 2 } m_readState; }; -libfetch_buffer::libfetch_buffer( fetchIO *io, size_t bufSize, size_t putBack ) +libfetch_buffer::libfetch_buffer( fetchIO *io, size_t bufSize, size_t putBack, size_t spoolBufSize ) : m_io( io ), m_putBack( max( putBack, size_t( 1 ) ) ), - m_buf( max( bufSize, putBack ) + putBack ) + m_buf( max( bufSize, putBack ) + putBack ), + m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ), + m_spoolBufSize( 0 ), m_readState( FROM_SOURCE ) { char *end = &m_buf.front( ) + m_buf.size( ); setg( end, end, end ); @@ -45,12 +54,39 @@ streambuf::int_type libfetch_buffer::underflow( ) start += m_putBack; } - // read from source - ssize_t n = fetchIO_read( m_io, start, m_buf.size( ) - ( start - base ) ); - if( n == 0 ) { - return traits_type::eof( ); - } else if( n < 0 ) { - // TODO handle error + // read from source or spool (depends on calling rewind) + ssize_t n; + switch( m_readState ) { + case FROM_SOURCE: + n = fetchIO_read( m_io, start, m_buf.size( ) - ( start - base ) ); + if( n == 0 ) { + return traits_type::eof( ); + } 
else if( n < 0 ) { + // TODO handle error + } + + // as long we can "spool" to memory, do so.. + if( m_spoolBufPos + n > m_spoolBuf.size( ) ) { + throw runtime_error( "Memory spool buffer exceeded!" ); + } + m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, start, start + n ); + m_spoolBufSize += n; + + break; + + case FROM_SPOOL_MEMORY: + n = min( m_buf.size( ) - ( start - base ), m_spoolBufSize - m_spoolBufPos ); + if( n == 0 ) { + return traits_type::eof( ); + } + + copy( m_spoolBuf.begin( ) + m_spoolBufPos, + m_spoolBuf.begin( ) + m_spoolBufPos + n, + m_buf.begin( ) + ( start - base ) ); + + m_spoolBufPos += n; + + break; } // set pointers @@ -59,6 +95,23 @@ streambuf::int_type libfetch_buffer::underflow( ) return traits_type::to_int_type( *gptr( ) ); } +void libfetch_buffer::rewind( ) +{ + switch( m_readState ) { + case FROM_SOURCE: + m_readState = FROM_SPOOL_MEMORY; + break; + + case FROM_SPOOL_MEMORY: + break; + } + + char *end = &m_buf.front( ) + m_buf.size( ); + setg( end, end, end ); + m_spoolBufPos = 0; + pubseekpos( 0, ios_base::in ); +} + LibFetchRewindInputStream::LibFetchRewindInputStream( const URL &url ) : RewindInputStream( url ), m_io( 0 ), m_buf( 0 ) { @@ -77,6 +130,20 @@ LibFetchRewindInputStream::~LibFetchRewindInputStream( ) if( m_io ) fetchIO_close( m_io ); } +void LibFetchRewindInputStream::rewind( ) +{ + // consume rest of web request, force spooling in streambuf + enum { CHUNKSIZE = 1024 }; + char buf[CHUNKSIZE]; + + while( good( ) && !eof( ) ) { + read( buf, CHUNKSIZE ); + } + + ios::clear( ); + m_buf->rewind( ); +} + string LibFetchRewindInputStream::lastErrMsg( ) const { return fetchLastErrString; diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp index f1896df..9769561 100644 --- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp +++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp @@ -14,6 +14,8 @@ class 
LibFetchRewindInputStream : public RewindInputStream LibFetchRewindInputStream( const URL &url ); virtual ~LibFetchRewindInputStream( ); + virtual void rewind( ); + string lastErrMsg( ) const; private: |