summaryrefslogtreecommitdiff
path: root/src/modules/fetcher/libfetch
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-08-09 20:42:02 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-08-09 20:42:02 +0200
commit4553fe0c3c40eb30648c0e7349f94f1458e5f4b1 (patch)
treeceb153d7b59891123d714ce9588ec1f2013cd7af /src/modules/fetcher/libfetch
parent3d8e384820f30633361987301896a0a6a72ce4b0 (diff)
downloadcrawler-4553fe0c3c40eb30648c0e7349f94f1458e5f4b1.tar.gz
crawler-4553fe0c3c40eb30648c0e7349f94f1458e5f4b1.tar.bz2
added spooling to LibFetchRewindInputStream in order to support rewind
(currently memory spooling only, disk spooling will follow a bit later); added MIME detection to crawler, parse links only in HTML
Diffstat (limited to 'src/modules/fetcher/libfetch')
-rw-r--r--src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp85
-rw-r--r--src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp2
2 files changed, 78 insertions, 9 deletions
diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp
index 4e837c8..22d1434 100644
--- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp
+++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp
@@ -5,14 +5,17 @@
#include <algorithm>
#include <string>
#include <cstring>
+#include <stdexcept>
using namespace std;
class libfetch_buffer : public streambuf
{
public:
- explicit libfetch_buffer( fetchIO *io, size_t bufSize = 256, size_t putBack = 1 );
+ explicit libfetch_buffer( fetchIO *io, size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 );
+ void rewind( );
+
private:
int_type underflow( );
@@ -20,11 +23,17 @@ class libfetch_buffer : public streambuf
fetchIO *m_io;
const size_t m_putBack;
vector<char> m_buf;
+ vector<char> m_spoolBuf;
+ size_t m_spoolBufPos;
+ size_t m_spoolBufSize;
+ enum { FROM_SOURCE = 1, FROM_SPOOL_MEMORY = 2 } m_readState;
};
-libfetch_buffer::libfetch_buffer( fetchIO *io, size_t bufSize, size_t putBack )
+libfetch_buffer::libfetch_buffer( fetchIO *io, size_t bufSize, size_t putBack, size_t spoolBufSize )
: m_io( io ), m_putBack( max( putBack, size_t( 1 ) ) ),
- m_buf( max( bufSize, putBack ) + putBack )
+ m_buf( max( bufSize, putBack ) + putBack ),
+ m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ),
+ m_spoolBufSize( 0 ), m_readState( FROM_SOURCE )
{
char *end = &m_buf.front( ) + m_buf.size( );
setg( end, end, end );
@@ -45,12 +54,39 @@ streambuf::int_type libfetch_buffer::underflow( )
start += m_putBack;
}
- // read from source
- ssize_t n = fetchIO_read( m_io, start, m_buf.size( ) - ( start - base ) );
- if( n == 0 ) {
- return traits_type::eof( );
- } else if( n < 0 ) {
- // TODO handle error
+ // read from source or spool (depends on calling rewind)
+ ssize_t n;
+ switch( m_readState ) {
+ case FROM_SOURCE:
+ n = fetchIO_read( m_io, start, m_buf.size( ) - ( start - base ) );
+ if( n == 0 ) {
+ return traits_type::eof( );
+ } else if( n < 0 ) {
+ // TODO handle error
+ }
+
+ // as long we can "spool" to memory, do so..
+ if( m_spoolBufPos + n > m_spoolBuf.size( ) ) {
+ throw runtime_error( "Memory spool buffer exceeded!" );
+ }
+ m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, start, start + n );
+ m_spoolBufSize += n;
+
+ break;
+
+ case FROM_SPOOL_MEMORY:
+ n = min( m_buf.size( ) - ( start - base ), m_spoolBufSize - m_spoolBufPos );
+ if( n == 0 ) {
+ return traits_type::eof( );
+ }
+
+ copy( m_spoolBuf.begin( ) + m_spoolBufPos,
+ m_spoolBuf.begin( ) + m_spoolBufPos + n,
+ m_buf.begin( ) + ( start - base ) );
+
+ m_spoolBufPos += n;
+
+ break;
}
// set pointers
@@ -59,6 +95,23 @@ streambuf::int_type libfetch_buffer::underflow( )
return traits_type::to_int_type( *gptr( ) );
}
+void libfetch_buffer::rewind( ) // restart reading at the beginning; later reads are served from the memory spool
+{
+ switch( m_readState ) {
+ case FROM_SOURCE: // first rewind: underflow( ) stops reading from the fetchIO source from now on
+ m_readState = FROM_SPOOL_MEMORY;
+ break;
+
+ case FROM_SPOOL_MEMORY: // already replaying the spool, the state stays as it is
+ break;
+ }
+
+ char *end = &m_buf.front( ) + m_buf.size( ); // one past the last byte of the get buffer
+ setg( end, end, end ); // empty get area, so the next read has to call underflow( )
+ m_spoolBufPos = 0; // replay the spool from its first byte
+ pubseekpos( 0, ios_base::in ); // NOTE(review): no seekpos( ) override is visible in this diff; with the default streambuf implementation this call fails without moving anything - confirm
+}
+
LibFetchRewindInputStream::LibFetchRewindInputStream( const URL &url )
: RewindInputStream( url ), m_io( 0 ), m_buf( 0 )
{
@@ -77,6 +130,20 @@ LibFetchRewindInputStream::~LibFetchRewindInputStream( )
if( m_io ) fetchIO_close( m_io );
}
+void LibFetchRewindInputStream::rewind( ) // drain the remaining input, then rewind the stream buffer
+{
+ // consume the rest of the web request so that the stream buffer spools all of it
+ enum { CHUNKSIZE = 1024 }; // size of the scratch buffer used for draining
+ char buf[CHUNKSIZE]; // the drained data itself is discarded
+
+ while( good( ) && !eof( ) ) {
+ read( buf, CHUNKSIZE );
+ }
+
+ ios::clear( ); // reset the state flags left behind by reading up to the end
+ m_buf->rewind( ); // NOTE(review): m_buf is initialized to 0 in the constructor's initializer list; presumably it is assigned before rewind( ) can be called - confirm
+}
+
string LibFetchRewindInputStream::lastErrMsg( ) const
{
return fetchLastErrString;
diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
index f1896df..9769561 100644
--- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
+++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
@@ -14,6 +14,8 @@ class LibFetchRewindInputStream : public RewindInputStream
LibFetchRewindInputStream( const URL &url );
virtual ~LibFetchRewindInputStream( );
+ virtual void rewind( );
+
string lastErrMsg( ) const;
private: