summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-08-17 16:32:44 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-08-17 16:32:44 +0200
commitc0b159e9f992c70921eb5ca4c0f0f6d448cb9a65 (patch)
treec3795373d68b48d1bd6f334289e131dd3731d80f
parent9de77404b9fea20dc6a43b6514fa6d3f534c5070 (diff)
downloadcrawler-c0b159e9f992c70921eb5ca4c0f0f6d448cb9a65.tar.gz
crawler-c0b159e9f992c70921eb5ca4c0f0f6d448cb9a65.tar.bz2
added a common base class for spooling rewind input stream, adapted
libfetch rewind input stream to use that one
-rw-r--r--.gitignore2
-rw-r--r--TODOS2
-rw-r--r--docs/LINKS1
-rwxr-xr-xsrc/GNUmakefile3
-rwxr-xr-xsrc/Makefile.W323
-rw-r--r--src/SpoolRewindInputStream.cpp181
-rwxr-xr-xsrc/SpoolRewindInputStream.hpp51
-rw-r--r--src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp189
-rwxr-xr-xsrc/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp10
9 files changed, 257 insertions, 185 deletions
diff --git a/.gitignore b/.gitignore
index 9597a02..258f6e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,6 @@
tests/*/test1
tests/*/test2
tests/*/test3
-src/crawlingwolf
+src/crawler
makefiles/gmake/platform.mk.vars
makefiles/gmake/platform.vars
diff --git a/TODOS b/TODOS
index 5603cce..47ad6ae 100644
--- a/TODOS
+++ b/TODOS
@@ -2,4 +2,6 @@
loadable modules on Windows, example is the logger singleton currently
- common spooling code in RewindInputStream must be extracted and
used in a cascade of streams (or streambufs?)
+- use traits in rewindinputstream, alternative wrappers for char/string
+ traits depending on underlying io stream
diff --git a/docs/LINKS b/docs/LINKS
index 05bf9f2..e9dd9f8 100644
--- a/docs/LINKS
+++ b/docs/LINKS
@@ -67,4 +67,5 @@ network access libraries
libCurl
libfetch
WinHTTP
+http://www.pcs.cnu.edu/~dgame/sockets/socketsC++/sockets.html
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 0fad567..9cc0f48 100755
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -25,7 +25,8 @@ endif
LOCAL_STATIC_LIB_OBJS = \
URL.o \
- MIMEType.o
+ MIMEType.o \
+ SpoolRewindInputStream.o
CPP_OBJS = \
$(LOCAL_STATIC_LIB_OBJS)
diff --git a/src/Makefile.W32 b/src/Makefile.W32
index 6231f11..5ce3b11 100755
--- a/src/Makefile.W32
+++ b/src/Makefile.W32
@@ -18,7 +18,8 @@ LOCAL_STATIC_LIB_OBJS = \
win32\errormsg.obj \
win32\stringutils.obj \
URL.obj \
- MIMEType.obj
+ MIMEType.obj \
+ SpoolRewindInputStream.obj
LOCAL_STATIC_LIB = \
crawler.lib
diff --git a/src/SpoolRewindInputStream.cpp b/src/SpoolRewindInputStream.cpp
new file mode 100644
index 0000000..92edc4b
--- /dev/null
+++ b/src/SpoolRewindInputStream.cpp
@@ -0,0 +1,181 @@
+#include "SpoolRewindInputStream.hpp"
+#include "Logger.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <cassert>
+
+using namespace std;
+
+spool_streambuf::spool_streambuf( size_t bufSize, size_t putBack, size_t spoolBufSize )
+ : m_putBack( max( putBack, size_t( 1 ) ) ),
+ m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ),
+ m_spoolBufSize( 0 ), m_state( TO_SPOOL_MEMORY ),
+ m_buf( max( bufSize, putBack ) + putBack ),
+ m_base( 0 ), m_start( 0 )
+{
+ char *end = &m_buf.front( ) + m_buf.size( );
+ setg( end, end, end );
+}
+
+spool_streambuf::~spool_streambuf( )
+{
+ switch( m_state ) {
+ case TO_SPOOL_MEMORY:
+ case FROM_SPOOL_MEMORY:
+ // memory only, nothing to clean up
+ break;
+
+ case TO_SPOOL_FILE:
+ case FROM_SPOOL_FILE:
+ m_spoolFile.close( );
+ (void)remove( "/tmp/spool.tmp" );
+ break;
+ }
+}
+
+streambuf::int_type spool_streambuf::underflow( )
+{
+ // check if buffer is exhausted, if not, return current character
+ if( gptr( ) < egptr( ) )
+ return traits_type::to_int_type( *gptr( ) );
+
+ m_base = &m_buf.front( );
+ m_start = m_base;
+
+ // move put back away
+ if( eback( ) == m_base ) {
+ memmove( m_base, egptr( ) - m_putBack, m_putBack );
+ m_start += m_putBack;
+ }
+
+ // read from source or spool (depends on calling rewind)
+ streambuf::int_type n;
+ switch( m_state ) {
+ case TO_SPOOL_MEMORY:
+ case TO_SPOOL_FILE:
+ n = readFromSource( );
+ if( n == 0 ) {
+ return traits_type::eof( );
+ } else if( n < 0 ) {
+ // TODO handle error
+ }
+
+ if( m_state == TO_SPOOL_MEMORY ) {
+ // as long we can "spool" to memory, do so..
+ if( m_spoolBufSize + n <= m_spoolBuf.size( ) ) {
+ m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, m_start, m_start + n );
+ m_spoolBufPos += n;
+ m_spoolBufSize += n;
+ } else {
+ // ..otherwise start spooling to disk, write
+ // current memory spool buffer first..
+ LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBuf.size( ) << ")";
+ m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc );
+ assert( m_spoolFile.good( ) );
+ m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize );
+ assert( m_spoolFile.good( ) );
+ m_state = TO_SPOOL_FILE;
+ m_spoolFile.write( m_start, n );
+ assert( m_spoolFile.good( ) );
+ }
+ } else {
+ // we are appending to the spool file
+ assert( m_spoolFile.good( ) );
+ m_spoolFile.write( m_start, n );
+ assert( m_spoolFile.good( ) );
+ }
+
+ break;
+
+ case FROM_SPOOL_MEMORY:
+ n = min( m_buf.size( ) - ( m_start - m_base ), m_spoolBufSize - m_spoolBufPos );
+ if( n == 0 ) {
+ return traits_type::eof( );
+ }
+
+ copy( m_spoolBuf.begin( ) + m_spoolBufPos,
+ m_spoolBuf.begin( ) + m_spoolBufPos + n,
+ m_buf.begin( ) + ( m_start - m_base ) );
+
+ m_spoolBufPos += n;
+
+ break;
+
+ case FROM_SPOOL_FILE:
+
+ n = min( m_buf.size( ) - ( m_start - m_base ), m_spoolBufSize - m_spoolBufPos );
+ m_spoolFile.read( m_start, n );
+ m_spoolBufPos += n;
+ if( m_spoolBufPos > m_spoolBufSize ) {
+ return traits_type::eof( );
+ }
+ if( n == 0 || m_spoolFile.eof( ) ) {
+ return traits_type::eof( );
+ }
+
+ break;
+ }
+
+ // set pointers
+ setg( m_base, m_start, m_start + n );
+
+ return traits_type::to_int_type( *gptr( ) );
+}
+
+void spool_streambuf::rewind( )
+{
+ switch( m_state ) {
+ case TO_SPOOL_MEMORY:
+ m_spoolBufPos = 0;
+ m_state = FROM_SPOOL_MEMORY;
+ break;
+
+ case TO_SPOOL_FILE:
+ m_spoolFile.close( );
+ m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::in );
+ m_spoolFile.seekg( 0, ios::end );
+ m_spoolBufSize = m_spoolFile.tellg( );
+ m_spoolFile.seekg( 0, ios::beg );
+ m_spoolBufPos = 0;
+ m_state = FROM_SPOOL_FILE;
+ break;
+
+ case FROM_SPOOL_MEMORY:
+ m_spoolBufPos = 0;
+ break;
+
+ case FROM_SPOOL_FILE:
+ m_spoolBufPos = 0;
+ m_spoolFile.seekg( 0, ios::beg );
+ break;
+ }
+
+ char *end = &m_buf.front( ) + m_buf.size( );
+ setg( end, end, end );
+ pubseekpos( 0, ios_base::in );
+}
+
+SpoolRewindInputStream::SpoolRewindInputStream( const URL &url )
+ : RewindInputStream( url ), m_buf( 0 )
+{
+}
+
+SpoolRewindInputStream::~SpoolRewindInputStream( )
+{
+}
+
+void SpoolRewindInputStream::rewind( )
+{
+ // consume rest of web request, force spooling in streambuf
+ enum { CHUNKSIZE = 1024 };
+ char buf[CHUNKSIZE];
+
+ while( good( ) && !eof( ) ) {
+ read( buf, CHUNKSIZE );
+ }
+
+ ios::clear( );
+ assert( m_buf != 0 );
+ m_buf->rewind( );
+}
diff --git a/src/SpoolRewindInputStream.hpp b/src/SpoolRewindInputStream.hpp
new file mode 100755
index 0000000..aff593d
--- /dev/null
+++ b/src/SpoolRewindInputStream.hpp
@@ -0,0 +1,51 @@
+#ifndef __SPOOLREWINDINPUTSTREAM_H
+#define __SPOOLREWINDINPUTSTREAM_H
+
+#include "RewindInputStream.hpp"
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+class spool_streambuf : public std::streambuf
+{
+ public:
+ explicit spool_streambuf( size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 );
+
+ ~spool_streambuf( );
+
+ void rewind( );
+
+ protected:
+ virtual std::streambuf::int_type readFromSource( ) = 0;
+
+ private:
+ int_type underflow( );
+
+ private:
+ const size_t m_putBack;
+ std::vector<char> m_spoolBuf;
+ size_t m_spoolBufPos;
+ size_t m_spoolBufSize;
+ std::fstream m_spoolFile;
+ enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state;
+
+ protected:
+ std::vector<char> m_buf;
+ char *m_base;
+ char *m_start;
+};
+
+class SpoolRewindInputStream : public RewindInputStream
+{
+ public:
+ SpoolRewindInputStream( const URL &url );
+ virtual ~SpoolRewindInputStream( );
+
+ virtual void rewind( );
+
+ protected:
+ spool_streambuf *m_buf;
+};
+
+#endif
diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp
index 19c0f60..ee606fe 100644
--- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp
+++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.cpp
@@ -1,192 +1,48 @@
#include "LibFetchRewindInputStream.hpp"
#include "Logger.hpp"
+#include "SpoolRewindInputStream.hpp"
-#include <streambuf>
-#include <vector>
-#include <algorithm>
#include <string>
-#include <cstring>
#include <stdexcept>
-#include <cassert>
-#include <fstream>
#include <cstring>
#include <errno.h>
using namespace std;
-class libfetch_buffer : public streambuf
+class libfetch_buffer : public spool_streambuf
{
public:
+
explicit libfetch_buffer( fetchIO *io, size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 );
-
- ~libfetch_buffer( );
-
- void rewind( );
- private:
- int_type underflow( );
+ protected:
+ virtual streambuf::int_type readFromSource( );
+
private:
fetchIO *m_io;
- const size_t m_putBack;
- vector<char> m_buf;
- vector<char> m_spoolBuf;
- size_t m_spoolBufPos;
- size_t m_spoolBufSize;
- fstream m_spoolFile;
- enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state;
};
libfetch_buffer::libfetch_buffer( fetchIO *io, size_t bufSize, size_t putBack, size_t spoolBufSize )
- : m_io( io ), m_putBack( max( putBack, size_t( 1 ) ) ),
- m_buf( max( bufSize, putBack ) + putBack ),
- m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ),
- m_spoolBufSize( 0 ), m_state( TO_SPOOL_MEMORY )
+ : spool_streambuf( bufSize, putBack, spoolBufSize ), m_io( io )
{
- char *end = &m_buf.front( ) + m_buf.size( );
- setg( end, end, end );
-}
-
-libfetch_buffer::~libfetch_buffer( )
-{
- switch( m_state ) {
- case TO_SPOOL_MEMORY:
- case FROM_SPOOL_MEMORY:
- // memory only, nothing to clean up
- break;
-
- case TO_SPOOL_FILE:
- case FROM_SPOOL_FILE:
- m_spoolFile.close( );
- (void)remove( "/tmp/spool.tmp" );
- break;
- }
}
-streambuf::int_type libfetch_buffer::underflow( )
+streambuf::int_type libfetch_buffer::readFromSource( )
{
- // check if buffer is exhausted, if not, return current character
- if( gptr( ) < egptr( ) )
- return traits_type::to_int_type( *gptr( ) );
-
- char *base = &m_buf.front( );
- char *start = base;
-
- // move put back away
- if( eback( ) == base ) {
- memmove( base, egptr( ) - m_putBack, m_putBack );
- start += m_putBack;
- }
-
- // read from source or spool (depends on calling rewind)
ssize_t n;
- switch( m_state ) {
- case TO_SPOOL_MEMORY:
- case TO_SPOOL_FILE:
- n = fetchIO_read( m_io, start, m_buf.size( ) - ( start - base ) );
- if( n == 0 ) {
- return traits_type::eof( );
- } else if( n < 0 ) {
- // TODO handle error
- }
-
- if( m_state == TO_SPOOL_MEMORY ) {
- // as long we can "spool" to memory, do so..
- if( m_spoolBufSize + n <= m_spoolBuf.size( ) ) {
- m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, start, start + n );
- m_spoolBufPos += n;
- m_spoolBufSize += n;
- } else {
- // ..otherwise start spooling to disk, write
- // current memory spool buffer first..
- LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBuf.size( ) << ")";
- m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc );
- assert( m_spoolFile.good( ) );
- m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize );
- assert( m_spoolFile.good( ) );
- m_state = TO_SPOOL_FILE;
- m_spoolFile.write( start, n );
- assert( m_spoolFile.good( ) );
- }
- } else {
- // we are appending to the spool file
- assert( m_spoolFile.good( ) );
- m_spoolFile.write( start, n );
- assert( m_spoolFile.good( ) );
- }
-
- break;
-
- case FROM_SPOOL_MEMORY:
- n = min( m_buf.size( ) - ( start - base ), m_spoolBufSize - m_spoolBufPos );
- if( n == 0 ) {
- return traits_type::eof( );
- }
-
- copy( m_spoolBuf.begin( ) + m_spoolBufPos,
- m_spoolBuf.begin( ) + m_spoolBufPos + n,
- m_buf.begin( ) + ( start - base ) );
-
- m_spoolBufPos += n;
-
- break;
- case FROM_SPOOL_FILE:
-
- n = min( m_buf.size( ) - ( start - base ), m_spoolBufSize - m_spoolBufPos );
- m_spoolFile.read( start, n );
- m_spoolBufPos += n;
- if( m_spoolBufPos > m_spoolBufSize ) {
- return traits_type::eof( );
- }
- if( n == 0 || m_spoolFile.eof( ) ) {
- return traits_type::eof( );
- }
-
- break;
- }
-
- // set pointers
- setg( base, start, start + n );
-
- return traits_type::to_int_type( *gptr( ) );
-}
-
-void libfetch_buffer::rewind( )
-{
- switch( m_state ) {
- case TO_SPOOL_MEMORY:
- m_spoolBufPos = 0;
- m_state = FROM_SPOOL_MEMORY;
- break;
-
- case TO_SPOOL_FILE:
- m_spoolFile.close( );
- m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::in );
- m_spoolFile.seekg( 0, ios::end );
- m_spoolBufSize = m_spoolFile.tellg( );
- m_spoolFile.seekg( 0, ios::beg );
- m_spoolBufPos = 0;
- m_state = FROM_SPOOL_FILE;
- break;
-
- case FROM_SPOOL_MEMORY:
- m_spoolBufPos = 0;
- break;
-
- case FROM_SPOOL_FILE:
- m_spoolBufPos = 0;
- m_spoolFile.seekg( 0, ios::beg );
- break;
+ n = fetchIO_read( m_io, m_start, m_buf.size( ) - ( m_start - m_base ) );
+ if( n == 0 ) {
+ return traits_type::eof( );
+ } else if( n < 0 ) {
+ // TODO handle error
}
-
- char *end = &m_buf.front( ) + m_buf.size( );
- setg( end, end, end );
- pubseekpos( 0, ios_base::in );
+ return n;
}
LibFetchRewindInputStream::LibFetchRewindInputStream( const URL &url )
- : RewindInputStream( url ), m_io( 0 ), m_buf( 0 )
+ : SpoolRewindInputStream( url ), m_io( 0 )
{
m_io = fetchGetURL( url.str( ).c_str( ), "" );
if( m_io == NULL ) {
@@ -203,21 +59,6 @@ LibFetchRewindInputStream::~LibFetchRewindInputStream( )
if( m_io ) fetchIO_close( m_io );
}
-void LibFetchRewindInputStream::rewind( )
-{
- // consume rest of web request, force spooling in streambuf
- enum { CHUNKSIZE = 1024 };
- char buf[CHUNKSIZE];
-
- while( good( ) && !eof( ) ) {
- read( buf, CHUNKSIZE );
- }
-
- ios::clear( );
- assert( m_buf != 0 );
- m_buf->rewind( );
-}
-
string LibFetchRewindInputStream::lastErrMsg( ) const
{
return fetchLastErrString;
diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
index 099c0ae..9347b77 100755
--- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
+++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
@@ -1,26 +1,20 @@
#ifndef __LIBFETCH_REWIND_INPUT_STREAM_H
#define __LIBFETCH_REWIND_INPUT_STREAM_H
-#include "RewindInputStream.hpp"
-#include "URL.hpp"
+#include "SpoolRewindInputStream.hpp"
#include "fetch.h"
-class libfetch_buffer;
-
-class LibFetchRewindInputStream : public RewindInputStream
+class LibFetchRewindInputStream : public SpoolRewindInputStream
{
public:
LibFetchRewindInputStream( const URL &url );
virtual ~LibFetchRewindInputStream( );
- virtual void rewind( );
-
virtual std::string lastErrMsg( ) const;
private:
fetchIO *m_io;
- libfetch_buffer *m_buf;
};
#endif