diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-08-12 21:28:56 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-08-12 21:28:56 +0200 |
commit | 0f0c3ec26d09f9e905fe8efcc26aa255d037ba4c (patch) | |
tree | ebd76c378f47c5b428e0c0173a1de508e9179762 /src/modules | |
parent | 799ac1861171cd58fc7036b447b931eac8722561 (diff) | |
download | crawler-0f0c3ec26d09f9e905fe8efcc26aa255d037ba4c.tar.gz crawler-0f0c3ec26d09f9e905fe8efcc26aa255d037ba4c.tar.bz2 |
implemented the winhttp fetcher, not working yet
Diffstat (limited to 'src/modules')
-rwxr-xr-x[-rw-r--r--] | src/modules/fetcher/libfetch/LibFetchFetcher.hpp | 3 | ||||
-rwxr-xr-x[-rw-r--r--] | src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp | 2 | ||||
-rwxr-xr-x | src/modules/fetcher/winhttp/Makefile.W32 | 1 | ||||
-rwxr-xr-x | src/modules/fetcher/winhttp/WinHttpFetcher.cpp | 2 | ||||
-rwxr-xr-x | src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp | 238 | ||||
-rwxr-xr-x | src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp | 11 |
6 files changed, 250 insertions, 7 deletions
diff --git a/src/modules/fetcher/libfetch/LibFetchFetcher.hpp b/src/modules/fetcher/libfetch/LibFetchFetcher.hpp index 1103612..902b7ef 100644..100755 --- a/src/modules/fetcher/libfetch/LibFetchFetcher.hpp +++ b/src/modules/fetcher/libfetch/LibFetchFetcher.hpp @@ -13,7 +13,8 @@ class LibFetchFetcher : public Fetcher virtual ~LibFetchFetcher( ) { } - virtual RewindInputStream *fetch( const URL url ); + virtual RewindInputStream *fetch( const URL url ); virtual std::string lastErrMsg( ) const; + }; DECLARE_MODULE( Fetcher ) diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp index 9769561..099c0ae 100644..100755 --- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp +++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp @@ -16,7 +16,7 @@ class LibFetchRewindInputStream : public RewindInputStream virtual void rewind( ); - string lastErrMsg( ) const; + virtual std::string lastErrMsg( ) const; private: fetchIO *m_io; diff --git a/src/modules/fetcher/winhttp/Makefile.W32 b/src/modules/fetcher/winhttp/Makefile.W32 index ddf751a..01ee0e3 100755 --- a/src/modules/fetcher/winhttp/Makefile.W32 +++ b/src/modules/fetcher/winhttp/Makefile.W32 @@ -17,7 +17,6 @@ INCLUDE_LIBS = \ $(TOPDIR)\src\crawlingwolf.lib \ WinHttp.lib - DYNAMIC_MODULE = \ mod_fetcher_winhttp.dll diff --git a/src/modules/fetcher/winhttp/WinHttpFetcher.cpp b/src/modules/fetcher/winhttp/WinHttpFetcher.cpp index 1adc7a0..7f1a63b 100755 --- a/src/modules/fetcher/winhttp/WinHttpFetcher.cpp +++ b/src/modules/fetcher/winhttp/WinHttpFetcher.cpp @@ -28,7 +28,7 @@ WinHttpFetcher::~WinHttpFetcher( ) RewindInputStream *WinHttpFetcher::fetch( const URL url ) { - WinHttpRewindInputStream *s = new WinHttpRewindInputStream( url ); + WinHttpRewindInputStream *s = new WinHttpRewindInputStream( url, this ); return s; } diff --git a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp index 35904c3..1853fdc 100755 --- a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp +++ b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp @@ -1,14 +1,248 @@ #include "WinHttpRewindInputStream.hpp" +#include "WinHttpFetcher.hpp" +#include "Logger.hpp" -WinHttpRewindInputStream::WinHttpRewindInputStream( const URL &url ) - : RewindInputStream( url ) +#include <streambuf> +#include <vector> +#include <algorithm> +#include <string> +#include <cstring> +#include <stdexcept> +#include <cassert> +#include <fstream> +#include <cstring> + +#include "win32/errormsg.hpp" +#include "win32/stringutils.hpp" + +using namespace std; + +class winhttp_buffer : public streambuf +{ + public: + explicit winhttp_buffer( HINTERNET connect, HINTERNET request, size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 ); + + ~winhttp_buffer( ); + + void rewind( ); + + private: + int_type underflow( ); + + private: + HINTERNET m_connect; + HINTERNET m_request; + const size_t m_putBack; + vector<char> m_buf; + vector<char> m_spoolBuf; + size_t m_spoolBufPos; + size_t m_spoolBufSize; + fstream m_spoolFile; + enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state; +}; + +winhttp_buffer::winhttp_buffer( HINTERNET connect, HINTERNET request, size_t bufSize, size_t putBack, size_t spoolBufSize ) + : m_connect( connect ), m_request( request ), m_putBack( max( putBack, size_t( 1 ) ) ), + m_buf( max( bufSize, putBack ) + putBack ), + m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ), + m_spoolBufSize( 0 ), m_state( TO_SPOOL_MEMORY ) +{ + char *end = &m_buf.front( ) + m_buf.size( ); + setg( end, end, end ); +} + +winhttp_buffer::~winhttp_buffer( ) +{ + switch( m_state ) { + case TO_SPOOL_MEMORY: + case FROM_SPOOL_MEMORY: + // memory only, nothing to clean up + break; + + case TO_SPOOL_FILE: + case FROM_SPOOL_FILE: + m_spoolFile.close( ); + (void)remove( "/tmp/spool.tmp" ); + break; + } +} + +streambuf::int_type winhttp_buffer::underflow( ) +{ + // check if buffer is exhausted, if not, return current character + if( gptr( ) < egptr( ) ) + return traits_type::to_int_type( *gptr( ) ); + + char *base = &m_buf.front( ); + char *start = base; + + // move put back away + if( eback( ) == base ) { + memmove( base, egptr( ) - m_putBack, m_putBack ); + start += m_putBack; + } + + // read from source or spool (depends on calling rewind) + DWORD size; + DWORD avail; + DWORD n; + switch( m_state ) { + case TO_SPOOL_MEMORY: + case TO_SPOOL_FILE: + avail = 0; + if( !WinHttpQueryDataAvailable( m_request, &avail ) ) { + // TODO error handling + return traits_type::eof( ); + } + if( avail == 0 ) { + return traits_type::eof( ); + } + + size = min( avail, m_buf.size( ) - ( start - base ) ); + if( !WinHttpReadData( m_request, (LPVOID)start, size, &n ) ) { + // TODO error handling + return traits_type::eof( ); + } + + if( m_state == TO_SPOOL_MEMORY ) { + // as long we can "spool" to memory, do so.. + if( m_spoolBufSize + n <= m_spoolBuf.size( ) ) { + m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, start, start + n ); + m_spoolBufPos += n; + m_spoolBufSize += n; + } else { + // ..otherwise start spooling to disk, write + // current memory spool buffer first.. + LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBuf.size( ) << ")"; + m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc ); + assert( m_spoolFile.good( ) ); + m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize ); + assert( m_spoolFile.good( ) ); + m_state = TO_SPOOL_FILE; + m_spoolFile.write( start, n ); + assert( m_spoolFile.good( ) ); + } + } else { + // we are appending to the spool file + assert( m_spoolFile.good( ) ); + m_spoolFile.write( start, n ); + assert( m_spoolFile.good( ) ); + } + + break; + + case FROM_SPOOL_MEMORY: + n = min( m_buf.size( ) - ( start - base ), m_spoolBufSize - m_spoolBufPos ); + if( n == 0 ) { + return traits_type::eof( ); + } + + copy( m_spoolBuf.begin( ) + m_spoolBufPos, + m_spoolBuf.begin( ) + m_spoolBufPos + n, + m_buf.begin( ) + ( start - base ) ); + + m_spoolBufPos += n; + + break; + + case FROM_SPOOL_FILE: + + n = min( m_buf.size( ) - ( start - base ), m_spoolBufSize - m_spoolBufPos ); + m_spoolFile.read( start, n ); + m_spoolBufPos += n; + if( m_spoolBufPos > m_spoolBufSize ) { + return traits_type::eof( ); + } + if( n == 0 || m_spoolFile.eof( ) ) { + return traits_type::eof( ); + } + + break; + } + + // set pointers + setg( base, start, start + n ); + + return traits_type::to_int_type( *gptr( ) ); +} + +void winhttp_buffer::rewind( ) +{ + switch( m_state ) { + case TO_SPOOL_MEMORY: + m_spoolBufPos = 0; + m_state = FROM_SPOOL_MEMORY; + break; + + case TO_SPOOL_FILE: + m_spoolFile.close( ); + m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::in ); + m_spoolFile.seekg( 0, ios::end ); + m_spoolBufSize = m_spoolFile.tellg( ); + m_spoolFile.seekg( 0, ios::beg ); + m_spoolBufPos = 0; + m_state = FROM_SPOOL_FILE; + break; + + case FROM_SPOOL_MEMORY: + m_spoolBufPos = 0; + break; + + case FROM_SPOOL_FILE: + m_spoolBufPos = 0; + m_spoolFile.seekg( 0, ios::beg ); + break; + } + + char *end = &m_buf.front( ) + m_buf.size( ); + setg( end, end, end ); + pubseekpos( 0, ios_base::in ); +} + +WinHttpRewindInputStream::WinHttpRewindInputStream( const URL &url, WinHttpFetcher *fetcher ) + : RewindInputStream( url ), m_fetcher( fetcher ), m_connect( 0 ), m_request( 0 ), m_buf( 0 ) { + m_connect = WinHttpConnect( m_fetcher->session( ), s2ws( url.host( ) ).c_str( ), + INTERNET_DEFAULT_HTTP_PORT, 0 ); + if( !m_connect ) { + setstate( badbit ); + return; + } + + m_request = WinHttpOpenRequest( m_connect, L"GET", s2ws( url.path( ) ).c_str( ), + NULL, WINHTTP_NO_REFERER, NULL, NULL ); + if( !m_request ) { + setstate( badbit ); + return; + } + + m_buf = new winhttp_buffer( m_connect, m_request ); + rdbuf( m_buf ); } WinHttpRewindInputStream::~WinHttpRewindInputStream( ) { + if( m_buf ) delete m_buf; + if( m_request ) WinHttpCloseHandle( m_request ); + if( m_connect ) WinHttpCloseHandle( m_connect ); } void WinHttpRewindInputStream::rewind( ) { + // consume rest of web request, force spooling in streambuf + enum { CHUNKSIZE = 1024 }; + char buf[CHUNKSIZE]; + + while( good( ) && !eof( ) ) { + read( buf, CHUNKSIZE ); + } + + ios::clear( ); + assert( m_buf != 0 ); + m_buf->rewind( ); +} + +std::string WinHttpRewindInputStream::lastErrMsg( ) const +{ + return getLastError( ); } diff --git a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp index 7c3acfb..7d06792 100755 --- a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp +++ b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp @@ -8,15 +8,24 @@ #include <windows.h> #include <winhttp.h> +class winhttp_buffer; +class WinHttpFetcher; + class WinHttpRewindInputStream : public RewindInputStream { public: - WinHttpRewindInputStream( const URL &url ); + WinHttpRewindInputStream( const URL &url, WinHttpFetcher *fetcher ); virtual ~WinHttpRewindInputStream( ); virtual void rewind( ); + + virtual std::string lastErrMsg( ) const; private: + winhttp_buffer *m_buf; + WinHttpFetcher *m_fetcher; + HINTERNET m_connect; + HINTERNET m_request; }; #endif |