summaryrefslogtreecommitdiff
path: root/src/modules
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-08-12 21:28:56 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-08-12 21:28:56 +0200
commit0f0c3ec26d09f9e905fe8efcc26aa255d037ba4c (patch)
treeebd76c378f47c5b428e0c0173a1de508e9179762 /src/modules
parent799ac1861171cd58fc7036b447b931eac8722561 (diff)
downloadcrawler-0f0c3ec26d09f9e905fe8efcc26aa255d037ba4c.tar.gz
crawler-0f0c3ec26d09f9e905fe8efcc26aa255d037ba4c.tar.bz2
implemented the winhttp fetcher, not working yet
Diffstat (limited to 'src/modules')
-rwxr-xr-x[-rw-r--r--]src/modules/fetcher/libfetch/LibFetchFetcher.hpp3
-rwxr-xr-x[-rw-r--r--]src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp2
-rwxr-xr-xsrc/modules/fetcher/winhttp/Makefile.W321
-rwxr-xr-xsrc/modules/fetcher/winhttp/WinHttpFetcher.cpp2
-rwxr-xr-xsrc/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp238
-rwxr-xr-xsrc/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp11
6 files changed, 250 insertions, 7 deletions
diff --git a/src/modules/fetcher/libfetch/LibFetchFetcher.hpp b/src/modules/fetcher/libfetch/LibFetchFetcher.hpp
index 1103612..902b7ef 100644..100755
--- a/src/modules/fetcher/libfetch/LibFetchFetcher.hpp
+++ b/src/modules/fetcher/libfetch/LibFetchFetcher.hpp
@@ -13,7 +13,8 @@ class LibFetchFetcher : public Fetcher
virtual ~LibFetchFetcher( ) {
}
- virtual RewindInputStream *fetch( const URL url );
+ virtual RewindInputStream *fetch( const URL url ); virtual std::string lastErrMsg( ) const;
+
};
DECLARE_MODULE( Fetcher )
diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
index 9769561..099c0ae 100644..100755
--- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
+++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
@@ -16,7 +16,7 @@ class LibFetchRewindInputStream : public RewindInputStream
virtual void rewind( );
- string lastErrMsg( ) const;
+ virtual std::string lastErrMsg( ) const;
private:
fetchIO *m_io;
diff --git a/src/modules/fetcher/winhttp/Makefile.W32 b/src/modules/fetcher/winhttp/Makefile.W32
index ddf751a..01ee0e3 100755
--- a/src/modules/fetcher/winhttp/Makefile.W32
+++ b/src/modules/fetcher/winhttp/Makefile.W32
@@ -17,7 +17,6 @@ INCLUDE_LIBS = \
$(TOPDIR)\src\crawlingwolf.lib \
WinHttp.lib
-
DYNAMIC_MODULE = \
mod_fetcher_winhttp.dll
diff --git a/src/modules/fetcher/winhttp/WinHttpFetcher.cpp b/src/modules/fetcher/winhttp/WinHttpFetcher.cpp
index 1adc7a0..7f1a63b 100755
--- a/src/modules/fetcher/winhttp/WinHttpFetcher.cpp
+++ b/src/modules/fetcher/winhttp/WinHttpFetcher.cpp
@@ -28,7 +28,7 @@ WinHttpFetcher::~WinHttpFetcher( )
+// Creates a WinHTTP-backed rewindable input stream for the given URL and
+// hands this fetcher to the stream. The stream is allocated with new and
+// returned as a raw pointer; nothing in this function releases it.
+// NOTE(review): ownership presumably passes to the caller - confirm.
RewindInputStream *WinHttpFetcher::fetch( const URL url )
{
- WinHttpRewindInputStream *s = new WinHttpRewindInputStream( url );
+ WinHttpRewindInputStream *s = new WinHttpRewindInputStream( url, this );
return s;
}
diff --git a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp
index 35904c3..1853fdc 100755
--- a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp
+++ b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp
@@ -1,14 +1,248 @@
#include "WinHttpRewindInputStream.hpp"
+#include "WinHttpFetcher.hpp"
+#include "Logger.hpp"
-WinHttpRewindInputStream::WinHttpRewindInputStream( const URL &url )
- : RewindInputStream( url )
+#include <streambuf>
+#include <vector>
+#include <algorithm>
+#include <string>
+#include <cstring>
+#include <stdexcept>
+#include <cassert>
+#include <fstream>
+#include <cstring>
+
+#include "win32/errormsg.hpp"
+#include "win32/stringutils.hpp"
+
+using namespace std;
+
+// Stream buffer that pulls response data from a WinHTTP request handle and
+// keeps a copy ("spool") of everything it has read, so that the data can be
+// replayed from the start after rewind( ). The spool starts in memory
+// (m_spoolBuf) and is meant to continue in the file "/tmp/spool.tmp" once the
+// memory spool is full.
+class winhttp_buffer : public streambuf
+{
+ public:
+ // connect/request: WinHTTP handles; they are stored but not closed by this class.
+ // bufSize: size of the get area, putBack: number of put-back characters
+ // (clamped to at least 1 in the constructor), spoolBufSize: initial size
+ // of the in-memory spool buffer.
+ explicit winhttp_buffer( HINTERNET connect, HINTERNET request, size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 );
+
+ // closes and removes the spool file if one was used
+ ~winhttp_buffer( );
+
+ // switches to reading from the spool and restarts at its beginning
+ void rewind( );
+
+ private:
+ // refills the get area, either from the request or from the spool
+ int_type underflow( );
+
+ private:
+ HINTERNET m_connect;
+ HINTERNET m_request;
+ // must stay declared before m_buf: members are initialized in declaration order
+ const size_t m_putBack;
+ // get area handed to streambuf via setg( )
+ vector<char> m_buf;
+ // in-memory spool of the data read so far
+ vector<char> m_spoolBuf;
+ // current write (while spooling) or read (while replaying) offset in the spool
+ size_t m_spoolBufPos;
+ // number of bytes spooled; recomputed from the file size on rewind( ) of a file spool
+ size_t m_spoolBufSize;
+ fstream m_spoolFile;
+ // TO_* states read from the request and record the data, FROM_* states replay the spool
+ enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state;
+};
+
+// Sets up an empty get area and an empty in-memory spool.
+//
+// connect, request: WinHTTP handles, stored as given.
+// bufSize:          requested size of the get area.
+// putBack:          number of put-back characters, clamped to at least 1.
+// spoolBufSize:     size of the in-memory spool buffer.
+//
+// The get area is sized from the clamped m_putBack (not the raw putBack
+// argument), so it always holds at least one put-back slot plus one data
+// byte, even when both bufSize and putBack are 0. m_putBack is declared
+// before m_buf in the class, so it is already initialized when m_buf is built.
+winhttp_buffer::winhttp_buffer( HINTERNET connect, HINTERNET request, size_t bufSize, size_t putBack, size_t spoolBufSize )
+    : m_connect( connect ), m_request( request ), m_putBack( max( putBack, size_t( 1 ) ) ),
+    m_buf( max( bufSize, m_putBack ) + m_putBack ),
+    m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ),
+    m_spoolBufSize( 0 ), m_state( TO_SPOOL_MEMORY )
+{
+    // all three get pointers at the end of the buffer: the first read
+    // triggers underflow( )
+    char *end = &m_buf.front( ) + m_buf.size( );
+    setg( end, end, end );
+}
+
+// Cleans up the spool: a memory-only spool needs no work, a file spool is
+// closed and its file removed.
+winhttp_buffer::~winhttp_buffer( )
+{
+    const bool usesSpoolFile =
+        ( m_state == TO_SPOOL_FILE ) || ( m_state == FROM_SPOOL_FILE );
+
+    if( !usesSpoolFile ) {
+        // memory only, nothing to clean up
+        return;
+    }
+
+    m_spoolFile.close( );
+    (void)remove( "/tmp/spool.tmp" );
+}
+
+// Refills the get area once it is exhausted.
+//
+// In the TO_SPOOL_* states the data comes from the WinHTTP request and a copy
+// is appended to the spool (memory first, then the spool file). In the
+// FROM_SPOOL_* states the previously spooled data is replayed.
+//
+// Returns the next character without consuming it, or traits_type::eof( )
+// when no more data is available or a WinHTTP call fails.
+streambuf::int_type winhttp_buffer::underflow( )
+{
+    // check if buffer is exhausted, if not, return current character
+    if( gptr( ) < egptr( ) )
+        return traits_type::to_int_type( *gptr( ) );
+
+    char *base = &m_buf.front( );
+    char *start = base;
+
+    // keep the last m_putBack characters in front of the new data
+    if( eback( ) == base ) {
+        memmove( base, egptr( ) - m_putBack, m_putBack );
+        start += m_putBack;
+    }
+
+    // free space behind the put-back area
+    const size_t room = m_buf.size( ) - static_cast<size_t>( start - base );
+    DWORD n = 0;
+
+    // read from source or spool (depends on calling rewind)
+    switch( m_state ) {
+        case TO_SPOOL_MEMORY:
+        case TO_SPOOL_FILE: {
+            DWORD avail = 0;
+            if( !WinHttpQueryDataAvailable( m_request, &avail ) ) {
+                // TODO error handling
+                return traits_type::eof( );
+            }
+            if( avail == 0 ) {
+                return traits_type::eof( );
+            }
+
+            // DWORD and size_t are different types, so min needs an
+            // explicit template argument; the result fits a DWORD because
+            // it is bounded by avail
+            const DWORD size = static_cast<DWORD>( min<size_t>( avail, room ) );
+            if( !WinHttpReadData( m_request, static_cast<LPVOID>( start ), size, &n ) ) {
+                // TODO error handling
+                return traits_type::eof( );
+            }
+            if( n == 0 ) {
+                // nothing was read: do not expose an empty get area
+                return traits_type::eof( );
+            }
+
+            if( m_state == TO_SPOOL_MEMORY ) {
+                // as long we can "spool" to memory, do so..
+                if( m_spoolBufSize + n <= m_spoolBuf.size( ) ) {
+                    // overwrite in place; inserting would grow the vector
+                    // and the size limit would never be reached
+                    copy( start, start + n, m_spoolBuf.begin( ) + m_spoolBufPos );
+                    m_spoolBufPos += n;
+                    m_spoolBufSize += n;
+                } else {
+                    // ..otherwise start spooling to disk, write
+                    // current memory spool buffer first..
+                    // NOTE(review): failures are only caught by assert,
+                    // which is compiled out with NDEBUG
+                    LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBuf.size( ) << ")";
+                    m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc );
+                    assert( m_spoolFile.good( ) );
+                    if( m_spoolBufSize > 0 ) {
+                        m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize );
+                        assert( m_spoolFile.good( ) );
+                    }
+                    m_state = TO_SPOOL_FILE;
+                    m_spoolFile.write( start, n );
+                    assert( m_spoolFile.good( ) );
+                }
+            } else {
+                // we are appending to the spool file
+                assert( m_spoolFile.good( ) );
+                m_spoolFile.write( start, n );
+                assert( m_spoolFile.good( ) );
+            }
+
+            break;
+        }
+
+        case FROM_SPOOL_MEMORY: {
+            const size_t remaining = m_spoolBufSize - m_spoolBufPos;
+            n = static_cast<DWORD>( min( room, remaining ) );
+            if( n == 0 ) {
+                return traits_type::eof( );
+            }
+
+            copy( m_spoolBuf.begin( ) + m_spoolBufPos,
+                m_spoolBuf.begin( ) + m_spoolBufPos + n,
+                start );
+
+            m_spoolBufPos += n;
+
+            break;
+        }
+
+        case FROM_SPOOL_FILE: {
+            const size_t remaining = m_spoolBufSize - m_spoolBufPos;
+            const size_t wanted = min( room, remaining );
+            if( wanted == 0 ) {
+                return traits_type::eof( );
+            }
+
+            m_spoolFile.read( start, static_cast<streamsize>( wanted ) );
+            // use the number of bytes actually delivered by the file
+            n = static_cast<DWORD>( m_spoolFile.gcount( ) );
+            if( n == 0 ) {
+                return traits_type::eof( );
+            }
+            m_spoolBufPos += n;
+
+            break;
+        }
+    }
+
+    // set pointers
+    setg( base, start, start + n );
+
+    return traits_type::to_int_type( *gptr( ) );
+}
+
+// Switches the buffer to replaying the spool from its beginning. A buffer
+// that was still spooling changes to the matching FROM_SPOOL_* state; for a
+// file spool the file is reopened for reading and its size becomes the
+// spool size. The get area is emptied so the next read calls underflow( ).
+void winhttp_buffer::rewind( )
+{
+    if( m_state == TO_SPOOL_MEMORY ) {
+        m_state = FROM_SPOOL_MEMORY;
+    } else if( m_state == TO_SPOOL_FILE ) {
+        // reopen the spool file read-only and measure it
+        m_spoolFile.close( );
+        m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::in );
+        m_spoolFile.seekg( 0, ios::end );
+        m_spoolBufSize = m_spoolFile.tellg( );
+        m_state = FROM_SPOOL_FILE;
+    }
+
+    // a file spool additionally needs its read position reset
+    if( m_state == FROM_SPOOL_FILE ) {
+        m_spoolFile.seekg( 0, ios::beg );
+    }
+    m_spoolBufPos = 0;
+
+    // empty get area
+    char *bufEnd = &m_buf.front( ) + m_buf.size( );
+    setg( bufEnd, bufEnd, bufEnd );
+    pubseekpos( 0, ios_base::in );
+}
+
+// Opens a connection to the host of url on the default HTTP port, issues a
+// GET request for the path of url and attaches a winhttp_buffer reading the
+// response body.
+//
+// url:     location to fetch (host( ) and path( ) are used).
+// fetcher: provides the WinHTTP session handle via session( ).
+//
+// On any WinHTTP failure badbit is set and the stream stays without buffer
+// (m_buf == 0); handles opened so far are closed by the destructor.
+WinHttpRewindInputStream::WinHttpRewindInputStream( const URL &url, WinHttpFetcher *fetcher )
+    : RewindInputStream( url ), m_buf( 0 ), m_fetcher( fetcher ), m_connect( 0 ), m_request( 0 )
{
+    m_connect = WinHttpConnect( m_fetcher->session( ), s2ws( url.host( ) ).c_str( ),
+        INTERNET_DEFAULT_HTTP_PORT, 0 );
+    if( !m_connect ) {
+        setstate( badbit );
+        return;
+    }
+
+    m_request = WinHttpOpenRequest( m_connect, L"GET", s2ws( url.path( ) ).c_str( ),
+        NULL, WINHTTP_NO_REFERER, WINHTTP_DEFAULT_ACCEPT_TYPES, 0 );
+    if( !m_request ) {
+        setstate( badbit );
+        return;
+    }
+
+    // the request must be sent and the response headers received before
+    // WinHttpQueryDataAvailable/WinHttpReadData can deliver any data
+    if( !WinHttpSendRequest( m_request, WINHTTP_NO_ADDITIONAL_HEADERS, 0,
+        WINHTTP_NO_REQUEST_DATA, 0, 0, 0 ) ) {
+        setstate( badbit );
+        return;
+    }
+
+    if( !WinHttpReceiveResponse( m_request, NULL ) ) {
+        setstate( badbit );
+        return;
+    }
+
+    m_buf = new winhttp_buffer( m_connect, m_request );
+    rdbuf( m_buf );
}
WinHttpRewindInputStream::~WinHttpRewindInputStream( )
{
+    // release in reverse order of acquisition: buffer, request, connection;
+    // deleting a null pointer is a no-op, so m_buf needs no check
+    delete m_buf;
+
+    if( m_request != 0 ) {
+        WinHttpCloseHandle( m_request );
+    }
+
+    if( m_connect != 0 ) {
+        WinHttpCloseHandle( m_connect );
+    }
}
void WinHttpRewindInputStream::rewind( )
{
+ // consume rest of web request, force spooling in streambuf
+ enum { CHUNKSIZE = 1024 };
+ char buf[CHUNKSIZE];
+
+ while( good( ) && !eof( ) ) {
+ read( buf, CHUNKSIZE );
+ }
+
+ ios::clear( );
+ assert( m_buf != 0 );
+ m_buf->rewind( );
+}
+
+// Returns the message produced by getLastError( ).
+// NOTE(review): getLastError presumably comes from win32/errormsg.hpp and
+// describes the last Win32 error - confirm.
+std::string WinHttpRewindInputStream::lastErrMsg( ) const
+{
+    std::string msg = getLastError( );
+    return msg;
}
diff --git a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp
index 7c3acfb..7d06792 100755
--- a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp
+++ b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp
@@ -8,15 +8,24 @@
#include <windows.h>
#include <winhttp.h>
+class winhttp_buffer;
+class WinHttpFetcher;
+
+// Rewindable input stream that reads the body of an HTTP GET response
+// through WinHTTP; rewind( ) restarts reading at the beginning of the data.
class WinHttpRewindInputStream : public RewindInputStream
{
public:
- WinHttpRewindInputStream( const URL &url );
+ // fetcher supplies the WinHTTP session handle used to connect
+ WinHttpRewindInputStream( const URL &url, WinHttpFetcher *fetcher );
virtual ~WinHttpRewindInputStream( );
virtual void rewind( );
+
+ virtual std::string lastErrMsg( ) const;
private:
+ // stream buffer created in the constructor and deleted in the destructor;
+ // stays 0 when the constructor fails
+ winhttp_buffer *m_buf;
+ // not owned; only used to obtain the session handle
+ WinHttpFetcher *m_fetcher;
+ // WinHTTP connection and request handles, closed in the destructor
+ HINTERNET m_connect;
+ HINTERNET m_request;
};
#endif