summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-08-12 21:28:56 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-08-12 21:28:56 +0200
commit0f0c3ec26d09f9e905fe8efcc26aa255d037ba4c (patch)
treeebd76c378f47c5b428e0c0173a1de508e9179762
parent799ac1861171cd58fc7036b447b931eac8722561 (diff)
downloadcrawler-0f0c3ec26d09f9e905fe8efcc26aa255d037ba4c.tar.gz
crawler-0f0c3ec26d09f9e905fe8efcc26aa255d037ba4c.tar.bz2
implemented the winhttp fetcher, not working yet
-rw-r--r--BUGS4
-rwxr-xr-x[-rw-r--r--]src/Fetcher.hpp0
-rwxr-xr-xsrc/Makefile.W321
-rwxr-xr-xsrc/RewindInputStream.hpp3
-rwxr-xr-x[-rw-r--r--]src/modules/fetcher/libfetch/LibFetchFetcher.hpp3
-rwxr-xr-x[-rw-r--r--]src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp2
-rwxr-xr-xsrc/modules/fetcher/winhttp/Makefile.W321
-rwxr-xr-xsrc/modules/fetcher/winhttp/WinHttpFetcher.cpp2
-rwxr-xr-xsrc/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp238
-rwxr-xr-xsrc/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp11
-rwxr-xr-xsrc/win32/stringutils.cpp21
-rwxr-xr-xsrc/win32/stringutils.hpp8
12 files changed, 287 insertions, 7 deletions
diff --git a/BUGS b/BUGS
index 22d23da..596dd88 100644
--- a/BUGS
+++ b/BUGS
@@ -4,3 +4,7 @@ Windows:
which needs /DSHARED to be present for the static and the DLL version
of the module). This has to be fixed deep in the NMAKE build system.
Also this requires the derivation of names like $(OBJS:.c:.obj).
+ Maybe the idea of one header and one C file per module, combined with
+ the /DSHARED switch, is also not a good one. Push this to the makefile,
+ so that building *.dll or *.so pulls in a small additional file
+ XXXModule.cpp.
diff --git a/src/Fetcher.hpp b/src/Fetcher.hpp
index 40f1c7a..40f1c7a 100644..100755
--- a/src/Fetcher.hpp
+++ b/src/Fetcher.hpp
diff --git a/src/Makefile.W32 b/src/Makefile.W32
index c44711d..3e1c8c8 100755
--- a/src/Makefile.W32
+++ b/src/Makefile.W32
@@ -16,6 +16,7 @@ INCLUDE_LIBS = \
LOCAL_STATIC_LIB_OBJS = \
win32\errormsg.obj \
+ win32\stringutils.obj \
URL.obj \
MIMEType.obj
diff --git a/src/RewindInputStream.hpp b/src/RewindInputStream.hpp
index 0acfc9d..39d7b6e 100755
--- a/src/RewindInputStream.hpp
+++ b/src/RewindInputStream.hpp
@@ -4,6 +4,7 @@
#include "URL.hpp"
#include <iostream>
+#include <string>
class RewindInputStream : public std::istream {
public:
@@ -14,6 +15,8 @@ class RewindInputStream : public std::istream {
virtual void rewind( ) = 0;
+ virtual std::string lastErrMsg( ) const = 0;
+
protected:
RewindInputStream( const URL &url )
diff --git a/src/modules/fetcher/libfetch/LibFetchFetcher.hpp b/src/modules/fetcher/libfetch/LibFetchFetcher.hpp
index 1103612..902b7ef 100644..100755
--- a/src/modules/fetcher/libfetch/LibFetchFetcher.hpp
+++ b/src/modules/fetcher/libfetch/LibFetchFetcher.hpp
@@ -13,7 +13,8 @@ class LibFetchFetcher : public Fetcher
virtual ~LibFetchFetcher( ) {
}
- virtual RewindInputStream *fetch( const URL url );
+ virtual RewindInputStream *fetch( const URL url ); virtual std::string lastErrMsg( ) const;
+
};
DECLARE_MODULE( Fetcher )
diff --git a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
index 9769561..099c0ae 100644..100755
--- a/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
+++ b/src/modules/fetcher/libfetch/LibFetchRewindInputStream.hpp
@@ -16,7 +16,7 @@ class LibFetchRewindInputStream : public RewindInputStream
virtual void rewind( );
- string lastErrMsg( ) const;
+ virtual std::string lastErrMsg( ) const;
private:
fetchIO *m_io;
diff --git a/src/modules/fetcher/winhttp/Makefile.W32 b/src/modules/fetcher/winhttp/Makefile.W32
index ddf751a..01ee0e3 100755
--- a/src/modules/fetcher/winhttp/Makefile.W32
+++ b/src/modules/fetcher/winhttp/Makefile.W32
@@ -17,7 +17,6 @@ INCLUDE_LIBS = \
$(TOPDIR)\src\crawlingwolf.lib \
WinHttp.lib
-
DYNAMIC_MODULE = \
mod_fetcher_winhttp.dll
diff --git a/src/modules/fetcher/winhttp/WinHttpFetcher.cpp b/src/modules/fetcher/winhttp/WinHttpFetcher.cpp
index 1adc7a0..7f1a63b 100755
--- a/src/modules/fetcher/winhttp/WinHttpFetcher.cpp
+++ b/src/modules/fetcher/winhttp/WinHttpFetcher.cpp
@@ -28,7 +28,7 @@ WinHttpFetcher::~WinHttpFetcher( )
RewindInputStream *WinHttpFetcher::fetch( const URL url )
{
- WinHttpRewindInputStream *s = new WinHttpRewindInputStream( url );
+ WinHttpRewindInputStream *s = new WinHttpRewindInputStream( url, this ); // the stream takes the WinHTTP session from this fetcher
return s;
}
diff --git a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp
index 35904c3..1853fdc 100755
--- a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp
+++ b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.cpp
@@ -1,14 +1,248 @@
#include "WinHttpRewindInputStream.hpp"
+#include "WinHttpFetcher.hpp"
+#include "Logger.hpp"
-WinHttpRewindInputStream::WinHttpRewindInputStream( const URL &url )
- : RewindInputStream( url )
+#include <streambuf>
+#include <vector>
+#include <algorithm>
+#include <string>
+#include <cstring>
+#include <stdexcept>
+#include <cassert>
+#include <fstream>
+#include <cstring>
+
+#include "win32/errormsg.hpp"
+#include "win32/stringutils.hpp"
+
+using namespace std;
+
+class winhttp_buffer : public streambuf // input buffer over a WinHTTP request; keeps a copy ("spool") of what it reads so it can be replayed after rewind( )
+{
+ public:
+ explicit winhttp_buffer( HINTERNET connect, HINTERNET request, size_t bufSize = 256, size_t putBack = 1, size_t spoolBufSize = 8192 );
+
+ ~winhttp_buffer( );
+
+ void rewind( ); // switches from recording to replaying the spool from its first byte
+
+ private:
+ int_type underflow( ); // refills the get area from the request or from the spool, depending on m_state
+
+ private:
+ HINTERNET m_connect; // stored by the constructor only, not used by the buffer itself
+ HINTERNET m_request; // handle the data is read from (WinHttpQueryDataAvailable/WinHttpReadData)
+ const size_t m_putBack; // number of characters kept for put back, at least 1
+ vector<char> m_buf; // storage of the get area handed to setg( )
+ vector<char> m_spoolBuf; // in-memory spool of the data read so far
+ size_t m_spoolBufPos; // write offset while recording, read offset while replaying
+ size_t m_spoolBufSize; // bytes in the memory spool, or size of the spool file after rewind( )
+ fstream m_spoolFile; // spool file, opened once the memory spool is exceeded
+ enum { TO_SPOOL_MEMORY = 1, TO_SPOOL_FILE = 2, FROM_SPOOL_MEMORY = 3, FROM_SPOOL_FILE = 4 } m_state; // TO_*: read from the request and record, FROM_*: replay after rewind( )
+};
+
+winhttp_buffer::winhttp_buffer( HINTERNET connect, HINTERNET request, size_t bufSize, size_t putBack, size_t spoolBufSize )
+	: m_connect( connect ), m_request( request ), m_putBack( max( putBack, size_t( 1 ) ) ),
+	m_buf( max( bufSize, m_putBack ) + m_putBack ), // sized from the clamped put back (m_putBack is declared before m_buf), so never empty
+	m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ), m_spoolBufSize( 0 ), m_state( TO_SPOOL_MEMORY )
+{
+	// start with an empty get area, so that the first read ends up in underflow( )
+	char *end = &m_buf.front( ) + m_buf.size( );
+	setg( end, end, end );
+}
+
+winhttp_buffer::~winhttp_buffer( )
+{
+	// Releases the spool. A spool held in memory is owned by a vector and
+	// needs no explicit cleanup, so only the file based states do work here.
+	const bool spooledToFile =
+		( m_state == TO_SPOOL_FILE ) || ( m_state == FROM_SPOOL_FILE );
+	if( !spooledToFile ) {
+		return;
+	}
+
+	// close before removing, the file may still be open for writing or reading
+	m_spoolFile.close( );
+	// NOTE(review): fixed file name used by every buffer - confirm only one can spool at a time
+	(void)remove( "/tmp/spool.tmp" );
+}
+
+streambuf::int_type winhttp_buffer::underflow( )
+{
+	// Refills the get area and returns the next character without consuming
+	// it, or traits_type::eof( ) when no more data is available or a read fails.
+	if( gptr( ) < egptr( ) )
+		return traits_type::to_int_type( *gptr( ) );
+
+	char *base = &m_buf.front( );
+	char *start = base;
+	// move put back away (only once the get area starts at base, i.e. not on the first fill)
+	if( eback( ) == base ) {
+		memmove( base, egptr( ) - m_putBack, m_putBack );
+		start += m_putBack;
+	}
+	const size_t room = m_buf.size( ) - ( start - base );
+
+	// read from source or spool (depends on calling rewind)
+	DWORD avail = 0;
+	DWORD n = 0;
+	switch( m_state ) {
+		case TO_SPOOL_MEMORY:
+		case TO_SPOOL_FILE:
+			if( !WinHttpQueryDataAvailable( m_request, &avail ) ) {
+				// TODO error handling
+				return traits_type::eof( );
+			}
+			if( avail == 0 ) {
+				return traits_type::eof( );
+			}
+			// never read more than fits behind the put back area
+			if( avail > room ) {
+				avail = static_cast<DWORD>( room );
+			}
+			if( !WinHttpReadData( m_request, start, avail, &n ) ) {
+				// TODO error handling
+				return traits_type::eof( );
+			}
+			// zero bytes read: end of response, the empty get area must not be dereferenced
+			if( n == 0 ) {
+				return traits_type::eof( );
+			}
+
+			if( m_state == TO_SPOOL_MEMORY ) {
+				// as long we can "spool" to memory, do so..
+				if( m_spoolBufSize + n <= m_spoolBuf.size( ) ) {
+					// overwrite the preallocated spool: insert( ) would grow the
+					// vector, so the limit checked above would never be hit
+					copy( start, start + n, m_spoolBuf.begin( ) + m_spoolBufPos );
+					m_spoolBufPos += n;
+					m_spoolBufSize += n;
+				} else {
+					// ..otherwise start spooling to disk, write current memory spool buffer first..
+					LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBuf.size( ) << ")";
+					m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc );
+					assert( m_spoolFile.good( ) );
+					m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize );
+					assert( m_spoolFile.good( ) );
+					m_state = TO_SPOOL_FILE;
+					m_spoolFile.write( start, n );
+					assert( m_spoolFile.good( ) );
+				}
+			} else {
+				// we are appending to the spool file
+				assert( m_spoolFile.good( ) );
+				m_spoolFile.write( start, n );
+				assert( m_spoolFile.good( ) );
+			}
+			break;
+
+		case FROM_SPOOL_MEMORY:
+			n = static_cast<DWORD>( min( room, m_spoolBufSize - m_spoolBufPos ) );
+			if( n == 0 ) {
+				return traits_type::eof( );
+			}
+			copy( m_spoolBuf.begin( ) + m_spoolBufPos,
+				m_spoolBuf.begin( ) + m_spoolBufPos + n,
+				m_buf.begin( ) + ( start - base ) );
+			m_spoolBufPos += n;
+			break;
+
+		case FROM_SPOOL_FILE:
+			n = static_cast<DWORD>( min( room, m_spoolBufSize - m_spoolBufPos ) );
+			if( n == 0 ) {
+				return traits_type::eof( );
+			}
+			m_spoolFile.read( start, n );
+			// short or failed read (e.g. the spool file could not be reopened)
+			if( !m_spoolFile ) {
+				return traits_type::eof( );
+			}
+			m_spoolBufPos += n;
+			break;
+	}
+
+	// set pointers
+	setg( base, start, start + n );
+	return traits_type::to_int_type( *gptr( ) );
+}
+
+void winhttp_buffer::rewind( )
+{
+	// Switches the buffer to replay mode (or restarts a replay): afterwards
+	// underflow( ) serves the spooled data again from its first byte.
+	if( m_state == TO_SPOOL_FILE ) {
+		// recording to disk so far: reopen the spool file for reading and
+		// take its length as the amount of data to replay
+		m_spoolFile.close( );
+		m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::in );
+		m_spoolFile.seekg( 0, ios::end );
+		m_spoolBufSize = m_spoolFile.tellg( );
+		m_spoolFile.seekg( 0, ios::beg );
+		m_state = FROM_SPOOL_FILE;
+	} else if( m_state == FROM_SPOOL_FILE ) {
+		// already replaying from disk: just go back to the start of the file
+		m_spoolFile.seekg( 0, ios::beg );
+	} else if( m_state == TO_SPOOL_MEMORY ) {
+		// recording to memory so far: the spool vector is replayed as it is
+		m_state = FROM_SPOOL_MEMORY;
+	}
+	// (FROM_SPOOL_MEMORY needs nothing but the position reset below)
+
+	// every state restarts reading at offset 0 of the spool
+	m_spoolBufPos = 0;
+
+	// empty get area: the next read has to go through underflow( )
+	char *bufEnd = &m_buf.front( ) + m_buf.size( );
+	setg( bufEnd, bufEnd, bufEnd );
+	// NOTE(review): seekpos( ) is not overridden in this class, so this
+	// presumably has no effect - confirm before relying on it
+	pubseekpos( 0, ios_base::in );
+}
+
+WinHttpRewindInputStream::WinHttpRewindInputStream( const URL &url, WinHttpFetcher *fetcher )
+	: RewindInputStream( url ), m_buf( 0 ), m_fetcher( fetcher ), m_connect( 0 ), m_request( 0 )
{
+	// connect, open the GET request, send it and wait for the response headers:
+	// WinHttpQueryDataAvailable/WinHttpReadData need WinHttpReceiveResponse to have succeeded
+	m_connect = WinHttpConnect( m_fetcher->session( ), s2ws( url.host( ) ).c_str( ),
+		INTERNET_DEFAULT_HTTP_PORT, 0 );
+	if( m_connect ) {
+		m_request = WinHttpOpenRequest( m_connect, L"GET", s2ws( url.path( ) ).c_str( ),
+			NULL, WINHTTP_NO_REFERER, NULL, NULL );
+	}
+	if( !m_request ||
+		!WinHttpSendRequest( m_request, WINHTTP_NO_ADDITIONAL_HEADERS, 0, WINHTTP_NO_REQUEST_DATA, 0, 0, 0 ) ||
+		!WinHttpReceiveResponse( m_request, NULL ) ) {
+		setstate( badbit ); // handles opened so far are closed by the destructor
+		return;
+	}
+	m_buf = new winhttp_buffer( m_connect, m_request );
+	rdbuf( m_buf );
}
WinHttpRewindInputStream::~WinHttpRewindInputStream( )
{
+	delete m_buf; // deleting a null pointer is a no-op, so no test is needed
+	if( m_request != 0 ) { WinHttpCloseHandle( m_request ); } // request handle before its connection
+	if( m_connect != 0 ) { WinHttpCloseHandle( m_connect ); }
}
void WinHttpRewindInputStream::rewind( )
{
+	// no buffer means the constructor failed: keep the error state, there is nothing to rewind
+	if( m_buf == 0 ) {
+		return;
+	}
+	// consume rest of web request, force spooling in streambuf
+	char chunk[1024];
+	while( good( ) ) {
+		read( chunk, sizeof( chunk ) );
+	}
+	ios::clear( ); // drop eofbit/failbit left by the draining loop before replaying
+	m_buf->rewind( );
+}
+
+std::string WinHttpRewindInputStream::lastErrMsg( ) const
+{
+ return getLastError( ); // NOTE(review): presumably the helper from win32/errormsg.hpp describing the last Win32 error - confirm it covers WinHTTP error codes
}
diff --git a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp
index 7c3acfb..7d06792 100755
--- a/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp
+++ b/src/modules/fetcher/winhttp/WinHttpRewindInputStream.hpp
@@ -8,15 +8,24 @@
#include <windows.h>
#include <winhttp.h>
+class winhttp_buffer;
+class WinHttpFetcher;
+
class WinHttpRewindInputStream : public RewindInputStream
{
public:
- WinHttpRewindInputStream( const URL &url );
+ WinHttpRewindInputStream( const URL &url, WinHttpFetcher *fetcher ); // fetcher supplies the WinHTTP session and is dereferenced without a check, so it must not be null
virtual ~WinHttpRewindInputStream( );
virtual void rewind( );
+
+ virtual std::string lastErrMsg( ) const;
private:
+ winhttp_buffer *m_buf; // created in the constructor, deleted in the destructor; stays 0 if the setup failed
+ WinHttpFetcher *m_fetcher; // not owned, only used to reach the session
+ HINTERNET m_connect; // connection handle, closed in the destructor
+ HINTERNET m_request; // request handle, closed in the destructor
};
#endif
diff --git a/src/win32/stringutils.cpp b/src/win32/stringutils.cpp
new file mode 100755
index 0000000..a82dd7a
--- /dev/null
+++ b/src/win32/stringutils.cpp
@@ -0,0 +1,21 @@
+#include "errormsg.hpp"
+#include "stringutils.hpp"
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+// Converts from the ANSI code page (CP_ACP) to wide characters; empty result for empty input or on failure.
+std::wstring s2ws( const std::string &s )
+{
+	if( s.empty( ) ) return std::wstring( );
+
+	// query the number of wide characters needed (exact length passed, so no terminator)
+	const int slength = static_cast<int>( s.length( ) );
+	const int len = MultiByteToWideChar( CP_ACP, 0, s.data( ), slength, 0, 0 );
+	if( len <= 0 ) return std::wstring( );
+
+	// convert straight into the result, no manual new[]/delete[] that could leak
+	std::wstring res( static_cast<size_t>( len ), L'\0' );
+	MultiByteToWideChar( CP_ACP, 0, s.data( ), slength, &res[0], len );
+	return res;
+}
diff --git a/src/win32/stringutils.hpp b/src/win32/stringutils.hpp
new file mode 100755
index 0000000..6d4bd80
--- /dev/null
+++ b/src/win32/stringutils.hpp
@@ -0,0 +1,8 @@
+#ifndef WIN32_STRINGUTILS_HPP
+#define WIN32_STRINGUTILS_HPP
+
+#include <string>
+
+std::wstring s2ws( const std::string &s ); // converts from the ANSI code page (CP_ACP) to wide characters
+
+#endif