summaryrefslogtreecommitdiff
path: root/src/libcrawler
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-09-06 22:18:23 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-09-06 22:18:23 +0200
commit13fc9a7da5111f4ddba942d3c6b6b8654ce395d6 (patch)
treee86210e3d939911e35f930a6dc73c3ebb591243b /src/libcrawler
parentf5c586f7231f7e033c5528bcefea357e4e64441c (diff)
downloadcrawler-13fc9a7da5111f4ddba942d3c6b6b8654ce395d6.tar.gz
crawler-13fc9a7da5111f4ddba942d3c6b6b8654ce395d6.tar.bz2
more splitting into libcrawl, crawl binary
moved more public headers to 'include'; changed approach for dynamic linking on Windows
Diffstat (limited to 'src/libcrawler')
-rwxr-xr-xsrc/libcrawler/GNUmakefile42
-rw-r--r--src/libcrawler/MIMEType.cpp5
-rwxr-xr-xsrc/libcrawler/Makefile.W3245
-rw-r--r--src/libcrawler/SpoolRewindInputStream.cpp181
-rw-r--r--src/libcrawler/URL.cpp5
-rwxr-xr-xsrc/libcrawler/win32/errormsg.cpp27
-rwxr-xr-xsrc/libcrawler/win32/stringutils.cpp21
7 files changed, 326 insertions, 0 deletions
diff --git a/src/libcrawler/GNUmakefile b/src/libcrawler/GNUmakefile
new file mode 100755
index 0000000..c1e7a7f
--- /dev/null
+++ b/src/libcrawler/GNUmakefile
@@ -0,0 +1,42 @@
+# GNU make description of the crawler library: lists the objects that go
+# into the static library libcrawler.a and the shared library libcrawler.so.
+# The actual rules come from the included framework makefiles (not visible
+# here).
+TOPDIR = ../..
+
+# no subdirectories are listed for this directory
+SUBDIRS =
+
+# '-include' silently continues if the file does not exist
+-include $(TOPDIR)/makefiles/gmake/platform.mk
+
+INCLUDE_CPPFLAGS = \
+
+INCLUDE_LDFLAGS = \
+
+# header search path
+# NOTE(review): Makefile.W32 additionally searches include/crawler - confirm
+# whether that directory is needed here as well
+INCLUDE_DIRS = \
+ -I. \
+ -I$(TOPDIR)/include/logger \
+ -I$(TOPDIR)/include/util
+
+INCLUDE_LIBS = \
+
+STATIC_LIB = libcrawler.a
+
+# shared library name and its version components (0.0.0)
+DYNAMIC_LIB = libcrawler.so
+DYNAMIC_LIB_MAJOR = 0
+DYNAMIC_LIB_MINOR = 0
+DYNAMIC_LIB_PATCH = 0
+
+# object files making up the library
+CPP_OBJS = \
+ URL.o \
+ MIMEType.o \
+ SpoolRewindInputStream.o
+
+-include $(TOPDIR)/makefiles/gmake/sub.mk
+
+# local targets without any actions of their own
+# (presumably hooks invoked by sub.mk - confirm)
+local_all:
+
+local_clean:
+
+local_distclean:
+
+local_install:
+
+local_uninstall:
+
+local_test:
diff --git a/src/libcrawler/MIMEType.cpp b/src/libcrawler/MIMEType.cpp
new file mode 100644
index 0000000..25dc20c
--- /dev/null
+++ b/src/libcrawler/MIMEType.cpp
@@ -0,0 +1,5 @@
+#include "MIMEType.hpp"
+
+// definition of the static member MIMEType::Null (default initialized)
+MIMEType MIMEType::Null;
+
+
diff --git a/src/libcrawler/Makefile.W32 b/src/libcrawler/Makefile.W32
new file mode 100755
index 0000000..ab18d2c
--- /dev/null
+++ b/src/libcrawler/Makefile.W32
@@ -0,0 +1,45 @@
+# NMAKE description of the crawler library on Windows: compiles the sources
+# to .dllobj objects and links them into crawler.dll.
+TOPDIR = ..\..
+
+# no subdirectories are listed for this directory
+SUBDIRS =
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
+
+# BUILDING_CRAWLER is defined while compiling the DLL itself
+# NOTE(review): 0x504 is not one of the documented _WIN32_WINNT values
+# (0x0501 = Windows XP, 0x0502 = Windows Server 2003) - confirm the intent
+INCLUDE_CXXFLAGS = \
+ /D_WIN32_WINNT=0x504 \
+ /DBUILDING_CRAWLER
+
+# header search path
+INCLUDE_DIRS = \
+ /I. \
+ /I$(TOPDIR)\include\crawler \
+ /I$(TOPDIR)\include\logger \
+ /I$(TOPDIR)\include\util
+
+INCLUDE_LDFLAGS = \
+
+# libraries the DLL is linked against
+INCLUDE_LIBS = \
+ $(TOPDIR)\src\logger\logger.lib
+
+# object files making up the DLL
+CPP_OBJS = \
+ win32\errormsg.dllobj \
+ win32\stringutils.dllobj \
+ URL.dllobj \
+ MIMEType.dllobj \
+ SpoolRewindInputStream.dllobj
+
+DYNAMIC_LIB = \
+ crawler.dll
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+# Link with $** (all dependents). $? expands only to the dependents that are
+# newer than the target, so an incremental rebuild would hand the linker an
+# incomplete list of objects.
+$(DYNAMIC_LIB): $(CPP_OBJS)
+	$(LINK) /nologo /dll /out:$@ $(LDFLAGS) $(LIBS) $**
+
+local_all: $(DYNAMIC_LIB)
+
+# remove the DLL and the objects in the win32 subdirectory; the objects of
+# this makefile carry the extension .dllobj, the *.obj pattern is kept for
+# backward compatibility
+local_clean:
+	@-erase $(DYNAMIC_LIB) 2>NUL
+	@-erase win32\*.obj 2>NUL
+	@-erase win32\*.dllobj 2>NUL
+
+local_distclean:
+
+local_test:
diff --git a/src/libcrawler/SpoolRewindInputStream.cpp b/src/libcrawler/SpoolRewindInputStream.cpp
new file mode 100644
index 0000000..9135741
--- /dev/null
+++ b/src/libcrawler/SpoolRewindInputStream.cpp
@@ -0,0 +1,181 @@
+#include "SpoolRewindInputStream.hpp"
+#include "Logger.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <cassert>
+
+using namespace std;
+
+/**
+ * Creates a stream buffer which spools everything it reads so that the
+ * data can be replayed after rewind( ).
+ *
+ * @param bufSize      requested size of the read buffer
+ * @param putBack      requested size of the put back area, at least one
+ *                     character is always reserved
+ * @param spoolBufSize size of the memory spool buffer; data exceeding it is
+ *                     spooled to a file (see underflow( ))
+ */
+spool_streambuf::spool_streambuf( size_t bufSize, size_t putBack, size_t spoolBufSize )
+	: m_putBack( max( putBack, size_t( 1 ) ) ),
+	m_spoolBuf( spoolBufSize ), m_spoolBufPos( 0 ),
+	m_spoolBufSize( spoolBufSize ), m_state( TO_SPOOL_MEMORY ),
+	// size the buffer with the effective put back size (>= 1), not with the
+	// raw parameter: this reserves room for the put back area also for
+	// putBack == 0 and guarantees a non-empty buffer, so front( ) is valid.
+	// The expression is repeated instead of reading m_putBack in order not
+	// to depend on the declaration order of the members.
+	m_buf( max( bufSize, max( putBack, size_t( 1 ) ) ) + max( putBack, size_t( 1 ) ) ),
+	m_base( 0 ), m_start( 0 )
+{
+	// empty get area at the end of the buffer: the first read calls underflow( )
+	char *end = &m_buf.front( ) + m_buf.size( );
+	setg( end, end, end );
+}
+
+/**
+ * Closes and deletes the spool file if data has been spooled to disk. When
+ * only the memory spool buffer was used there is nothing to clean up.
+ */
+spool_streambuf::~spool_streambuf( )
+{
+	const bool spooledToFile =
+		( m_state == TO_SPOOL_FILE || m_state == FROM_SPOOL_FILE );
+
+	if( !spooledToFile ) {
+		// memory only
+		return;
+	}
+
+	m_spoolFile.close( );
+	// the result is ignored on purpose, nothing can be done about it here
+	(void)remove( "/tmp/spool.tmp" );
+}
+
+/**
+ * Refills the get area when it is exhausted.
+ *
+ * While spooling (TO_SPOOL_MEMORY, TO_SPOOL_FILE) the data is read from the
+ * source with readFromSource( ) and a copy of it is stored in the memory
+ * spool buffer, or in the spool file as soon as the memory spool buffer is
+ * too small. After rewind( ) (FROM_SPOOL_MEMORY, FROM_SPOOL_FILE) the data
+ * is replayed from the spool.
+ *
+ * NOTE(review): the spool file has a fixed name, so two instances spooling
+ * to disk at the same time use the same file.
+ *
+ * @return the current character, or traits_type::eof( ) if no more data is
+ *         available
+ */
+streambuf::int_type spool_streambuf::underflow( )
+{
+	// get area not exhausted yet: return the current character
+	if( gptr( ) < egptr( ) )
+		return traits_type::to_int_type( *gptr( ) );
+
+	m_base = &m_buf.front( );
+	m_start = m_base;
+
+	// keep the most recent characters in front of the new data so that they
+	// can be put back; never more than the old get area contained (a short
+	// read can deliver less than m_putBack characters, copying m_putBack
+	// characters would then read before the start of the buffer)
+	if( eback( ) == m_base ) {
+		const size_t keep = min<size_t>( m_putBack, egptr( ) - eback( ) );
+		memmove( m_base, egptr( ) - keep, keep );
+		m_start += keep;
+	}
+
+	// read from source or spool (depends on calling rewind)
+	streambuf::int_type n;
+	switch( m_state ) {
+		case TO_SPOOL_MEMORY:
+		case TO_SPOOL_FILE:
+			n = readFromSource( );
+			if( n <= 0 ) {
+				// 0: no more data, negative: error
+				// TODO handle error
+				return traits_type::eof( );
+			}
+
+			if( m_state == TO_SPOOL_MEMORY && m_spoolBufPos + n <= m_spoolBufSize ) {
+				// as long we can "spool" to memory, do so; the spool
+				// buffer already has its full size, so the data is copied
+				// into place instead of being inserted (which would grow
+				// the buffer beyond the configured size)
+				copy( m_start, m_start + n, m_spoolBuf.begin( ) + m_spoolBufPos );
+				m_spoolBufPos += n;
+			} else {
+				if( m_state == TO_SPOOL_MEMORY ) {
+					// start spooling to disk, write the current content
+					// of the memory spool buffer first
+					LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBufSize << ")";
+					m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc );
+					assert( m_spoolFile.good( ) );
+					// only the first m_spoolBufPos bytes of the spool
+					// buffer contain data
+					if( m_spoolBufPos > 0 ) {
+						m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufPos );
+						assert( m_spoolFile.good( ) );
+					}
+					m_state = TO_SPOOL_FILE;
+				}
+
+				// append the new data to the spool file
+				assert( m_spoolFile.good( ) );
+				m_spoolFile.write( m_start, n );
+				assert( m_spoolFile.good( ) );
+			}
+
+			break;
+
+		case FROM_SPOOL_MEMORY:
+			// limited by the free space in the buffer and the rest of the spool
+			n = min( m_buf.size( ) - ( m_start - m_base ), m_spoolBufSize - m_spoolBufPos );
+			if( n == 0 ) {
+				return traits_type::eof( );
+			}
+
+			copy( m_spoolBuf.begin( ) + m_spoolBufPos,
+				m_spoolBuf.begin( ) + m_spoolBufPos + n,
+				m_buf.begin( ) + ( m_start - m_base ) );
+
+			m_spoolBufPos += n;
+
+			break;
+
+		case FROM_SPOOL_FILE:
+			// limited by the free space in the buffer and the rest of the file
+			n = min( m_buf.size( ) - ( m_start - m_base ), m_spoolBufSize - m_spoolBufPos );
+			if( n == 0 ) {
+				return traits_type::eof( );
+			}
+
+			m_spoolFile.read( m_start, n );
+			// trust the number of characters really read
+			n = static_cast<streambuf::int_type>( m_spoolFile.gcount( ) );
+			if( n == 0 ) {
+				return traits_type::eof( );
+			}
+			m_spoolBufPos += n;
+
+			break;
+	}
+
+	// set pointers
+	setg( m_base, m_start, m_start + n );
+
+	return traits_type::to_int_type( *gptr( ) );
+}
+
+/**
+ * Switches from spooling to replaying (first call) or restarts the replay
+ * (further calls). Afterwards the get area is empty, so the next read
+ * refills the buffer from the beginning of the spool.
+ *
+ * From the first call on m_spoolBufSize holds the number of spooled bytes
+ * (this is what underflow( ) uses as the end of the spool).
+ */
+void spool_streambuf::rewind( )
+{
+	switch( m_state ) {
+		case TO_SPOOL_MEMORY:
+			// remember how much data has been spooled, otherwise the
+			// replay would run up to the capacity of the spool buffer
+			// and deliver bytes which were never read from the source
+			m_spoolBufSize = m_spoolBufPos;
+			m_spoolBufPos = 0;
+			m_state = FROM_SPOOL_MEMORY;
+			break;
+
+		case TO_SPOOL_FILE: {
+			// reopen the spool file for reading and determine its size
+			m_spoolFile.close( );
+			m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::in );
+			m_spoolFile.seekg( 0, ios::end );
+			const streampos fileSize = m_spoolFile.tellg( );
+			// tellg( ) returns -1 on failure, treat this as an empty spool
+			// instead of using the value as a (huge) size
+			m_spoolBufSize = ( fileSize == streampos( -1 ) )
+				? 0 : static_cast<streamoff>( fileSize );
+			m_spoolFile.seekg( 0, ios::beg );
+			m_spoolBufPos = 0;
+			m_state = FROM_SPOOL_FILE;
+			break;
+		}
+
+		case FROM_SPOOL_MEMORY:
+			m_spoolBufPos = 0;
+			break;
+
+		case FROM_SPOOL_FILE:
+			m_spoolBufPos = 0;
+			m_spoolFile.seekg( 0, ios::beg );
+			break;
+	}
+
+	// empty get area: forces underflow( ) on the next read
+	char *end = &m_buf.front( ) + m_buf.size( );
+	setg( end, end, end );
+	pubseekpos( 0, ios_base::in );
+}
+
+// Passes the URL on to the RewindInputStream base class. The stream buffer
+// pointer m_buf starts as a null pointer; rewind( ) asserts that it has been
+// set before it is used.
+SpoolRewindInputStream::SpoolRewindInputStream( const URL &url )
+ : RewindInputStream( url ), m_buf( 0 )
+{
+}
+
+// Empty destructor: m_buf is not deleted here.
+SpoolRewindInputStream::~SpoolRewindInputStream( )
+{
+}
+
+/**
+ * Rewinds the stream to its beginning.
+ *
+ * The rest of the input is read and thrown away first, so that the stream
+ * buffer has spooled all the data. Then the stream state is cleared and
+ * the stream buffer is rewound.
+ */
+void SpoolRewindInputStream::rewind( )
+{
+	char chunk[1024];
+
+	// good( ) is false as soon as any state bit is set, eofbit included
+	while( good( ) ) {
+		read( chunk, sizeof( chunk ) );
+	}
+
+	// reset the eof/fail state caused by reading up to the end
+	ios::clear( );
+
+	assert( m_buf != 0 );
+	m_buf->rewind( );
+}
diff --git a/src/libcrawler/URL.cpp b/src/libcrawler/URL.cpp
new file mode 100644
index 0000000..f208500
--- /dev/null
+++ b/src/libcrawler/URL.cpp
@@ -0,0 +1,5 @@
+#include "URL.hpp"
+
+// definition of the static member URL::Null (default initialized)
+URL URL::Null;
+
+
diff --git a/src/libcrawler/win32/errormsg.cpp b/src/libcrawler/win32/errormsg.cpp
new file mode 100755
index 0000000..c0a65d8
--- /dev/null
+++ b/src/libcrawler/win32/errormsg.cpp
@@ -0,0 +1,27 @@
+#include "win32/errormsg.hpp"
+
+using namespace std;
+
+// the macro evaluated by <windows.h> is WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+/**
+ * Returns the system message text for the calling thread's last error
+ * code (GetLastError).
+ *
+ * @return the message, or "<no message available>" if the system cannot
+ *         provide a text for the error code
+ */
+string getLastError( )
+{
+	// fetch the error code first, before any other API call can change it
+	const DWORD lastErr = GetLastError( );
+
+	// The ANSI variant is called explicitly as the result is a narrow string.
+	// With FORMAT_MESSAGE_ALLOCATE_BUFFER the function allocates the buffer
+	// and stores its address in 'buf', this is why the address of the
+	// pointer is passed in place of a buffer.
+	LPSTR buf = NULL;
+	const DWORD len = FormatMessageA(
+		FORMAT_MESSAGE_ALLOCATE_BUFFER |
+		FORMAT_MESSAGE_FROM_SYSTEM |
+		FORMAT_MESSAGE_IGNORE_INSERTS |
+		FORMAT_MESSAGE_MAX_WIDTH_MASK,
+		NULL, lastErr, 0, reinterpret_cast<LPSTR>( &buf ),
+		0, NULL );
+	if( len == 0 || buf == NULL ) {
+		return "<no message available>";
+	}
+
+	// copy the message, then release the system allocated buffer (it was
+	// leaked before)
+	const string msg( buf, len );
+	LocalFree( buf );
+
+	return msg;
+}
+
diff --git a/src/libcrawler/win32/stringutils.cpp b/src/libcrawler/win32/stringutils.cpp
new file mode 100755
index 0000000..607735c
--- /dev/null
+++ b/src/libcrawler/win32/stringutils.cpp
@@ -0,0 +1,21 @@
+#include "win32/stringutils.hpp"
+
+using namespace std;
+
+// the macro evaluated by <windows.h> is WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+/**
+ * Converts a string in the system ANSI code page (CP_ACP) to a wide
+ * character string.
+ *
+ * As before, the conversion ends at the first NUL character in 's'.
+ *
+ * @param s string to convert
+ * @return the converted string; an empty string if 's' is empty or cannot
+ *         be converted
+ */
+std::wstring s2ws( const std::string &s )
+{
+	// get size for buffer; with a length of -1 the input is processed up to
+	// its terminating NUL and the returned size includes the terminator
+	const int len = MultiByteToWideChar( CP_ACP, 0, s.c_str( ), -1, NULL, 0 );
+	if( len <= 1 ) {
+		// 0: conversion failed, 1: only the terminator
+		return std::wstring( );
+	}
+
+	// convert directly into the result string
+	std::wstring res( static_cast<size_t>( len ), L'\0' );
+	const int written = MultiByteToWideChar( CP_ACP, 0, s.c_str( ), -1, &res[0], len );
+	if( written <= 0 ) {
+		return std::wstring( );
+	}
+
+	// drop the terminator written by the conversion
+	res.resize( static_cast<size_t>( written ) - 1 );
+	return res;
+}