summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Andreas Baumann <mail@andreasbaumann.cc> 2014-10-03 17:58:36 +0200
committer Andreas Baumann <mail@andreasbaumann.cc> 2014-10-03 17:58:36 +0200
commit 92ba06d58475fd4ab07d8e3b1efa6993f1f02340 (patch)
tree 385a511835136fb2d190df05651b03c015690e91
parent ee52b3eab8cc7feb49fa6db964b94b35e2bc8bac (diff)
download crawler-92ba06d58475fd4ab07d8e3b1efa6993f1f02340.tar.gz
crawler-92ba06d58475fd4ab07d8e3b1efa6993f1f02340.tar.bz2
added an experimental curl fetcher
-rwxr-xr-x include/crawler/SpoolRewindInputStream.hpp 2
-rw-r--r-- makefiles/gmake/help.mk 1
-rw-r--r-- makefiles/gmake/platform.mk 56
-rwxr-xr-x src/crawl/crawl.cpp 6
-rw-r--r-- src/libcrawler/SpoolRewindInputStream.cpp 82
-rw-r--r-- src/modules/fetcher/GNUmakefile 2
-rw-r--r-- tests/GNUmakefile 6
-rw-r--r-- tests/libcurlpp/GNUmakefile (renamed from tests/curl/GNUmakefile) 0
-rw-r--r-- tests/libcurlpp/README (renamed from tests/curl/README) 0
-rwxr-xr-x tests/libcurlpp/exec_test (renamed from tests/curl/exec_test) 0
-rw-r--r-- tests/libcurlpp/test1.MUST (renamed from tests/curl/test1.MUST) 0
-rw-r--r-- tests/libcurlpp/test1.cpp (renamed from tests/curl/test1.cpp) 0
-rw-r--r-- tests/libcurlpp/test2.MUST (renamed from tests/curl/test2.MUST) 0
-rw-r--r-- tests/libcurlpp/test2.cpp (renamed from tests/curl/test2.cpp) 2
14 files changed, 128 insertions, 29 deletions
diff --git a/include/crawler/SpoolRewindInputStream.hpp b/include/crawler/SpoolRewindInputStream.hpp
index f065271..523c1b6 100755
--- a/include/crawler/SpoolRewindInputStream.hpp
+++ b/include/crawler/SpoolRewindInputStream.hpp
@@ -18,9 +18,11 @@ class spool_streambuf : public std::streambuf
protected:
CRAWLER_DLL_VISIBLE virtual std::streambuf::int_type readFromSource( ) = 0;
+ CRAWLER_DLL_VISIBLE std::streambuf::int_type spoolSourceData( char *data, size_t n );
private:
CRAWLER_DLL_VISIBLE int_type underflow( );
+ CRAWLER_DLL_VISIBLE void spoolData( size_t n );
private:
const size_t m_putBack;
diff --git a/makefiles/gmake/help.mk b/makefiles/gmake/help.mk
index c8d855e..db834a1 100644
--- a/makefiles/gmake/help.mk
+++ b/makefiles/gmake/help.mk
@@ -45,6 +45,7 @@ WITH_ICU=1 enable ICU support for URL parsing in Google URL
scripting support:
WITH_LUA=1 use Lua for configuration and scripting
+WITH_CURL=1 use libcurl and libcurlpp and generate a Curl fetcher
Some more obscure options:
diff --git a/makefiles/gmake/platform.mk b/makefiles/gmake/platform.mk
index ad29eb9..8cafc3d 100644
--- a/makefiles/gmake/platform.mk
+++ b/makefiles/gmake/platform.mk
@@ -102,7 +102,7 @@ EXE =
endif
# extensions for shared libraries
-# (TOOD: HP/Unix has .shlib, Mac/X has .lib, but we can't test it currently)
+# (TODO: HP/Unix has .shlib, Mac/X has .dylib, but we can't test it currently)
SO = .so
# name if the installation program
@@ -304,3 +304,57 @@ endif
endif
endif
+
+# curl and curlpp
+#################
+# Per-platform defaults for the libcurl/libcurlpp build flags; ?= keeps any value that is already set.
+ifeq ($(WITH_CURL),1)
+# Nothing below is defined unless WITH_CURL=1 is given.
+ifeq "$(PLATFORM)" "LINUX"
+# Linux: include and linker search paths default to empty, only the libraries are named.
+CURL_INCLUDES ?=
+CURL_LDFLAGS ?=
+CURL_LIBS ?= -lcurl -lcurlpp
+
+endif
+# SunOS: only version 5.10 gets defaults (under /opt/csw).
+ifeq "$(PLATFORM)" "SUNOS"
+ifeq "$(OS_MAJOR_VERSION)" "5"
+ifeq "$(OS_MINOR_VERSION)" "10"
+CURL_INCLUDES ?= -I/opt/csw/include
+CURL_LDFLAGS ?= -L/opt/csw/lib
+CURL_LIBS ?= -lcurl -lcurlpp
+endif
+endif
+endif
+# FreeBSD: major versions 8 and 9 get the same defaults (under /usr/local).
+ifeq "$(PLATFORM)" "FREEBSD"
+ifeq "$(OS_MAJOR_VERSION)" "8"
+CURL_INCLUDES ?= -I/usr/local/include
+CURL_LDFLAGS ?= -L/usr/local/lib
+CURL_LIBS ?= -lcurl -lcurlpp
+endif
+ifeq "$(OS_MAJOR_VERSION)" "9"
+CURL_INCLUDES ?= -I/usr/local/include
+CURL_LDFLAGS ?= -L/usr/local/lib
+CURL_LIBS ?= -lcurl -lcurlpp
+endif
+endif
+# OpenBSD: only major version 5 gets defaults (under /usr/local).
+ifeq "$(PLATFORM)" "OPENBSD"
+ifeq "$(OS_MAJOR_VERSION)" "5"
+CURL_INCLUDES ?= -I/usr/local/include
+CURL_LDFLAGS ?= -L/usr/local/lib
+CURL_LIBS ?= -lcurl -lcurlpp
+endif
+endif
+# NetBSD: only major version 5 gets defaults (under /usr/pkg).
+ifeq "$(PLATFORM)" "NETBSD"
+ifeq "$(OS_MAJOR_VERSION)" "5"
+CURL_INCLUDES ?= -I/usr/pkg/include
+CURL_LDFLAGS ?= -L/usr/pkg/lib
+CURL_LIBS ?= -lcurl -lcurlpp
+endif
+endif
+# NOTE(review): platforms/versions not matched above get no CURL_* defaults here; the caller presumably has to supply them.
+endif
diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp
index 4f3eb00..9f5e0b2 100755
--- a/src/crawl/crawl.cpp
+++ b/src/crawl/crawl.cpp
@@ -120,7 +120,8 @@ int main( int /* argc */, char *argv[] )
vector<string> fetcherModules;
#ifndef _WIN32
- fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" );
+// fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" );
+ fetcherModules.push_back( "./modules/fetcher/libcurl/mod_fetcher_libcurl.so" );
#else
fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" );
#endif
@@ -162,7 +163,8 @@ int main( int /* argc */, char *argv[] )
Frontier *frontier = frontiers.create( "memory_frontier" );
#ifndef _WIN32
- Fetcher *fetcher = fetchers.create( "libfetch_fetcher" );
+// Fetcher *fetcher = fetchers.create( "libfetch_fetcher" );
+ Fetcher *fetcher = fetchers.create( "libcurl_fetcher" );
#else
Fetcher *fetcher = fetchers.create( "winhttp_fetcher" );
#endif
diff --git a/src/libcrawler/SpoolRewindInputStream.cpp b/src/libcrawler/SpoolRewindInputStream.cpp
index 9135741..13ab105 100644
--- a/src/libcrawler/SpoolRewindInputStream.cpp
+++ b/src/libcrawler/SpoolRewindInputStream.cpp
@@ -34,6 +34,61 @@ spool_streambuf::~spool_streambuf( )
}
}
+streambuf::int_type spool_streambuf::spoolSourceData( char *data, size_t n ) // copies at most n bytes from data into m_buf, spools them and returns the number of bytes taken
+{
+ size_t data_len = m_buf.size( ) - ( m_start - m_base ) ; // room left in m_buf, measured from the current m_start offset
+ if( n < data_len ) { // never take more than the caller offers
+ data_len = n;
+ }
+ // NOTE(review): the room above is computed before the pointers are reset below; after the reset the whole of m_buf would be usable - confirm this is intended
+ m_base = &m_buf.front( ); // both pointers are moved to the first byte of m_buf
+ m_start = m_base;
+ // NOTE(review): when n exceeds data_len the remaining input is not copied; callers presumably must check the return value - verify
+ memcpy( m_start, data, data_len );
+ // record the freshly copied bytes in the memory/file spool (see spoolData)
+ spoolData( data_len );
+ // the byte count (size_t) is converted to int_type on return
+ return data_len;
+}
+
+void spool_streambuf::spoolData( size_t n ) // appends the n bytes starting at m_start to the spool; throws logic_error in any state other than TO_SPOOL_MEMORY/TO_SPOOL_FILE
+{
+ switch( m_state ) {
+ case TO_SPOOL_MEMORY:
+ // as long as we can "spool" to memory, do so..
+ if( m_spoolBufPos + n <= m_spoolBufSize ) {
+ m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, m_start, m_start + n ); // inserts the new bytes at offset m_spoolBufPos
+ m_spoolBufPos += n;
+ } else {
+ // ..otherwise start spooling to disk: write the
+ // current memory spool buffer first, then the new data..
+ LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBufSize << ")";
+ m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc ); // NOTE(review): fixed path, truncated on open; several streams would presumably share it - verify
+ assert( m_spoolFile.good( ) ); // NOTE(review): assert is compiled out under NDEBUG, so I/O errors would then go unchecked
+ m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize ); // NOTE(review): writes m_spoolBufSize bytes, not m_spoolBufPos - confirm m_spoolBuf always holds that many
+ assert( m_spoolFile.good( ) );
+ m_state = TO_SPOOL_FILE; // all later calls take the TO_SPOOL_FILE branch
+ m_spoolFile.write( m_start, n );
+ assert( m_spoolFile.good( ) );
+ }
+ break;
+
+ case TO_SPOOL_FILE:
+ // we are appending to the spool file
+ assert( m_spoolFile.good( ) );
+ m_spoolFile.write( m_start, n );
+ assert( m_spoolFile.good( ) );
+ break;
+
+ case FROM_SPOOL_MEMORY: // the two FROM_* states share one handler: spooling new data is rejected
+ case FROM_SPOOL_FILE:
+ throw logic_error( "Still getting data from source after rewind!" );
+
+ default: // any other m_state value is rejected
+ throw logic_error( "Illegal state!" );
+ }
+}
+
streambuf::int_type spool_streambuf::underflow( )
{
// check if buffer is exhausted, if not, return current character
@@ -62,29 +117,7 @@ streambuf::int_type spool_streambuf::underflow( )
return traits_type::eof( );
}
- if( m_state == TO_SPOOL_MEMORY ) {
- // as long we can "spool" to memory, do so..
- if( m_spoolBufPos + n <= m_spoolBufSize ) {
- m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, m_start, m_start + n );
- m_spoolBufPos += n;
- } else {
- // ..otherwise start spooling to disk, write
- // current memory spool buffer first..
- LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBufSize << ")";
- m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc );
- assert( m_spoolFile.good( ) );
- m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize );
- assert( m_spoolFile.good( ) );
- m_state = TO_SPOOL_FILE;
- m_spoolFile.write( m_start, n );
- assert( m_spoolFile.good( ) );
- }
- } else {
- // we are appending to the spool file
- assert( m_spoolFile.good( ) );
- m_spoolFile.write( m_start, n );
- assert( m_spoolFile.good( ) );
- }
+ spoolData( n );
break;
@@ -115,6 +148,9 @@ streambuf::int_type spool_streambuf::underflow( )
}
break;
+
+ default:
+ throw logic_error( "Illegal state!" );
}
// set pointers
diff --git a/src/modules/fetcher/GNUmakefile b/src/modules/fetcher/GNUmakefile
index 89dfe93..6be8eaa 100644
--- a/src/modules/fetcher/GNUmakefile
+++ b/src/modules/fetcher/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = ../../..
-SUBDIRS = libfetch file
+SUBDIRS = libfetch file libcurl
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/tests/GNUmakefile b/tests/GNUmakefile
index a21e409..ae574dd 100644
--- a/tests/GNUmakefile
+++ b/tests/GNUmakefile
@@ -1,9 +1,13 @@
TOPDIR = ..
SUBDIRS = \
- utils logger modules url streamhtmlparser libfetch curl psql sqlite typedetect \
+ utils logger modules url streamhtmlparser libfetch psql sqlite typedetect \
fetcher textwolf
+ifeq ($(WITH_CURL),1)
+SUBDIRS += libcurl
+endif
+
-include $(TOPDIR)/makefiles/gmake/sub.mk
local_all:
diff --git a/tests/curl/GNUmakefile b/tests/libcurlpp/GNUmakefile
index de0462a..de0462a 100644
--- a/tests/curl/GNUmakefile
+++ b/tests/libcurlpp/GNUmakefile
diff --git a/tests/curl/README b/tests/libcurlpp/README
index 747e9a6..747e9a6 100644
--- a/tests/curl/README
+++ b/tests/libcurlpp/README
diff --git a/tests/curl/exec_test b/tests/libcurlpp/exec_test
index fce8214..fce8214 100755
--- a/tests/curl/exec_test
+++ b/tests/libcurlpp/exec_test
diff --git a/tests/curl/test1.MUST b/tests/libcurlpp/test1.MUST
index 08839f6..08839f6 100644
--- a/tests/curl/test1.MUST
+++ b/tests/libcurlpp/test1.MUST
diff --git a/tests/curl/test1.cpp b/tests/libcurlpp/test1.cpp
index d10ef02..d10ef02 100644
--- a/tests/curl/test1.cpp
+++ b/tests/libcurlpp/test1.cpp
diff --git a/tests/curl/test2.MUST b/tests/libcurlpp/test2.MUST
index 08839f6..08839f6 100644
--- a/tests/curl/test2.MUST
+++ b/tests/libcurlpp/test2.MUST
diff --git a/tests/curl/test2.cpp b/tests/libcurlpp/test2.cpp
index 748da14..983f1bd 100644
--- a/tests/curl/test2.cpp
+++ b/tests/libcurlpp/test2.cpp
@@ -60,7 +60,7 @@ int main( int argc, char *argv[] ) {
MemoryWriter w;
if( argc != 2 ) {
- cerr << "usage: test1 <url>" << endl;
+ cerr << "usage: test2 <url>" << endl;
return EXIT_FAILURE;
}
url = argv[1];