diff options
author | Andreas Baumann <mail@andreasbaumann.cc> | 2014-10-03 17:58:36 +0200 |
---|---|---|
committer | Andreas Baumann <mail@andreasbaumann.cc> | 2014-10-03 17:58:36 +0200 |
commit | 92ba06d58475fd4ab07d8e3b1efa6993f1f02340 (patch) | |
tree | 385a511835136fb2d190df05651b03c015690e91 | |
parent | ee52b3eab8cc7feb49fa6db964b94b35e2bc8bac (diff) | |
download | crawler-92ba06d58475fd4ab07d8e3b1efa6993f1f02340.tar.gz crawler-92ba06d58475fd4ab07d8e3b1efa6993f1f02340.tar.bz2 |
added an experimental curl fetcher
-rwxr-xr-x | include/crawler/SpoolRewindInputStream.hpp | 2 | ||||
-rw-r--r-- | makefiles/gmake/help.mk | 1 | ||||
-rw-r--r-- | makefiles/gmake/platform.mk | 56 | ||||
-rwxr-xr-x | src/crawl/crawl.cpp | 6 | ||||
-rw-r--r-- | src/libcrawler/SpoolRewindInputStream.cpp | 82 | ||||
-rw-r--r-- | src/modules/fetcher/GNUmakefile | 2 | ||||
-rw-r--r-- | tests/GNUmakefile | 6 | ||||
-rw-r--r-- | tests/libcurlpp/GNUmakefile (renamed from tests/curl/GNUmakefile) | 0 | ||||
-rw-r--r-- | tests/libcurlpp/README (renamed from tests/curl/README) | 0 | ||||
-rwxr-xr-x | tests/libcurlpp/exec_test (renamed from tests/curl/exec_test) | 0 | ||||
-rw-r--r-- | tests/libcurlpp/test1.MUST (renamed from tests/curl/test1.MUST) | 0 | ||||
-rw-r--r-- | tests/libcurlpp/test1.cpp (renamed from tests/curl/test1.cpp) | 0 | ||||
-rw-r--r-- | tests/libcurlpp/test2.MUST (renamed from tests/curl/test2.MUST) | 0 | ||||
-rw-r--r-- | tests/libcurlpp/test2.cpp (renamed from tests/curl/test2.cpp) | 2 |
14 files changed, 128 insertions, 29 deletions
diff --git a/include/crawler/SpoolRewindInputStream.hpp b/include/crawler/SpoolRewindInputStream.hpp index f065271..523c1b6 100755 --- a/include/crawler/SpoolRewindInputStream.hpp +++ b/include/crawler/SpoolRewindInputStream.hpp @@ -18,9 +18,11 @@ class spool_streambuf : public std::streambuf protected: CRAWLER_DLL_VISIBLE virtual std::streambuf::int_type readFromSource( ) = 0; + CRAWLER_DLL_VISIBLE std::streambuf::int_type spoolSourceData( char *data, size_t n ); private: CRAWLER_DLL_VISIBLE int_type underflow( ); + CRAWLER_DLL_VISIBLE void spoolData( size_t n ); private: const size_t m_putBack; diff --git a/makefiles/gmake/help.mk b/makefiles/gmake/help.mk index c8d855e..db834a1 100644 --- a/makefiles/gmake/help.mk +++ b/makefiles/gmake/help.mk @@ -45,6 +45,7 @@ WITH_ICU=1 enable ICU support for URL parsing in Google URL scripting support: WITH_LUA=1 use Lua for configuration and scripting +WITH_CURL=1 use libcurl and libcurlpp and generate a Curl fetcher Some more obscure options: diff --git a/makefiles/gmake/platform.mk b/makefiles/gmake/platform.mk index ad29eb9..8cafc3d 100644 --- a/makefiles/gmake/platform.mk +++ b/makefiles/gmake/platform.mk @@ -102,7 +102,7 @@ EXE = endif # extensions for shared libraries -# (TOOD: HP/Unix has .shlib, Mac/X has .lib, but we can't test it currently) +# (TOOD: HP/Unix has .shlib, Mac/X has .dylib, but we can't test it currently) SO = .so # name if the installation program @@ -304,3 +304,57 @@ endif endif endif + +# curl and curlpp +################# + +ifeq ($(WITH_CURL),1) + +ifeq "$(PLATFORM)" "LINUX" + +CURL_INCLUDES ?= +CURL_LDFLAGS ?= +CURL_LIBS ?= -lcurl -lcurlpp + +endif + +ifeq "$(PLATFORM)" "SUNOS" +ifeq "$(OS_MAJOR_VERSION)" "5" +ifeq "$(OS_MINOR_VERSION)" "10" +CURL_INCLUDES ?= -I/opt/csw/include +CURL_LDFLAGS ?= -L/opt/csw/lib +CURL_LIBS ?= -lcurl -lcurlpp +endif +endif +endif + +ifeq "$(PLATFORM)" "FREEBSD" +ifeq "$(OS_MAJOR_VERSION)" "8" +CURL_INCLUDES ?= -I/usr/local/include +CURL_LDFLAGS ?= -L/usr/local/lib +CURL_LIBS ?= -lcurl -lcurlpp +endif +ifeq "$(OS_MAJOR_VERSION)" "9" +CURL_INCLUDES ?= -I/usr/local/include +CURL_LDFLAGS ?= -L/usr/local/lib +CURL_LIBS ?= -lcurl -lcurlpp +endif +endif + +ifeq "$(PLATFORM)" "OPENBSD" +ifeq "$(OS_MAJOR_VERSION)" "5" +CURL_INCLUDES ?= -I/usr/local/include +CURL_LDFLAGS ?= -L/usr/local/lib +CURL_LIBS ?= -lcurl -lcurlpp +endif +endif + +ifeq "$(PLATFORM)" "NETBSD" +ifeq "$(OS_MAJOR_VERSION)" "5" +CURL_INCLUDES ?= -I/usr/pkg/include +CURL_LDFLAGS ?= -L/usr/pkg/lib +CURL_LIBS ?= -lcurl -lcurlpp +endif +endif + +endif diff --git a/src/crawl/crawl.cpp b/src/crawl/crawl.cpp index 4f3eb00..9f5e0b2 100755 --- a/src/crawl/crawl.cpp +++ b/src/crawl/crawl.cpp @@ -120,7 +120,8 @@ int main( int /* argc */, char *argv[] ) vector<string> fetcherModules; #ifndef _WIN32 - fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" ); +// fetcherModules.push_back( "./modules/fetcher/libfetch/mod_fetcher_libfetch.so" ); + fetcherModules.push_back( "./modules/fetcher/libcurl/mod_fetcher_libcurl.so" ); #else fetcherModules.push_back( ".\\modules\\fetcher\\winhttp\\mod_fetcher_winhttp.dll" ); #endif @@ -162,7 +163,8 @@ int main( int /* argc */, char *argv[] ) Frontier *frontier = frontiers.create( "memory_frontier" ); #ifndef _WIN32 - Fetcher *fetcher = fetchers.create( "libfetch_fetcher" ); +// Fetcher *fetcher = fetchers.create( "libfetch_fetcher" ); + Fetcher *fetcher = fetchers.create( "libcurl_fetcher" ); #else Fetcher *fetcher = fetchers.create( "winhttp_fetcher" ); #endif diff --git a/src/libcrawler/SpoolRewindInputStream.cpp b/src/libcrawler/SpoolRewindInputStream.cpp index 9135741..13ab105 100644 --- a/src/libcrawler/SpoolRewindInputStream.cpp +++ b/src/libcrawler/SpoolRewindInputStream.cpp @@ -34,6 +34,61 @@ spool_streambuf::~spool_streambuf( ) } } +streambuf::int_type spool_streambuf::spoolSourceData( char *data, size_t n ) +{ + size_t data_len = m_buf.size( ) - ( m_start - m_base ) ; + if( n < data_len ) { + data_len = n; + } + + m_base = &m_buf.front( ); + m_start = m_base; + + memcpy( m_start, data, data_len ); + + spoolData( data_len ); + + return data_len; +} + +void spool_streambuf::spoolData( size_t n ) +{ + switch( m_state ) { + case TO_SPOOL_MEMORY: + // as long we can "spool" to memory, do so.. + if( m_spoolBufPos + n <= m_spoolBufSize ) { + m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, m_start, m_start + n ); + m_spoolBufPos += n; + } else { + // ..otherwise start spooling to disk, write + // current memory spool buffer first.. + LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBufSize << ")"; + m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc ); + assert( m_spoolFile.good( ) ); + m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize ); + assert( m_spoolFile.good( ) ); + m_state = TO_SPOOL_FILE; + m_spoolFile.write( m_start, n ); + assert( m_spoolFile.good( ) ); + } + break; + + case TO_SPOOL_FILE: + // we are appending to the spool file + assert( m_spoolFile.good( ) ); + m_spoolFile.write( m_start, n ); + assert( m_spoolFile.good( ) ); + break; + + case FROM_SPOOL_MEMORY: + case FROM_SPOOL_FILE: + throw logic_error( "Still getting data from source after rewind!" ); + + default: + throw logic_error( "Illegal state!" ); + } +} + streambuf::int_type spool_streambuf::underflow( ) { // check if buffer is exhausted, if not, return current character @@ -62,29 +117,7 @@ streambuf::int_type spool_streambuf::underflow( ) return traits_type::eof( ); } - if( m_state == TO_SPOOL_MEMORY ) { - // as long we can "spool" to memory, do so.. - if( m_spoolBufPos + n <= m_spoolBufSize ) { - m_spoolBuf.insert( m_spoolBuf.begin( ) + m_spoolBufPos, m_start, m_start + n ); - m_spoolBufPos += n; - } else { - // ..otherwise start spooling to disk, write - // current memory spool buffer first.. - LOG( logWARNING ) << "Spooling spool buffer exceeded (>" << m_spoolBufSize << ")"; - m_spoolFile.open( "/tmp/spool.tmp", ios::binary | ios::out | ios::trunc ); - assert( m_spoolFile.good( ) ); - m_spoolFile.write( &m_spoolBuf.front( ), m_spoolBufSize ); - assert( m_spoolFile.good( ) ); - m_state = TO_SPOOL_FILE; - m_spoolFile.write( m_start, n ); - assert( m_spoolFile.good( ) ); - } - } else { - // we are appending to the spool file - assert( m_spoolFile.good( ) ); - m_spoolFile.write( m_start, n ); - assert( m_spoolFile.good( ) ); - } + spoolData( n ); break; @@ -115,6 +148,9 @@ streambuf::int_type spool_streambuf::underflow( ) } break; + + default: + throw logic_error( "Illegal state!" ); } // set pointers diff --git a/src/modules/fetcher/GNUmakefile b/src/modules/fetcher/GNUmakefile index 89dfe93..6be8eaa 100644 --- a/src/modules/fetcher/GNUmakefile +++ b/src/modules/fetcher/GNUmakefile @@ -1,6 +1,6 @@ TOPDIR = ../../.. -SUBDIRS = libfetch file +SUBDIRS = libfetch file libcurl -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/tests/GNUmakefile b/tests/GNUmakefile index a21e409..ae574dd 100644 --- a/tests/GNUmakefile +++ b/tests/GNUmakefile @@ -1,9 +1,13 @@ TOPDIR = .. SUBDIRS = \ - utils logger modules url streamhtmlparser libfetch curl psql sqlite typedetect \ + utils logger modules url streamhtmlparser libfetch psql sqlite typedetect \ fetcher textwolf +ifeq ($(WITH_CURL),1) +SUBDIRS += libcurl +endif + -include $(TOPDIR)/makefiles/gmake/sub.mk local_all: diff --git a/tests/curl/GNUmakefile b/tests/libcurlpp/GNUmakefile index de0462a..de0462a 100644 --- a/tests/curl/GNUmakefile +++ b/tests/libcurlpp/GNUmakefile diff --git a/tests/curl/README b/tests/libcurlpp/README index 747e9a6..747e9a6 100644 --- a/tests/curl/README +++ b/tests/libcurlpp/README diff --git a/tests/curl/exec_test b/tests/libcurlpp/exec_test index fce8214..fce8214 100755 --- a/tests/curl/exec_test +++ b/tests/libcurlpp/exec_test diff --git a/tests/curl/test1.MUST b/tests/libcurlpp/test1.MUST index 08839f6..08839f6 100644 --- a/tests/curl/test1.MUST +++ b/tests/libcurlpp/test1.MUST diff --git a/tests/curl/test1.cpp b/tests/libcurlpp/test1.cpp index d10ef02..d10ef02 100644 --- a/tests/curl/test1.cpp +++ b/tests/libcurlpp/test1.cpp diff --git a/tests/curl/test2.MUST b/tests/libcurlpp/test2.MUST index 08839f6..08839f6 100644 --- a/tests/curl/test2.MUST +++ b/tests/libcurlpp/test2.MUST diff --git a/tests/curl/test2.cpp b/tests/libcurlpp/test2.cpp index 748da14..983f1bd 100644 --- a/tests/curl/test2.cpp +++ b/tests/libcurlpp/test2.cpp @@ -60,7 +60,7 @@ int main( int argc, char *argv[] ) { MemoryWriter w; if( argc != 2 ) { - cerr << "usage: test1 <url>" << endl; + cerr << "usage: test2 <url>" << endl; return EXIT_FAILURE; } url = argv[1]; |