diff options
-rw-r--r-- | docs/LINKS | 4 | ||||
-rw-r--r-- | makefiles/gmake/help.mk | 3 | ||||
-rw-r--r-- | src/GNUmakefile | 16 | ||||
-rw-r--r-- | src/LibFetchFetcher.cpp | 12 | ||||
-rw-r--r-- | src/LibFetchFetcher.hpp | 5 | ||||
-rw-r--r-- | src/LibFetchRewindInputStream.cpp | 6 | ||||
-rw-r--r-- | src/LibFetchRewindInputStream.hpp | 17 | ||||
-rw-r--r-- | src/URL.hpp | 12 | ||||
-rw-r--r-- | src/crawlingwolf.cpp | 6 | ||||
-rw-r--r-- | tests/libfetch/GNUmakefile | 3 | ||||
-rw-r--r-- | tests/libfetch/test1.c | 5 |
11 files changed, 78 insertions, 11 deletions
@@ -21,3 +21,7 @@ https://github.com/joshfire/node-crawler Php http://www.makeuseof.com/tag/build-basic-web-crawler-pull-information-website/ + +Streams + +http://www.mr-edd.co.uk/blog/beginners_guide_streambuf diff --git a/makefiles/gmake/help.mk b/makefiles/gmake/help.mk index 2b5f07a..b996514 100644 --- a/makefiles/gmake/help.mk +++ b/makefiles/gmake/help.mk @@ -39,4 +39,5 @@ Some more obscure options: ENABLE_NLS=0 Don't build gettext NLS support (default is on) Example: -make WITH_SSL=1 WITH_SQLITE3=1 WITH_PGSQL=1 WITH_LOCAL_LIBFETCH=1 WITH_LIBXML2=1 +make WITH_SSL=1 WITH_SQLITE3=1 WITH_PGSQL=1 \ + WITH_LOCAL_LIBFETCH=1 WITH_LIBXML2=1 diff --git a/src/GNUmakefile b/src/GNUmakefile index b60a865..d9c8234 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -4,17 +4,31 @@ SUBDIRS = -include $(TOPDIR)/makefiles/gmake/platform.mk -INCLUDE_CXXFLAGS = +INCLUDE_CPPFLAGS = \ INCLUDE_LDFLAGS = \ INCLUDE_DIRS = \ + -I$(TOPDIR)/libfetch INCLUDE_LIBS = \ + $(TOPDIR)/libfetch/libfetch.a + +# openssl +ifeq ($(WITH_SSL),1) + +INCLUDE_CFLAGS += \ + -DWITH_SSL + +INCLUDE_LIBS += \ + $(OPENSSL_LIBS) +endif CPP_OBJS = \ URL.o \ Fetcher.o \ + LibFetchFetcher.o \ + LibFetchRewindInputStream.o \ Frontier.o \ Deduper.o diff --git a/src/LibFetchFetcher.cpp b/src/LibFetchFetcher.cpp new file mode 100644 index 0000000..f9f6a1f --- /dev/null +++ b/src/LibFetchFetcher.cpp @@ -0,0 +1,12 @@ +#include "LibFetchFetcher.hpp" +#include "LibFetchRewindInputStream.hpp" + +#include "fetch.h" + +RewindInputStream LibFetchFetcher::fetch( const URL& url ) +{ + LibFetchRewindInputStream s( url ); + + (void)url; + return s; +} diff --git a/src/LibFetchFetcher.hpp b/src/LibFetchFetcher.hpp index 4f1610c..23426da 100644 --- a/src/LibFetchFetcher.hpp +++ b/src/LibFetchFetcher.hpp @@ -12,10 +12,7 @@ class LibFetchFetcher : public Fetcher virtual ~LibFetchFetcher( ) { } - virtual RewindInputStream fetch( const URL& url ) { - (void)url; - return RewindInputStream( ); - } + virtual RewindInputStream fetch( const URL& url ); }; #endif diff --git a/src/LibFetchRewindInputStream.cpp b/src/LibFetchRewindInputStream.cpp new file mode 100644 index 0000000..743bca1 --- /dev/null +++ b/src/LibFetchRewindInputStream.cpp @@ -0,0 +1,6 @@ +#include "LibFetchRewindInputStream.hpp" + +LibFetchRewindInputStream::LibFetchRewindInputStream( const URL& url ) +{ + (void)url; +} diff --git a/src/LibFetchRewindInputStream.hpp b/src/LibFetchRewindInputStream.hpp new file mode 100644 index 0000000..cd5d3a6 --- /dev/null +++ b/src/LibFetchRewindInputStream.hpp @@ -0,0 +1,17 @@ +#ifndef __LIBFETCH_REWIND_INPUT_STREAM_H +#define __LIBFETCH_REWIND_INPUT_STREAM_H + +#include "RewindInputStream.hpp" +#include "URL.hpp" + +#include "fetch.h" + +class LibFetchRewindInputStream : public RewindInputStream { + public: + LibFetchRewindInputStream( const URL& url ); + + private: + fetchIO *io; +}; + +#endif diff --git a/src/URL.hpp b/src/URL.hpp index 29fecf8..fc653c5 100644 --- a/src/URL.hpp +++ b/src/URL.hpp @@ -2,6 +2,7 @@ #define __URL_H #include <string> +#include <iostream> using namespace std; @@ -35,9 +36,18 @@ class URL { bool operator<( const URL &other ) const { return m_url < other.m_url; } - + + template< typename CharT, typename TraitsT > friend + basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, URL& u ); + protected: string m_url; }; +template< typename CharT, typename TraitsT > +inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, URL& u ) { + s << u.m_url; + return s; +} + #endif diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp index 8e3e29a..bcecc50 100644 --- a/src/crawlingwolf.cpp +++ b/src/crawlingwolf.cpp @@ -9,14 +9,16 @@ int main( void ) Fetcher *fetcher = new LibFetchFetcher( ); Deduper *deduper = new MD5Deduper( ); - LOG( logINFO ) << "Crawler started.."; + LOG( logNOTICE ) << "Crawler started.."; frontier->addUrl( URL( "http://www.andreasbaumann.cc" ) ); URL url; while( ( url = frontier->getNextUrl( ) ) != URL::Null ) { + LOG( logINFO ) << "Got URL " << url; RewindInputStream s = fetcher->fetch( url ); if( deduper->contentSeen( url, s ) ) { + LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen"; continue; } } @@ -25,7 +27,7 @@ int main( void ) delete fetcher; delete frontier; - LOG( logINFO ) << "Crawler stopped.."; + LOG( logNOTICE ) << "Crawler stopped.."; return 0; } diff --git a/tests/libfetch/GNUmakefile b/tests/libfetch/GNUmakefile index a8d7495..6f6099f 100644 --- a/tests/libfetch/GNUmakefile +++ b/tests/libfetch/GNUmakefile @@ -23,6 +23,9 @@ endif TEST_BINS = \ test1$(EXE) +TEST_CPP_BINS = \ + test2$(EXE) + OBJS = -include $(TOPDIR)/makefiles/gmake/sub.mk diff --git a/tests/libfetch/test1.c b/tests/libfetch/test1.c index fd28563..f912e3a 100644 --- a/tests/libfetch/test1.c +++ b/tests/libfetch/test1.c @@ -2,13 +2,13 @@ #include <stdlib.h> #include <unistd.h> -#include <fetch.h> +#include "fetch.h" int main( int argc, char *argv[] ) { char *urlstring; fetchIO *io; - char buf[1024]; + char buf[256]; ssize_t res; if( argc != 2 ) { @@ -27,6 +27,7 @@ int main( int argc, char *argv[] ) } while( ( res = fetchIO_read( io, buf, sizeof( buf ) ) ) != 0 ) { + buf[res] = '\0'; puts( buf ); } |