summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- docs/LINKS 4
-rw-r--r-- makefiles/gmake/help.mk 3
-rw-r--r-- src/GNUmakefile 16
-rw-r--r-- src/LibFetchFetcher.cpp 12
-rw-r--r-- src/LibFetchFetcher.hpp 5
-rw-r--r-- src/LibFetchRewindInputStream.cpp 6
-rw-r--r-- src/LibFetchRewindInputStream.hpp 17
-rw-r--r-- src/URL.hpp 12
-rw-r--r-- src/crawlingwolf.cpp 6
-rw-r--r-- tests/libfetch/GNUmakefile 3
-rw-r--r-- tests/libfetch/test1.c 5
11 files changed, 78 insertions, 11 deletions
diff --git a/docs/LINKS b/docs/LINKS
index afa1082..e69ed4b 100644
--- a/docs/LINKS
+++ b/docs/LINKS
@@ -21,3 +21,7 @@ https://github.com/joshfire/node-crawler
Php
http://www.makeuseof.com/tag/build-basic-web-crawler-pull-information-website/
+
+Streams
+
+http://www.mr-edd.co.uk/blog/beginners_guide_streambuf
diff --git a/makefiles/gmake/help.mk b/makefiles/gmake/help.mk
index 2b5f07a..b996514 100644
--- a/makefiles/gmake/help.mk
+++ b/makefiles/gmake/help.mk
@@ -39,4 +39,5 @@ Some more obscure options:
ENABLE_NLS=0 Don't build gettext NLS support (default is on)
Example:
-make WITH_SSL=1 WITH_SQLITE3=1 WITH_PGSQL=1 WITH_LOCAL_LIBFETCH=1 WITH_LIBXML2=1
+make WITH_SSL=1 WITH_SQLITE3=1 WITH_PGSQL=1 \
+ WITH_LOCAL_LIBFETCH=1 WITH_LIBXML2=1
diff --git a/src/GNUmakefile b/src/GNUmakefile
index b60a865..d9c8234 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -4,17 +4,31 @@ SUBDIRS =
-include $(TOPDIR)/makefiles/gmake/platform.mk
-INCLUDE_CXXFLAGS =
+INCLUDE_CPPFLAGS = \
INCLUDE_LDFLAGS = \
INCLUDE_DIRS = \
+ -I$(TOPDIR)/libfetch
INCLUDE_LIBS = \
+ $(TOPDIR)/libfetch/libfetch.a
+
+# openssl
+ifeq ($(WITH_SSL),1)
+
+INCLUDE_CFLAGS += \
+ -DWITH_SSL
+
+INCLUDE_LIBS += \
+ $(OPENSSL_LIBS)
+endif
CPP_OBJS = \
URL.o \
Fetcher.o \
+ LibFetchFetcher.o \
+ LibFetchRewindInputStream.o \
Frontier.o \
Deduper.o
diff --git a/src/LibFetchFetcher.cpp b/src/LibFetchFetcher.cpp
new file mode 100644
index 0000000..f9f6a1f
--- /dev/null
+++ b/src/LibFetchFetcher.cpp
@@ -0,0 +1,12 @@
+#include "LibFetchFetcher.hpp"
+#include "LibFetchRewindInputStream.hpp"
+
+#include "fetch.h"
+
+// Creates the libfetch-backed stream for 'url' and returns it to the caller.
+//
+// NOTE(review): the return type is the base class RewindInputStream by value,
+// so the LibFetchRewindInputStream built here is copied as its base part only
+// (the derived 'io' member is not carried over). Presumably acceptable while
+// the stream is still a stub - confirm before real I/O is added.
+RewindInputStream LibFetchFetcher::fetch( const URL& url )
+{
+	LibFetchRewindInputStream s( url );
+
+	return s;
+}
diff --git a/src/LibFetchFetcher.hpp b/src/LibFetchFetcher.hpp
index 4f1610c..23426da 100644
--- a/src/LibFetchFetcher.hpp
+++ b/src/LibFetchFetcher.hpp
@@ -12,10 +12,7 @@ class LibFetchFetcher : public Fetcher
virtual ~LibFetchFetcher( ) {
}
- virtual RewindInputStream fetch( const URL& url ) {
- (void)url;
- return RewindInputStream( );
- }
+ virtual RewindInputStream fetch( const URL& url );
};
#endif
diff --git a/src/LibFetchRewindInputStream.cpp b/src/LibFetchRewindInputStream.cpp
new file mode 100644
index 0000000..743bca1
--- /dev/null
+++ b/src/LibFetchRewindInputStream.cpp
@@ -0,0 +1,6 @@
+#include "LibFetchRewindInputStream.hpp"
+
+// Builds a stream object for the given URL. Nothing is opened here, so the
+// libfetch handle is explicitly cleared instead of being left indeterminate.
+LibFetchRewindInputStream::LibFetchRewindInputStream( const URL& url )
+	: io( 0 )
+{
+	(void)url;	// not used by the constructor yet
+}
diff --git a/src/LibFetchRewindInputStream.hpp b/src/LibFetchRewindInputStream.hpp
new file mode 100644
index 0000000..cd5d3a6
--- /dev/null
+++ b/src/LibFetchRewindInputStream.hpp
@@ -0,0 +1,17 @@
+#ifndef __LIBFETCH_REWIND_INPUT_STREAM_H
+#define __LIBFETCH_REWIND_INPUT_STREAM_H
+
+#include "RewindInputStream.hpp"
+#include "URL.hpp"
+
+#include "fetch.h"
+
+// RewindInputStream subclass that carries a libfetch fetchIO handle.
+class LibFetchRewindInputStream : public RewindInputStream {
+ public:
+ // Constructs the stream for the given URL.
+ LibFetchRewindInputStream( const URL& url );
+
+ private:
+ // libfetch I/O handle; the fetchIO type is declared in fetch.h.
+ fetchIO *io;
+};
+
+#endif
diff --git a/src/URL.hpp b/src/URL.hpp
index 29fecf8..fc653c5 100644
--- a/src/URL.hpp
+++ b/src/URL.hpp
@@ -2,6 +2,7 @@
#define __URL_H
#include <string>
+#include <iostream>
using namespace std;
@@ -35,9 +36,18 @@ class URL {
bool operator<( const URL &other ) const {
return m_url < other.m_url;
}
-
+
+	// Stream insertion needs access to m_url. The URL is only read, so it is
+	// taken by const reference; this also accepts const objects and
+	// temporaries. Must stay in sync with the definition below the class.
+	template< typename CharT, typename TraitsT > friend
+	basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u );
+
protected:
string m_url;
};
+// Writes the URL's string form to the stream and returns the stream so that
+// insertions can be chained.
+template< typename CharT, typename TraitsT >
+inline basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u ) {
+	s << u.m_url;
+	return s;
+}
+
#endif
diff --git a/src/crawlingwolf.cpp b/src/crawlingwolf.cpp
index 8e3e29a..bcecc50 100644
--- a/src/crawlingwolf.cpp
+++ b/src/crawlingwolf.cpp
@@ -9,14 +9,16 @@ int main( void )
Fetcher *fetcher = new LibFetchFetcher( );
Deduper *deduper = new MD5Deduper( );
- LOG( logINFO ) << "Crawler started..";
+ LOG( logNOTICE ) << "Crawler started..";
frontier->addUrl( URL( "http://www.andreasbaumann.cc" ) );
URL url;
while( ( url = frontier->getNextUrl( ) ) != URL::Null ) {
+ LOG( logINFO ) << "Got URL " << url;
RewindInputStream s = fetcher->fetch( url );
if( deduper->contentSeen( url, s ) ) {
+ LOG( logINFO ) << "URL " << url << " is a duplicate, content already seen";
continue;
}
}
@@ -25,7 +27,7 @@ int main( void )
delete fetcher;
delete frontier;
- LOG( logINFO ) << "Crawler stopped..";
+ LOG( logNOTICE ) << "Crawler stopped..";
return 0;
}
diff --git a/tests/libfetch/GNUmakefile b/tests/libfetch/GNUmakefile
index a8d7495..6f6099f 100644
--- a/tests/libfetch/GNUmakefile
+++ b/tests/libfetch/GNUmakefile
@@ -23,6 +23,9 @@ endif
TEST_BINS = \
test1$(EXE)
+TEST_CPP_BINS = \
+ test2$(EXE)
+
OBJS =
-include $(TOPDIR)/makefiles/gmake/sub.mk
diff --git a/tests/libfetch/test1.c b/tests/libfetch/test1.c
index fd28563..f912e3a 100644
--- a/tests/libfetch/test1.c
+++ b/tests/libfetch/test1.c
@@ -2,13 +2,13 @@
#include <stdlib.h>
#include <unistd.h>
-#include <fetch.h>
+#include "fetch.h"
int main( int argc, char *argv[] )
{
char *urlstring;
fetchIO *io;
- char buf[1024];
+ char buf[256];
ssize_t res;
if( argc != 2 ) {
@@ -27,6 +27,7 @@ int main( int argc, char *argv[] )
}
-while( ( res = fetchIO_read( io, buf, sizeof( buf ) ) ) != 0 ) {
+/* Read at most sizeof( buf ) - 1 bytes per call so the terminating NUL
+ * written below always fits inside buf. The loop ends when res is 0 and
+ * also when it is negative (res is signed; presumably an error indication,
+ * as with read(2)), so a negative value is never used as an index. */
+while( ( res = fetchIO_read( io, buf, sizeof( buf ) - 1 ) ) > 0 ) {
+	buf[res] = '\0';
puts( buf );
}