summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-07-28 18:07:26 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-07-28 18:07:26 +0200
commit3856d7214b3b3eb3e5b8c3ac025b7aeeb93cd044 (patch)
tree456ce7af97f65c94bcf30319c218b45ddab2632f /src
parentcbec8f229bb4d995c9fb05babf176e82a6f6db7c (diff)
downloadcrawler-3856d7214b3b3eb3e5b8c3ac025b7aeeb93cd044.tar.gz
crawler-3856d7214b3b3eb3e5b8c3ac025b7aeeb93cd044.tar.bz2
Started to add URL normalizers and a testing environment for URLs
Diffstat (limited to 'src')
-rw-r--r--src/GNUmakefile16
-rw-r--r--src/SimpleURLNormalizer.cpp14
-rw-r--r--src/SimpleURLNormalizer.hpp13
-rw-r--r--src/URL.hpp12
-rw-r--r--src/URLNormalizer.hpp13
5 files changed, 62 insertions, 6 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 816d6be..3d3d7b8 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -26,7 +26,7 @@ INCLUDE_LIBS += \
$(OPENSSL_LIBS)
endif
-CPP_OBJS = \
+LOCAL_STATIC_LIB_OBJS = \
URL.o \
LibFetchFetcher.o \
LibFetchRewindInputStream.o \
@@ -35,14 +35,24 @@ CPP_OBJS = \
ProtocolURLFilter.o \
DomainURLFilter.o \
ChainURLFilter.o \
- MemoryURLSeen.o
+ MemoryURLSeen.o \
+ SimpleURLNormalizer.o
+
+CPP_OBJS = \
+ $(LOCAL_STATIC_LIB_OBJS)
+
+LOCAL_STATIC_LIB = \
+ libcrawlingwolf.a
CPP_BINS = \
crawlingwolf$(EXE)
-include $(TOPDIR)/makefiles/gmake/sub.mk
-local_all:
+local_all: $(LOCAL_STATIC_LIB)
+
+$(LOCAL_STATIC_LIB): $(LOCAL_STATIC_LIB_OBJS) # rebuild the static library whenever one of its objects changes
+ ar rcs $(LOCAL_STATIC_LIB) $(LOCAL_STATIC_LIB_OBJS)
local_clean:
diff --git a/src/SimpleURLNormalizer.cpp b/src/SimpleURLNormalizer.cpp
new file mode 100644
index 0000000..21c34ae
--- /dev/null
+++ b/src/SimpleURLNormalizer.cpp
@@ -0,0 +1,14 @@
+#include "SimpleURLNormalizer.hpp"
+
+SimpleURLNormalizer::SimpleURLNormalizer( ) // Default constructor; the body is empty and performs no work.
+{
+}
+
+bool SimpleURLNormalizer::normalize( URL &url, const URL contextUrl ) // Currently a no-op: url is left unmodified and true is returned.
+{
+ (void)url; // parameter is not used here; the cast to void marks it as deliberately ignored
+ (void)contextUrl; // likewise unused
+
+ return true; // unconditional: every call returns true
+}
+
diff --git a/src/SimpleURLNormalizer.hpp b/src/SimpleURLNormalizer.hpp
new file mode 100644
index 0000000..433f4e8
--- /dev/null
+++ b/src/SimpleURLNormalizer.hpp
@@ -0,0 +1,13 @@
+#ifndef __SIMPLEURLNORMALIZER_H
+#define __SIMPLEURLNORMALIZER_H
+
+#include "URLNormalizer.hpp"
+
+class SimpleURLNormalizer : public URLNormalizer { // URL normalizer derived from the URLNormalizer interface; members are defined out of line.
+ public:
+ SimpleURLNormalizer( ); // default constructor
+
+ bool normalize( URL &url, const URL contextUrl ); // same signature as the pure virtual URLNormalizer::normalize; contextUrl is received by value
+};
+
+#endif
diff --git a/src/URL.hpp b/src/URL.hpp
index 0d1b113..fac0074 100644
--- a/src/URL.hpp
+++ b/src/URL.hpp
@@ -7,6 +7,9 @@
using namespace std;
class URL {
+ protected:
+ string m_url;
+
public:
URL( )
: m_url( "" ) {
@@ -28,7 +31,7 @@ class URL {
{
}
- string str( ) const
+ std::string str( ) const
{
return m_url;
}
@@ -54,6 +57,11 @@ class URL {
return "/";
}
+ std::string fragment( ) const // Fragment accessor; currently a stub that does not inspect m_url.
+ {
+ return ""; // always the empty string
+ }
+
static URL Null;
bool operator!=( const URL &other ) const {
@@ -67,8 +75,6 @@ class URL {
template< typename CharT, typename TraitsT > friend
basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u );
- protected:
- string m_url;
};
template< typename CharT, typename TraitsT >
diff --git a/src/URLNormalizer.hpp b/src/URLNormalizer.hpp
new file mode 100644
index 0000000..a1f6abf
--- /dev/null
+++ b/src/URLNormalizer.hpp
@@ -0,0 +1,13 @@
+#ifndef __URLNORMALIZER_H
+#define __URLNORMALIZER_H
+
+#include "URL.hpp"
+
+class URLNormalizer { // Abstract interface for URL normalizers: normalize() is pure virtual, so only derived classes can be instantiated.
+ public:
+ virtual ~URLNormalizer( ) { }; // virtual so that derived objects can be destroyed through a URLNormalizer pointer
+
+ virtual bool normalize( URL &url, const URL contextUrl ) = 0; // url is a mutable reference; contextUrl is copied on every call. NOTE(review): the meaning of the bool result is not defined here - confirm with implementers; a const reference for contextUrl would avoid the copy but must be changed in all overriders together
+};
+
+#endif