diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-07-28 18:07:26 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-07-28 18:07:26 +0200 |
commit | 3856d7214b3b3eb3e5b8c3ac025b7aeeb93cd044 (patch) | |
tree | 456ce7af97f65c94bcf30319c218b45ddab2632f /src | |
parent | cbec8f229bb4d995c9fb05babf176e82a6f6db7c (diff) | |
download | crawler-3856d7214b3b3eb3e5b8c3ac025b7aeeb93cd044.tar.gz crawler-3856d7214b3b3eb3e5b8c3ac025b7aeeb93cd044.tar.bz2 |
Started to add URL normalizers and a testing environment for URLs.
Diffstat (limited to 'src')
-rw-r--r-- | src/GNUmakefile | 16 | ||||
-rw-r--r-- | src/SimpleURLNormalizer.cpp | 14 | ||||
-rw-r--r-- | src/SimpleURLNormalizer.hpp | 13 | ||||
-rw-r--r-- | src/URL.hpp | 12 | ||||
-rw-r--r-- | src/URLNormalizer.hpp | 13 |
5 files changed, 62 insertions, 6 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile index 816d6be..3d3d7b8 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -26,7 +26,7 @@ INCLUDE_LIBS += \ $(OPENSSL_LIBS) endif -CPP_OBJS = \ +LOCAL_STATIC_LIB_OBJS = \ URL.o \ LibFetchFetcher.o \ LibFetchRewindInputStream.o \ @@ -35,14 +35,24 @@ CPP_OBJS = \ ProtocolURLFilter.o \ DomainURLFilter.o \ ChainURLFilter.o \ - MemoryURLSeen.o + MemoryURLSeen.o \ + SimpleURLNormalizer.o + +CPP_OBJS = \ + $(LOCAL_STATIC_LIB_OBJS) + +LOCAL_STATIC_LIB = \ + libcrawlingwolf.a CPP_BINS = \ crawlingwolf$(EXE) -include $(TOPDIR)/makefiles/gmake/sub.mk -local_all: +local_all: $(LOCAL_STATIC_LIB) + +$(LOCAL_STATIC_LIB): $(LOCAL_STATIC_LIB_OBJS) + ar rcs $(LOCAL_STATIC_LIB) $(LOCAL_STATIC_LIB_OBJS) local_clean: diff --git a/src/SimpleURLNormalizer.cpp b/src/SimpleURLNormalizer.cpp new file mode 100644 index 0000000..21c34ae --- /dev/null +++ b/src/SimpleURLNormalizer.cpp @@ -0,0 +1,14 @@ +#include "SimpleURLNormalizer.hpp" + +SimpleURLNormalizer::SimpleURLNormalizer( ) +{ +} + +bool SimpleURLNormalizer::normalize( URL &url, const URL contextUrl ) +{ + (void)url; + (void)contextUrl; + + return true; +} + diff --git a/src/SimpleURLNormalizer.hpp b/src/SimpleURLNormalizer.hpp new file mode 100644 index 0000000..433f4e8 --- /dev/null +++ b/src/SimpleURLNormalizer.hpp @@ -0,0 +1,13 @@ +#ifndef __SIMPLEURLNORMALIZER_H +#define __SIMPLEURLNORMALIZER_H + +#include "URLNormalizer.hpp" + +class SimpleURLNormalizer : public URLNormalizer { + public: + SimpleURLNormalizer( ); + + bool normalize( URL &url, const URL contextUrl ); +}; + +#endif diff --git a/src/URL.hpp b/src/URL.hpp index 0d1b113..fac0074 100644 --- a/src/URL.hpp +++ b/src/URL.hpp @@ -7,6 +7,9 @@ using namespace std; class URL { + protected: + string m_url; + public: URL( ) : m_url( "" ) { @@ -28,7 +31,7 @@ class URL { { } - string str( ) const + std::string str( ) const { return m_url; } @@ -54,6 +57,11 @@ class URL { return "/"; } + std::string fragment( ) const + { + return 
""; + } + static URL Null; bool operator!=( const URL &other ) const { @@ -67,8 +75,6 @@ class URL { template< typename CharT, typename TraitsT > friend basic_ostream<CharT, TraitsT>& operator<<( basic_ostream<CharT, TraitsT>&s, const URL& u ); - protected: - string m_url; }; template< typename CharT, typename TraitsT > diff --git a/src/URLNormalizer.hpp b/src/URLNormalizer.hpp new file mode 100644 index 0000000..a1f6abf --- /dev/null +++ b/src/URLNormalizer.hpp @@ -0,0 +1,13 @@ +#ifndef __URLNORMALIZER_H +#define __URLNORMALIZER_H + +#include "URL.hpp" + +class URLNormalizer { + public: + virtual ~URLNormalizer( ) { }; + + virtual bool normalize( URL &url, const URL contextUrl ) = 0; +}; + +#endif |