summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-08-04 20:30:07 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-08-04 20:30:07 +0200
commit39116e2604cc4059828ae0bca694c38a52fef810 (patch)
treee8eee7f209cbe5a9b2c4cc60223ccf494d6e8221 /src
parent79abc6b0891223fa1c9c0f57769cd58e562f22f1 (diff)
downloadcrawler-39116e2604cc4059828ae0bca694c38a52fef810.tar.gz
crawler-39116e2604cc4059828ae0bca694c38a52fef810.tar.bz2
rearranged google test1 and added a GoogleURLNormalizer
Diffstat (limited to 'src')
-rw-r--r--src/GNUmakefile10
-rw-r--r--src/GoogleURLNormalizer.cpp72
-rw-r--r--src/GoogleURLNormalizer.hpp26
3 files changed, 105 insertions, 3 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 4abdd22..eaf57c8 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -10,11 +10,14 @@ INCLUDE_LDFLAGS = \
INCLUDE_DIRS = \
-I$(TOPDIR)/libfetch \
- -I$(TOPDIR)/streamhtmlparser
+ -I$(TOPDIR)/streamhtmlparser \
+ -I$(TOPDIR)/googleurl
INCLUDE_LIBS = \
$(TOPDIR)/libfetch/libfetch.a \
- $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a
+ $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a \
+ $(TOPDIR)/googleurl/libgoogleurl.a \
+ -licui18n -licuuc
# openssl
ifeq ($(WITH_SSL),1)
@@ -36,7 +39,8 @@ LOCAL_STATIC_LIB_OBJS = \
HostURLFilter.o \
ChainURLFilter.o \
MemoryURLSeen.o \
- SimpleURLNormalizer.o
+ SimpleURLNormalizer.o \
+ GoogleURLNormalizer.o
CPP_OBJS = \
$(LOCAL_STATIC_LIB_OBJS)
diff --git a/src/GoogleURLNormalizer.cpp b/src/GoogleURLNormalizer.cpp
new file mode 100644
index 0000000..c49a831
--- /dev/null
+++ b/src/GoogleURLNormalizer.cpp
@@ -0,0 +1,72 @@
+#include "GoogleURLNormalizer.hpp"
+
+#include <cstdlib>
+#include <string>
+
+#include "url_util.h"
+#include "url_canon_stdstring.h"
+#include "url_parse.h"
+
+using namespace std;
+using namespace url_util;
+using namespace url_canon;
+using namespace url_parse;
+
+/// Constructs the normalizer and calls the URL library's Initialize().
+/// NOTE(review): Initialize()/Shutdown() look like process-wide hooks;
+/// confirm that creating several normalizers is safe.
+GoogleURLNormalizer::GoogleURLNormalizer( )
+{
+    url_util::Initialize( );
+}
+
+/// Destroys the normalizer and calls the URL library's Shutdown(),
+/// mirroring the Initialize() call made by the constructor.
+GoogleURLNormalizer::~GoogleURLNormalizer( )
+{
+    url_util::Shutdown( );
+}
+
+/// Returns the part of `s` described by `comp` (offset `begin`, length
+/// `len`). A component whose length is zero or negative yields an empty
+/// string.
+string GoogleURLNormalizer::componentString( const string &s, const Component &comp ) const
+{
+    const bool hasContent = comp.len > 0;
+    return hasContent ? s.substr( comp.begin, comp.len ) : string( );
+}
+
+/// Canonicalizes the URL string `s` and builds a URL from the result.
+///
+/// Returns URL::Null when `s` is empty, when canonicalization fails, or
+/// when the port text is not a decimal number in the range 0..65535.
+/// If the canonical form carries no (non-empty) port, the port is taken
+/// from URL::defaultPort() for the parsed scheme.
+///
+/// NOTE(review): the last two URL constructor arguments are passed as
+/// empty strings. Presumably they are query and fragment, in which case
+/// the parsed query/ref components are discarded here -- confirm against
+/// the URL constructor before relying on this for URLs with a query.
+URL GoogleURLNormalizer::parseUrl( const string s )
+{
+    if( s.empty( ) ) {
+        return URL::Null;
+    }
+
+    string canonical;
+    canonical.reserve( s.size( ) + 32 );
+    StdStringCanonOutput output( &canonical );
+    Parsed parsed;
+    // NULL: no charset converter is supplied.
+    const bool success = Canonicalize(
+        s.data( ), static_cast<int>( s.length( ) ),
+        NULL, &output, &parsed );
+    if( !success ) {
+        return URL::Null;
+    }
+    // Called before any component is read out of `canonical`.
+    output.Complete( );
+
+    const string scheme = componentString( canonical, parsed.scheme );
+
+    unsigned short port;
+    if( parsed.port.len > 0 ) {
+        // Validate instead of atoi + cast, which would silently turn
+        // garbage into 0 and wrap values above 65535.
+        const string portText = componentString( canonical, parsed.port );
+        char *end = NULL;
+        const long value = strtol( portText.c_str( ), &end, 10 );
+        if( *end != '\0' || value < 0 || value > 65535 ) {
+            return URL::Null;
+        }
+        port = static_cast<unsigned short>( value );
+    } else {
+        // Absent or empty port component: use the scheme's default.
+        port = URL::defaultPort( scheme );
+    }
+
+    return URL( scheme,
+        componentString( canonical, parsed.host ),
+        port,
+        componentString( canonical, parsed.path ),
+        "", "" );
+}
+
+/// Stub: ignores both arguments and always returns URL::Null.
+URL GoogleURLNormalizer::normalize( const URL url, const string s )
+{
+    // The casts only mark the parameters as intentionally unused.
+    static_cast<void>( url );
+    static_cast<void>( s );
+    return URL::Null;
+}
diff --git a/src/GoogleURLNormalizer.hpp b/src/GoogleURLNormalizer.hpp
new file mode 100644
index 0000000..d630d5f
--- /dev/null
+++ b/src/GoogleURLNormalizer.hpp
@@ -0,0 +1,26 @@
+// Guard name avoids a leading double underscore, which is reserved for
+// the implementation.
+#ifndef GOOGLEURLNORMALIZER_H
+#define GOOGLEURLNORMALIZER_H
+
+#include <string>
+
+#include "URLNormalizer.hpp"
+
+//TODO: will fix later, bad include here!
+#include "url_parse.h"
+
+/// URLNormalizer implementation that delegates URL canonicalization to
+/// the bundled googleurl library.
+class GoogleURLNormalizer : public URLNormalizer {
+    public:
+        /// Calls the URL library's Initialize().
+        GoogleURLNormalizer( );
+
+        /// Calls the URL library's Shutdown().
+        virtual ~GoogleURLNormalizer( );
+
+        /// Canonicalizes `s` and returns the resulting URL, or URL::Null
+        /// when `s` is empty or cannot be canonicalized.
+        virtual URL parseUrl( const std::string s );
+
+        /// Currently a stub: ignores its arguments and returns URL::Null.
+        virtual URL normalize( const URL url, const std::string s );
+
+    private:
+        //TODO: hide implementation details here (PIMPL) or don't
+        //allocate Normalizers, use a factory method (as this is
+        //anyway better for loadable module support!)
+
+        /// Returns the substring of `s` described by `comp`; empty when
+        /// the component length is zero or negative.
+        std::string componentString( const std::string &s, const url_parse::Component &comp ) const;
+};
+
+#endif