summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/GNUmakefile10
-rw-r--r--src/GoogleURLNormalizer.cpp72
-rw-r--r--src/GoogleURLNormalizer.hpp26
-rw-r--r--tests/googleurl/GNUmakefile3
-rw-r--r--tests/googleurl/test1.cpp38
5 files changed, 120 insertions, 29 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile
index 4abdd22..eaf57c8 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -10,11 +10,14 @@ INCLUDE_LDFLAGS = \
INCLUDE_DIRS = \
-I$(TOPDIR)/libfetch \
- -I$(TOPDIR)/streamhtmlparser
+ -I$(TOPDIR)/streamhtmlparser \
+ -I$(TOPDIR)/googleurl
INCLUDE_LIBS = \
$(TOPDIR)/libfetch/libfetch.a \
- $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a
+ $(TOPDIR)/streamhtmlparser/libstreamhtmlparser.a \
+ $(TOPDIR)/googleurl/libgoogleurl.a \
+ -licui18n -licuuc
# openssl
ifeq ($(WITH_SSL),1)
@@ -36,7 +39,8 @@ LOCAL_STATIC_LIB_OBJS = \
HostURLFilter.o \
ChainURLFilter.o \
MemoryURLSeen.o \
- SimpleURLNormalizer.o
+ SimpleURLNormalizer.o \
+ GoogleURLNormalizer.o
CPP_OBJS = \
$(LOCAL_STATIC_LIB_OBJS)
diff --git a/src/GoogleURLNormalizer.cpp b/src/GoogleURLNormalizer.cpp
new file mode 100644
index 0000000..c49a831
--- /dev/null
+++ b/src/GoogleURLNormalizer.cpp
@@ -0,0 +1,72 @@
+#include "GoogleURLNormalizer.hpp"
+
+#include <cstdlib>
+#include <string>
+
+#include "url_util.h"
+#include "url_canon_stdstring.h"
+#include "url_parse.h"
+
+using namespace std;
+using namespace url_util;
+using namespace url_canon;
+using namespace url_parse;
+
+// Constructs the normalizer. The googleurl library is initialized here,
+// the matching Shutdown( ) call is issued by the destructor.
+GoogleURLNormalizer::GoogleURLNormalizer( )
+{
+	url_util::Initialize( );
+}
+
+// Destroys the normalizer and shuts the googleurl library down again
+// (counterpart of the Initialize( ) call made by the constructor).
+GoogleURLNormalizer::~GoogleURLNormalizer( )
+{
+	url_util::Shutdown( );
+}
+
+// Returns the part of s described by comp (comp.len characters starting
+// at offset comp.begin). A component with a length of zero or less yields
+// an empty string.
+string GoogleURLNormalizer::componentString( const string &s, const Component &comp ) const
+{
+	if( comp.len > 0 ) {
+		return s.substr( comp.begin, comp.len );
+	}
+
+	return string( );
+}
+
+// Parses and canonicalizes the URL string s with googleurl's
+// Canonicalize( ).
+//
+// Returns URL::Null if s is empty or cannot be canonicalized. Otherwise
+// returns a URL built from the scheme, host, port and path components of
+// the canonical form. If the canonical form carries no (or an empty) port
+// component, URL::defaultPort( ) of the scheme is used instead.
+//
+// NOTE(review): the last two URL constructor arguments are always passed
+// as empty strings - presumably query and fragment, which would mean both
+// get dropped here; confirm against URL.hpp.
+URL GoogleURLNormalizer::parseUrl( const string s )
+{
+	if( s.empty( ) ) {
+		return URL::Null;
+	}
+
+	string canonical;
+	// leave some headroom over the input length for the canonical form
+	canonical.reserve( s.size( ) + 32 );
+	StdStringCanonOutput output( &canonical );
+	Parsed parsed;
+	const bool success = Canonicalize(
+		s.data( ), static_cast<int>( s.length( ) ),
+		NULL, &output, &parsed );
+	if( !success ) {
+		return URL::Null;
+	}
+	// finish the output object before the canonical string is read
+	output.Complete( );
+
+	// needed for the default port lookup and for the result
+	const string scheme = componentString( canonical, parsed.scheme );
+
+	// Only a non-empty port component is converted. This matches
+	// componentString( ), which treats len <= 0 as "no content", and
+	// keeps an empty port from silently turning into port 0.
+	unsigned short port;
+	if( parsed.port.len > 0 ) {
+		port = static_cast<unsigned short>( atoi(
+			componentString( canonical, parsed.port ).c_str( ) ) );
+	} else {
+		port = URL::defaultPort( scheme );
+	}
+
+	return URL( scheme,
+		componentString( canonical, parsed.host ),
+		port,
+		componentString( canonical, parsed.path ),
+		"", "" );
+}
+
+// Not implemented yet: both arguments are ignored and URL::Null is
+// returned unconditionally.
+URL GoogleURLNormalizer::normalize( const URL url, const string s )
+{
+	// explicitly discard the unused parameters
+	static_cast<void>( url );
+	static_cast<void>( s );
+
+	return URL::Null;
+}
diff --git a/src/GoogleURLNormalizer.hpp b/src/GoogleURLNormalizer.hpp
new file mode 100644
index 0000000..d630d5f
--- /dev/null
+++ b/src/GoogleURLNormalizer.hpp
@@ -0,0 +1,26 @@
+#ifndef __GOOGLEURLNORMALIZER_H
+#define __GOOGLEURLNORMALIZER_H
+
+#include "URLNormalizer.hpp"
+
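+//TODO: this include exposes the googleurl headers to every user of this
+//header; it is only needed for url_parse::Component below, fix later!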
+#include "url_parse.h"
+
+// URLNormalizer implementation which uses the googleurl library to parse
+// URL strings.
+class GoogleURLNormalizer : public URLNormalizer {
+	public:
+		GoogleURLNormalizer( );
+
+		virtual ~GoogleURLNormalizer( );
+
+		// Turns the URL string s into a URL object.
+		virtual URL parseUrl( const std::string s );
+
+		// Normalizes the string s, given the URL url.
+		virtual URL normalize( const URL url, const std::string s );
+
+	private:
+		//TODO: either hide the implementation details (PIMPL), or
+		//stop allocating normalizers directly and hand them out via
+		//a factory method (which suits loadable module support
+		//better anyway)
+
+		// Extracts the substring of s which the component comp
+		// refers to.
+		std::string componentString( const std::string &s,
+			const url_parse::Component &comp ) const;
+};
+
+#endif
diff --git a/tests/googleurl/GNUmakefile b/tests/googleurl/GNUmakefile
index 387a9f2..cd7ba6a 100644
--- a/tests/googleurl/GNUmakefile
+++ b/tests/googleurl/GNUmakefile
@@ -2,12 +2,15 @@ TOPDIR = ../..
SUBDIRS =
+#TODO: hide include dependency on googleurl here!
INCLUDE_DIRS = \
+ -I$(TOPDIR)/src \
-I$(TOPDIR)/googleurl
INCLUDE_LDFLAGS =
INCLUDE_LIBS = \
+ $(TOPDIR)/src/libcrawlingwolf.a \
$(TOPDIR)/googleurl/libgoogleurl.a \
-licui18n -licuuc
diff --git a/tests/googleurl/test1.cpp b/tests/googleurl/test1.cpp
index a5b069d..278be5e 100644
--- a/tests/googleurl/test1.cpp
+++ b/tests/googleurl/test1.cpp
@@ -1,43 +1,30 @@
-#include "url_util.h"
-#include "url_canon_stdstring.h"
-#include "url_parse.h"
+#include "URL.hpp"
+#include "GoogleURLNormalizer.hpp"
#include <iostream>
#include <string>
-using namespace url_util;
-using namespace url_canon;
-using namespace url_parse;
using namespace std;
int main( int argc, char *argv[] )
{
- Initialize( );
-
if( argc != 2 ) {
cerr << "usage: test1 <url>\n" << endl;
return 1;
}
- string urlstring = argv[1];
- string canonical;
- canonical.reserve( urlstring.size( ) + 32 );
- StdStringCanonOutput output( &canonical );
- Parsed parsed;
- bool success = Canonicalize(
- urlstring.data( ), static_cast<int>( urlstring.length( ) ),
- NULL, &output, &parsed );
- if( !success ) {
+ char *urlstring = argv[1];
+
+ URLNormalizer *normalizer = new GoogleURLNormalizer( );
+ URL url = normalizer->parseUrl( urlstring );
+ delete normalizer;
+
+ if( url == URL::Null ) {
cerr << "Illegal URL!" << endl;
return 1;
}
- output.Complete( );
-
- cout << "URL: " << canonical << endl;
-
- Shutdown( );
-
-/* cout << "protocol: " << url.protocol( ) << endl
+
+ cout << "protocol: " << url.protocol( ) << endl
<< "host: " << url.host( ) << endl
<< "port: " << url.port( ) << endl
<< "path: " << url.path( ) << endl
@@ -45,7 +32,6 @@ int main( int argc, char *argv[] )
<< "fragment: " << url.fragment( ) << endl;
cout << "URL: " << url << endl;
-*/
-
+
return 0;
}