summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Andreas Baumann <abaumann@yahoo.com> 2012-08-11 18:42:26 +0200
committer Andreas Baumann <abaumann@yahoo.com> 2012-08-11 18:42:26 +0200
commit 8264f32b2233c1f46446c51ea5a2983fd9f76497 (patch)
tree d3f5751d94b54b2effdb2485b180e806f1fe0625
parent 16795b49385577c92dc891dd4788728cc541bdc9 (diff)
download crawler-8264f32b2233c1f46446c51ea5a2983fd9f76497.tar.gz
crawler-8264f32b2233c1f46446c51ea5a2983fd9f76497.tar.bz2
Google URL normalization works on Windows; test1 must be improved:
there are linking problems (/DSHARED in the *.lib normalization libraries produces clashing registry structures)
-rwxr-xr-x Makefile.W32 | 2
-rwxr-xr-x googleurl/Makefile.W32 | 8
-rwxr-xr-x makefiles/nmake/config.mk | 5
-rwxr-xr-x src/modules/urlnormalizer/googleurl/Makefile.W32 | 3
-rwxr-xr-x tests/url/Makefile.W32 | 10
-rwxr-xr-x tests/url/test1.cpp | 16
6 files changed, 31 insertions, 13 deletions
diff --git a/Makefile.W32 b/Makefile.W32
index a22e0e3..b1100cf 100755
--- a/Makefile.W32
+++ b/Makefile.W32
@@ -1,7 +1,7 @@
TOPDIR = .
#SUBDIRS = libfetch streamhtmlparser googleurl sqlite3 src tests
-SUBDIRS = utils sqlite3 src tests
+SUBDIRS = utils googleurl sqlite3 src tests
PACKAGE_NAME = CrawlingWolf
PACKAGE_VERSION = 0.0.1
diff --git a/googleurl/Makefile.W32 b/googleurl/Makefile.W32
index b2af6be..81bf240 100755
--- a/googleurl/Makefile.W32
+++ b/googleurl/Makefile.W32
@@ -9,12 +9,14 @@ INCLUDE_CXXFLAGS = \
/DNDEBUG /DWIN32
INCLUDE_DIRS = \
- /I.
+ /I. \
+ /I"$(ICU_DIR)\include"
INCLUDE_LDFLAGS = \
-
+ /LIBPATH:"$(ICU_DIR)\lib"
+
INCLUDE_LIBS = \
-
+
CPP_OBJS = \
url_canon_etc.obj \
url_canon_filesystemurl.obj \
diff --git a/makefiles/nmake/config.mk b/makefiles/nmake/config.mk
index a3d0484..5593036 100755
--- a/makefiles/nmake/config.mk
+++ b/makefiles/nmake/config.mk
@@ -8,6 +8,11 @@
# please customize
+# ICU
+#####
+
+ICU_DIR = C:\cygwin\home\Andreas Baumann\icu
+
# OpenSSL (http://www.slproweb.com/products/Win32OpenSSL.html)
##############################################################
diff --git a/src/modules/urlnormalizer/googleurl/Makefile.W32 b/src/modules/urlnormalizer/googleurl/Makefile.W32
index f739b32..de4d644 100755
--- a/src/modules/urlnormalizer/googleurl/Makefile.W32
+++ b/src/modules/urlnormalizer/googleurl/Makefile.W32
@@ -16,7 +16,8 @@ INCLUDE_LDFLAGS = \
INCLUDE_LIBS = \
$(TOPDIR)\googleurl\googleurl.lib \
- $(TOPDIR)\src\crawlingwolf.lib
+ $(TOPDIR)\src\crawlingwolf.lib \
+ "$(ICU_DIR)\lib\icuuc.lib"
DYNAMIC_MODULE = \
mod_urlnormalizer_googleurl.dll
diff --git a/tests/url/Makefile.W32 b/tests/url/Makefile.W32
index c65000f..cd545dd 100755
--- a/tests/url/Makefile.W32
+++ b/tests/url/Makefile.W32
@@ -11,13 +11,17 @@ INCLUDE_CXXFLAGS = \
INCLUDE_DIRS = \
/I. \
/I$(TOPDIR)\src \
- /I$(TOPDIR)\src\modules\urlnormalizer\simpleurl
+ /I$(TOPDIR)\src\modules\urlnormalizer\simpleurl \
+ /I$(TOPDIR)\src\modules\urlnormalizer\googleurl
INCLUDE_LDFLAGS = \
INCLUDE_LIBS = \
$(TOPDIR)\src\crawlingwolf.lib \
- $(TOPDIR)\src\modules\urlnormalizer\simpleurl\simpleurlnormalizer.lib
+ $(TOPDIR)\src\modules\urlnormalizer\googleurl\googleurlnormalizer.lib \
+ $(TOPDIR)\googleurl\googleurl.lib \
+ "$(ICU_DIR)\lib\icuuc.lib"
+# $(TOPDIR)\src\modules\urlnormalizer\simpleurl\simpleurlnormalizer.lib \
TEST_CPP_BINS = \
test1.exe
@@ -36,6 +40,8 @@ local_clean:
local_distclean:
local_test:
+ @-copy "$(ICU_DIR)\bin\icuuc49.dll" .
+ @-copy "$(ICU_DIR)\bin\icudt49.dll" .
@-exec_test test1 test1 "parse illegal protocol" simple parse www.andreasbaumann.cc
@-exec_test test1 test2 "parse normal start URL without slash" simple parse http://www.andreasbaumann.cc
@-exec_test test1 test3 "parse normal start URL with slash" simple parse http://www.andreasbaumann.cc/
diff --git a/tests/url/test1.cpp b/tests/url/test1.cpp
index 2512e79..b46e690 100755
--- a/tests/url/test1.cpp
+++ b/tests/url/test1.cpp
@@ -4,7 +4,7 @@
#include "ModuleLoader.hpp"
#else
#include "SimpleURLNormalizer.hpp"
-//#include "GoogleURLNormalizer.hpp"
+#include "GoogleURLNormalizer.hpp"
#endif
#include <vector>
@@ -29,18 +29,22 @@ int main( int argc, char *argv[] )
#ifdef USE_MODULELOADER
vector<string> modules;
- // modules.push_back( "../../src/modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
+#ifndef _WIN32
+ modules.push_back( "../../src/modules/urlnormalizer/simpleurl/mod_urlnormalizer_simple.so" );
+ modules.push_back( "../../src/modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
+#else
modules.push_back( "..\\..\\src\\modules\\urlnormalizer\\simpleurl\\mod_urlnormalizer_simple.dll" );
- // modules.push_back( "../../src/modules/urlnormalizer/googleurl/mod_urlnormalizer_googleurl.so" );
+ modules.push_back( "..\\..\\src\\modules\\urlnormalizer\\googleurl\\mod_urlnormalizer_googleurl.dll" );
+#endif
ModuleLoader<URLNormalizer> urlNormalizers( modules );
URLNormalizer *normalizer = urlNormalizers.create( method );
#else
URLNormalizer *normalizer;
if( strcmp( method, "simple" ) == 0 ) {
- normalizer = new SimpleURLNormalizer( );
- // } else if( strcmp( method, "google" ) == 0 ) {
- // normalizer = new GoogleURLNormalizer( );
+// normalizer = new SimpleURLNormalizer( );
+ } else if( strcmp( method, "google" ) == 0 ) {
+ normalizer = new GoogleURLNormalizer( );
} else {
cerr << "Unknown normalization method '" << method << "'" << endl;
return 1;