summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-08-12 10:40:23 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-08-12 10:40:23 +0200
commitfe4e07657381e7db947630981a1d34410e4753e3 (patch)
tree7daf065a73ab29fd954f81e40dfde9f6e66a022a
parentb2f35389b68fb1e58ea13dc273fdebaf534a325c (diff)
downloadcrawler-fe4e07657381e7db947630981a1d34410e4753e3.tar.gz
crawler-fe4e07657381e7db947630981a1d34410e4753e3.tar.bz2
streamhtmlparser works on Windows
-rw-r--r--GNUmakefile2
-rwxr-xr-xMakefile.W323
-rwxr-xr-xREADME.3rdPARTY6
-rw-r--r--src/modules/Makefile.W322
-rwxr-xr-xsrc/modules/processor/Makefile.W3213
-rwxr-xr-xsrc/modules/processor/htmllinkextract/Makefile.W3246
-rwxr-xr-xsrc/modules/urlnormalizer/googleurl/Makefile.W322
-rwxr-xr-xstreamhtmlparser/Makefile.W3238
-rwxr-xr-x[-rw-r--r--]streamhtmlparser/htmlparser.c5
-rwxr-xr-x[-rw-r--r--]streamhtmlparser/jsparser.c12
-rwxr-xr-xstreamhtmlparser/port.h20
-rwxr-xr-x[-rw-r--r--]streamhtmlparser/statemachine.h14
-rw-r--r--tests/Makefile.W322
-rwxr-xr-xtests/streamhtmlparser/Makefile.W3253
14 files changed, 197 insertions, 21 deletions
diff --git a/GNUmakefile b/GNUmakefile
index bbe2e17..bcd6576 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -1,6 +1,6 @@
TOPDIR = .
-SUBDIRS = libfetch streamhtmlparser googleurl sqlite3 src tests
+SUBDIRS = libfetch googleurl streamhtmlparser sqlite3 src tests
PACKAGE_NAME = CrawlingWolf
PACKAGE_VERSION = 0.0.1
diff --git a/Makefile.W32 b/Makefile.W32
index b1100cf..9985bfa 100755
--- a/Makefile.W32
+++ b/Makefile.W32
@@ -1,7 +1,6 @@
TOPDIR = .
-#SUBDIRS = libfetch streamhtmlparser googleurl sqlite3 src tests
-SUBDIRS = utils googleurl sqlite3 src tests
+SUBDIRS = utils googleurl streamhtmlparser sqlite3 src tests
PACKAGE_NAME = CrawlingWolf
PACKAGE_VERSION = 0.0.1
diff --git a/README.3rdPARTY b/README.3rdPARTY
index b50eed5..7347ab9 100755
--- a/README.3rdPARTY
+++ b/README.3rdPARTY
@@ -30,7 +30,11 @@ http://code.google.com/p/streamhtmlparser/
- added constant namespace 'streamhtmlparser' in htmlparser_cpp.h.in htmlparser_cpp.h
- changed system includes <streamhtmlparser/*> to "*"
- fixed some "error: comma at end of enumerator list [-Werror=pedantic]"
-
+- changes for Windows (in port.h):
+ - inline and __inline
+ - no strings.h on Windows
+ - str(n)casecmp is str(n)icmp and snprintf/_snprintf on Windows
+
google-url
----------
diff --git a/src/modules/Makefile.W32 b/src/modules/Makefile.W32
index ddd6338..fd4004b 100644
--- a/src/modules/Makefile.W32
+++ b/src/modules/Makefile.W32
@@ -2,7 +2,7 @@ TOPDIR = ..\..
SUBDIRS = \
urlnormalizer urlfilter frontier fetcher urlseen \
- deduper
+ deduper processor
#SUBDIRS = \
# urlnormalizer urlfilter frontier fetcher urlseen \
diff --git a/src/modules/processor/Makefile.W32 b/src/modules/processor/Makefile.W32
new file mode 100755
index 0000000..f98b918
--- /dev/null
+++ b/src/modules/processor/Makefile.W32
@@ -0,0 +1,8 @@
+# Dispatcher makefile: compiles nothing itself, only names htmllinkextract in SUBDIRS.
+TOPDIR = ..\..\..
+SUBDIRS = htmllinkextract
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+# None of the local_* hooks has any directory-local work to do here.
+local_all local_clean local_distclean local_test:
diff --git a/src/modules/processor/htmllinkextract/Makefile.W32 b/src/modules/processor/htmllinkextract/Makefile.W32
new file mode 100755
index 0000000..f4161ba
--- /dev/null
+++ b/src/modules/processor/htmllinkextract/Makefile.W32
@@ -0,0 +1,54 @@
+# NMAKE build of the htmllinkextract processor: a static library and a DLL
+# module, both linked from HTMLLinkExtractProcessor.obj.
+TOPDIR = ..\..\..\..
+
+SUBDIRS =
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
+
+# NOTE(review): 0x504 does not look like one of the usual _WIN32_WINNT values
+# (0x0501, 0x0502, 0x0600, ...) - confirm what is intended.
+INCLUDE_CXXFLAGS = \
+ /D_WIN32_WINNT=0x504 /DSHARED
+
+INCLUDE_DIRS = \
+ /I. \
+ /I$(TOPDIR)\src \
+ /I$(TOPDIR)\streamhtmlparser
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)\streamhtmlparser\streamhtmlparser.lib \
+ $(TOPDIR)\src\crawlingwolf.lib
+
+DYNAMIC_MODULE = \
+ mod_processor_htmllinkextract.dll
+
+STATIC_LIB = \
+ htmllinkextractprocessor.lib
+
+CPP_OBJS = \
+ HTMLLinkExtractProcessor.obj
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+# $** passes every dependent to the linker; $? would pass only the newer ones.
+$(STATIC_LIB): $(CPP_OBJS)
+ $(LINK) /lib /nologo /out:$@ $(STATIC_LDFLAGS) $**
+
+$(DYNAMIC_MODULE): $(CPP_OBJS)
+ $(LINK) /dll /nologo /out:$@ $(LDFLAGS) $(LIBS) $**
+
+local_all: $(STATIC_LIB) $(DYNAMIC_MODULE)
+
+# Remove what local_all builds. Errors are ignored (leading '-') so that a
+# clean on an already clean tree succeeds.
+local_clean:
+ @-erase $(STATIC_LIB) 2>NUL
+ @-erase $(DYNAMIC_MODULE) 2>NUL
+ @-erase $(CPP_OBJS) 2>NUL
+
+local_distclean:
+
+local_test:
diff --git a/src/modules/urlnormalizer/googleurl/Makefile.W32 b/src/modules/urlnormalizer/googleurl/Makefile.W32
index de4d644..712f493 100755
--- a/src/modules/urlnormalizer/googleurl/Makefile.W32
+++ b/src/modules/urlnormalizer/googleurl/Makefile.W32
@@ -10,7 +10,7 @@ INCLUDE_CXXFLAGS = \
INCLUDE_DIRS = \
/I. \
/I$(TOPDIR)\src \
- /I$(TOPDIR)/googleurl
+ /I$(TOPDIR)\googleurl
INCLUDE_LDFLAGS = \
diff --git a/streamhtmlparser/Makefile.W32 b/streamhtmlparser/Makefile.W32
new file mode 100755
index 0000000..050b5c3
--- /dev/null
+++ b/streamhtmlparser/Makefile.W32
@@ -0,0 +1,47 @@
+# NMAKE build of the streamhtmlparser static library.
+TOPDIR = ..
+
+SUBDIRS =
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
+
+INCLUDE_CXXFLAGS = \
+ /D_WIN32_WINNT=0x504
+
+INCLUDE_DIRS = \
+ /I.
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+
+CPP_OBJS = \
+ statemachine.obj \
+ jsparser.obj \
+ htmlparser.obj
+
+STATIC_LIB = \
+ streamhtmlparser.lib
+
+# Stated before sub.mk is included (presumably so that it precedes any target
+# defined there - confirm against sub.mk).
+all: $(CPP_OBJS) $(STATIC_LIB)
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+# The library is recreated from scratch on every relink, so all objects must
+# be listed: $** names every dependent, whereas $? names only those newer than
+# the target and would drop the unchanged objects from the rebuilt library.
+$(STATIC_LIB): $(CPP_OBJS)
+ $(LINK) /lib /nologo /out:$@ $(STATIC_LDFLAGS) $**
+
+local_all:
+
+# Remove the library and its objects; errors are ignored (leading '-').
+local_clean:
+ @-erase $(STATIC_LIB) 2>NUL
+ @-erase $(CPP_OBJS) 2>NUL
+
+local_distclean:
+
+local_test:
diff --git a/streamhtmlparser/htmlparser.c b/streamhtmlparser/htmlparser.c
index c88486a..3e820f7 100644..100755
--- a/streamhtmlparser/htmlparser.c
+++ b/streamhtmlparser/htmlparser.c
@@ -38,7 +38,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <strings.h>
#include <ctype.h>
#include <assert.h>
@@ -77,7 +76,7 @@ static struct entityfilter_table_s {
/* Utility functions */
/* Similar to strncpy() but avoids the NULL padding. */
-static inline void nopad_strncpy(char *dst, const char *src, size_t dst_size,
+static INLINE void nopad_strncpy(char *dst, const char *src, size_t dst_size,
size_t src_size)
{
size_t size;
@@ -103,7 +102,7 @@ static int state_external(int st)
*
* From: http://www.w3.org/TR/html401/struct/text.html#h-9.1
*/
-static inline int html_isspace(char chr)
+static INLINE int html_isspace(char chr)
{
if (chr == ' ' || chr == '\t' || chr == '\n' || chr == '\r') {
return 1;
diff --git a/streamhtmlparser/jsparser.c b/streamhtmlparser/jsparser.c
index 9d71c74..dc94cfc 100644..100755
--- a/streamhtmlparser/jsparser.c
+++ b/streamhtmlparser/jsparser.c
@@ -40,6 +40,8 @@
#include "statemachine.h"
#include "jsparser.h"
+#include "port.h"
+
/* So we can support both C and C++ compilers, we use the CAST() macro instead
* of using C style casts or static_cast<>() directly.
*/
@@ -117,7 +119,7 @@ static const char *regexp_token_prefix[] = {
/* Converts the internal state into the external superstate.
*/
-static inline int state_external(int state)
+static INLINE int state_external(int state)
{
assert(state < JSPARSER_NUM_STATES);
assert(state >= 0);
@@ -129,7 +131,7 @@ static inline int state_external(int state)
* with the exception of unicode space and line terminators:
* http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
*/
-static inline int js_is_whitespace(char c)
+static INLINE int js_is_whitespace(char c)
{
return c == '\t' || /* Tab 0x09 */
c == '\v' || /* Vertical Tab 0x0B */
@@ -147,7 +149,7 @@ static inline int js_is_whitespace(char c)
* For more detail on the limitations of having this relaxed set of characters
* please see the comments in_state_js_text().
*/
-static inline int js_is_identifier(char c) {
+static INLINE int js_is_identifier(char c) {
return (c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') ||
@@ -202,7 +204,7 @@ void jsparser_buffer_append_str(jsparser_ctx *js, const char *str)
/* Returns the position relative to the start of the buffer or -1 if past the
* size of the buffer..
*/
-static inline int jsparser_buffer_absolute_pos(jsparser_ctx *js, int pos)
+static INLINE int jsparser_buffer_absolute_pos(jsparser_ctx *js, int pos)
{
int absolute_pos;
int buffer_len;
@@ -363,7 +365,7 @@ static int bsearch_strcmp(const void *a, const void *b)
* precede a regular expression in the javascript grammar, and returns true if
* the argument is found on that list.
*/
-static inline int is_regexp_token_prefix(char *token)
+static INLINE int is_regexp_token_prefix(char *token)
{
assert(token != NULL);
diff --git a/streamhtmlparser/port.h b/streamhtmlparser/port.h
new file mode 100755
index 0000000..e826959
--- /dev/null
+++ b/streamhtmlparser/port.h
@@ -0,0 +1,32 @@
+/* Portability shims: an INLINE keyword macro and, on Windows, substitutes
+ * for <strings.h> functions and snprintf.
+ */
+#ifndef STREAMHTMLPARSER_PORT_H
+#define STREAMHTMLPARSER_PORT_H
+
+/* INLINE expands to __inline under MSVC and to inline everywhere else. */
+#ifdef _MSC_VER
+#define INLINE __inline
+#else
+#define INLINE inline
+#endif
+
+#ifndef _WIN32
+#include <strings.h>
+#else
+/* Has to be defined before <windows.h> is included to take effect. */
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#define strcasecmp stricmp
+#define strncasecmp strnicmp
+/* Visual Studio 2015 (_MSC_VER 1900) and later provide snprintf themselves
+ * and reject a macro of that name, so alias it only for other toolchains.
+ */
+#if !defined(_MSC_VER) || _MSC_VER < 1900
+#define snprintf _snprintf
+#endif
+#endif
+
+#endif
diff --git a/streamhtmlparser/statemachine.h b/streamhtmlparser/statemachine.h
index a05ffe7..e586d35 100644..100755
--- a/streamhtmlparser/statemachine.h
+++ b/streamhtmlparser/statemachine.h
@@ -34,6 +34,8 @@
#ifndef STREAMHTMLPARSER_STATEMACHINE_H
#define STREAMHTMLPARSER_STATEMACHINE_H
+#include "port.h"
+
/* TODO(falmeida): I'm not sure about these limits, but since right now we only
* have 24 states it should be fine */
@@ -127,28 +129,28 @@ const char *statemachine_record_buffer(statemachine_ctx *ctx);
/* Returns the the number of characters currently stored in the record buffer.
*/
-static inline size_t statemachine_record_length(statemachine_ctx *ctx) {
+static INLINE size_t statemachine_record_length(statemachine_ctx *ctx) {
return ctx->record_pos + 1;
}
/* Return the current line number. */
-static inline int statemachine_get_line_number(statemachine_ctx *ctx) {
+static INLINE int statemachine_get_line_number(statemachine_ctx *ctx) {
return ctx->line_number;
}
/* Set the current line number. */
-static inline void statemachine_set_line_number(statemachine_ctx *ctx,
+static INLINE void statemachine_set_line_number(statemachine_ctx *ctx,
int line) {
ctx->line_number = line;
}
/* Return the current column number. */
-static inline int statemachine_get_column_number(statemachine_ctx *ctx) {
+static INLINE int statemachine_get_column_number(statemachine_ctx *ctx) {
return ctx->column_number;
}
/* Set the current column number. */
-static inline void statemachine_set_column_number(statemachine_ctx *ctx,
+static INLINE void statemachine_set_column_number(statemachine_ctx *ctx,
int column) {
ctx->column_number = column;
}
@@ -158,7 +160,7 @@ static inline void statemachine_set_column_number(statemachine_ctx *ctx,
*
* NULL is returned if the parser didn't encounter an error.
*/
-static inline const char *statemachine_get_error_msg(statemachine_ctx *ctx) {
+static INLINE const char *statemachine_get_error_msg(statemachine_ctx *ctx) {
if (ctx->next_state == STATEMACHINE_ERROR) {
return ctx->error_msg;
} else {
diff --git a/tests/Makefile.W32 b/tests/Makefile.W32
index b227403..25727a1 100644
--- a/tests/Makefile.W32
+++ b/tests/Makefile.W32
@@ -1,6 +1,6 @@
TOPDIR = ..
-SUBDIRS = utils url
+SUBDIRS = utils url streamhtmlparser
!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
diff --git a/tests/streamhtmlparser/Makefile.W32 b/tests/streamhtmlparser/Makefile.W32
new file mode 100755
index 0000000..dcd2b5f
--- /dev/null
+++ b/tests/streamhtmlparser/Makefile.W32
@@ -0,0 +1,59 @@
+# NMAKE build of the streamhtmlparser test programs test1.exe and test2.exe.
+TOPDIR = ..\..
+
+SUBDIRS =
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\platform.mk
+
+INCLUDE_CXXFLAGS = \
+ /D_WIN32_WINNT=0x504
+
+INCLUDE_DIRS = \
+ /I. \
+ /I$(TOPDIR)\streamhtmlparser
+
+INCLUDE_LDFLAGS = \
+
+INCLUDE_LIBS = \
+ $(TOPDIR)\streamhtmlparser\streamhtmlparser.lib
+
+TEST_CPP_BINS = \
+ test1.exe \
+ test2.exe
+
+OBJS =
+
+!INCLUDE $(TOPDIR)\makefiles\nmake\sub.mk
+
+test1.exe: test1.obj
+test2.exe: test2.obj
+
+local_all:
+
+# erase is the cmd.exe built-in and has no Unix-style -f switch, so only the
+# wildcards are passed; errors are ignored (leading '-').
+local_clean:
+ @-erase *.RES *.DIFF *.ERR 2>NUL
+
+local_distclean:
+
+local_test:
+
+# NOTE(review): nothing in this makefile depends on 'dummy', and its commands
+# exercise URL parsing/normalization rather than the HTML parser - it looks
+# carried over from another test directory; confirm whether it can be removed.
+dummy:
+ @-for %%m in ( simple google ) do \
+ @echo Using URL normalizer '%m'.. & \
+ @exec_test test1 test1 "parse illegal protocol" %m parse www.andreasbaumann.cc & \
+ @exec_test test1 test2 "parse normal start URL without slash" %m parse http://www.andreasbaumann.cc & \
+ @exec_test test1 test3 "parse normal start URL with slash" %m parse http://www.andreasbaumann.cc/ & \
+ @exec_test test1 test4 "parse normal URL" %m parse http://www.andreasbaumann.cc/index.html & \
+ @exec_test test1 test5 "parse normal URL with default port" %m parse http://www.andreasbaumann.cc:80/index.html & \
+ @exec_test test1 test6 "parse normal URL with non-standard port" %m parse http://www.andreasbaumann.cc:8080/index.html & \
+ @exec_test test1 test100 "normalize a relative URL" %m normalize http://www.andreasbaumann.cc/index.html /software.html & \
+ @exec_test test1 test101 "absolute URL in HTML content" %m normalize http://www.andreasbaumann.cc/index.html http://www.yahoo.com/page.html & \
+ @exec_test test1 test102 "path normalization, relative path" %m normalize http://www.andreasbaumann.cc/adir/index.html bdir/page.html & \
+ @exec_test test1 test103 "path normalization, absolute path" %m normalize http://www.andreasbaumann.cc/adir/index.html /bdir/page.html & \
+ @exec_test test1 test104 "path normalization, current dir" %m normalize http://www.andreasbaumann.cc/adir/index.html ./bdir/page.html & \
+ @exec_test test1 test105 "path normalization, previous dir" %m normalize http://www.andreasbaumann.cc/adir/index.html ../bdir/page.html