summaryrefslogtreecommitdiff
path: root/streamhtmlparser/statemachine.c
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-07-14 17:16:21 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-07-14 17:16:21 +0200
commit54cce110784d33d658b5f78286a98bee244a9eeb (patch)
tree9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /streamhtmlparser/statemachine.c
parentfcb682cb1955d362390665330fdf476cab7dc10b (diff)
downloadcrawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz
crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2
added streamhtmlparser
Diffstat (limited to 'streamhtmlparser/statemachine.c')
-rw-r--r--streamhtmlparser/statemachine.c450
1 files changed, 450 insertions, 0 deletions
diff --git a/streamhtmlparser/statemachine.c b/streamhtmlparser/statemachine.c
new file mode 100644
index 0000000..c0d604e
--- /dev/null
+++ b/streamhtmlparser/statemachine.c
@@ -0,0 +1,450 @@
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Filipe Almeida
+ */
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "statemachine.h"
+
+/* So we can support both C and C++ compilers, we use the CAST() macro instead
+ * of using C style casts or static_cast<>() directly.
+ *
+ * CAST(type, expression) converts expression to type: it expands to
+ * static_cast<type>(expression) when compiled as C++ and to a plain C cast
+ * otherwise.
+ */
+#ifdef __cplusplus
+ #define CAST(type, expression) (static_cast<type>(expression))
+#else
+ #define CAST(type, expression) ((type)(expression))
+#endif
+/* Number of distinct values of an 8-bit character.
+ * NOTE(review): no use of this macro is visible in this file - confirm it is
+ * still needed. */
+#define MAX_CHAR_8BIT 256
+
+/* Fills in a statemachine definition with its transition table and its
+ * optional table of state names.
+ *
+ * def and transition_table must not be NULL (checked with assert()).
+ * state_names may be NULL. Only the pointers are stored in the definition;
+ * the tables themselves are not copied.
+ */
+void statemachine_definition_populate(statemachine_definition *def,
+                                      const int* const* transition_table,
+                                      const char* const* state_names)
+{
+  assert(def != NULL);
+  assert(transition_table != NULL);
+
+  /* The two stores are independent of each other. */
+  def->state_names = state_names;
+  def->transition_table = transition_table;
+}
+
+/* Adds the callback for the event in_state that is called when the
+ * statemachine is in state st.
+ *
+ * This event is called every time the statemachine is in the specified
+ * state, for every character in the input stream, even if the state remains
+ * the same.
+ *
+ * This event is the last event to be called and is fired after both events
+ * exit_state and enter_state.
+ *
+ * def must not be NULL and st must be a valid state index, that is
+ * 0 <= st < def->num_states (both checked with assert()). Storing a NULL
+ * func leaves the state without an in_state callback.
+ */
+void statemachine_in_state(statemachine_definition *def, int st,
+                           state_event_function func)
+{
+  assert(def != NULL);
+  /* Reject negative indices too: they would write before the array. */
+  assert(st >= 0);
+  assert(st < def->num_states);
+  def->in_state_events[st] = func;
+}
+
+/* Adds the callback for the event enter_state that is called when the
+ * statemachine enters state st.
+ *
+ * This event is fired after the event exit_state but before the event
+ * in_state.
+ *
+ * def must not be NULL and st must be a valid state index, that is
+ * 0 <= st < def->num_states (both checked with assert()). Storing a NULL
+ * func leaves the state without an enter_state callback.
+ */
+void statemachine_enter_state(statemachine_definition *def, int st,
+                              state_event_function func)
+{
+  assert(def != NULL);
+  /* Reject negative indices too: they would write before the array. */
+  assert(st >= 0);
+  assert(st < def->num_states);
+  def->enter_state_events[st] = func;
+}
+
+/* Adds the callback for the event exit_state that is called when the
+ * statemachine exits from state st.
+ *
+ * This is the first event to be called and is fired before both the events
+ * enter_state and in_state.
+ *
+ * def must not be NULL and st must be a valid state index, that is
+ * 0 <= st < def->num_states (both checked with assert()). Storing a NULL
+ * func leaves the state without an exit_state callback.
+ */
+void statemachine_exit_state(statemachine_definition *def, int st,
+                             state_event_function func)
+{
+  assert(def != NULL);
+  /* Reject negative indices too: they would write before the array. */
+  assert(st >= 0);
+  assert(st < def->num_states);
+  def->exit_state_events[st] = func;
+}
+
+/* Allocates a new statemachine definition with room for the given number of
+ * states.
+ *
+ * The three per-state callback arrays (in_state, enter_state and exit_state)
+ * are allocated with calloc(), so every state starts without callbacks. The
+ * transition table and the state names start out as NULL and are meant to be
+ * set with statemachine_definition_populate().
+ *
+ * Returns the new definition, or NULL if any allocation fails. On failure
+ * everything that had already been allocated is released again, so nothing
+ * is leaked.
+ */
+statemachine_definition *statemachine_definition_new(int states)
+{
+  statemachine_definition *def;
+
+  def = CAST(statemachine_definition *,
+             malloc(sizeof(statemachine_definition)));
+  if (def == NULL)
+    return NULL;
+
+  def->num_states = states;
+  def->transition_table = NULL;
+  def->state_names = NULL;
+
+  def->in_state_events = CAST(state_event_function *,
+                              calloc(states, sizeof(state_event_function)));
+  def->enter_state_events = CAST(state_event_function *,
+                                 calloc(states, sizeof(state_event_function)));
+  def->exit_state_events = CAST(state_event_function *,
+                                calloc(states, sizeof(state_event_function)));
+
+  if (def->in_state_events == NULL ||
+      def->enter_state_events == NULL ||
+      def->exit_state_events == NULL) {
+    /* free(NULL) is a no-op, so the arrays can be released unconditionally. */
+    free(def->in_state_events);
+    free(def->enter_state_events);
+    free(def->exit_state_events);
+    free(def);
+    return NULL;
+  }
+
+  return def;
+}
+
+/* Releases a statemachine definition: the three per-state callback arrays
+ * first and then the definition object itself.
+ *
+ * def must not be NULL (checked with assert()). The transition table and the
+ * state names referenced by the definition are not freed here.
+ */
+void statemachine_definition_delete(statemachine_definition *def)
+{
+  assert(def != NULL);
+
+  /* The arrays are independent allocations; their release order is free. */
+  free(def->exit_state_events);
+  free(def->enter_state_events);
+  free(def->in_state_events);
+
+  free(def);
+}
+
+/* Reports the state the statemachine is currently in.
+ *
+ * ctx is dereferenced without a NULL check.
+ */
+int statemachine_get_state(statemachine_ctx *ctx)
+{
+  const int state = ctx->current_state;
+
+  return state;
+}
+
+/* Sets the current state.
+ *
+ * If the new state differs from the current one, the exit event of the old
+ * state is called first and the enter event of the new state second (each
+ * only if a callback is registered). The in_state event is not fired.
+ *
+ * Since this state change was not initiated by a character in the input
+ * stream, a null char is passed to the event functions.
+ *
+ * ctx and ctx->definition must not be NULL and state must be a valid state
+ * index, that is 0 <= state < num_states (all checked with assert()).
+ */
+void statemachine_set_state(statemachine_ctx *ctx, int state)
+{
+  statemachine_definition *def;
+
+  assert(ctx != NULL);
+  assert(ctx->definition != NULL);
+
+  def = ctx->definition;
+
+  /* state indexes the callback arrays below, so both bounds matter. */
+  assert(state >= 0);
+  assert(state < def->num_states);
+
+  ctx->next_state = state;
+
+  if (ctx->current_state != ctx->next_state) {
+    if (def->exit_state_events[ctx->current_state])
+      def->exit_state_events[ctx->current_state](ctx,
+                                                 ctx->current_state,
+                                                 '\0',
+                                                 ctx->next_state);
+
+    if (def->enter_state_events[ctx->next_state])
+      def->enter_state_events[ctx->next_state](ctx,
+                                               ctx->current_state,
+                                               '\0',
+                                               ctx->next_state);
+  }
+
+  /* The requested state wins even if a callback touched next_state. */
+  ctx->current_state = state;
+}
+
+/* Puts the statemachine back into its initial condition.
+ *
+ * Current and next state become the default state (0), recording is switched
+ * off and the record buffer is emptied, and the position is rewound to
+ * line 1, column 1. The definition and user pointers are left untouched.
+ */
+void statemachine_reset(statemachine_ctx *ctx)
+{
+  /* Position in the input. */
+  ctx->line_number = 1;
+  ctx->column_number = 1;
+
+  /* Recording state. */
+  ctx->recording = 0;
+  ctx->record_pos = 0;
+  ctx->record_buffer[0] = '\0';
+
+  /* Back to the default state. */
+  ctx->next_state = 0;
+  ctx->current_state = 0;
+}
+
+/* Initializes a new statemachine. Receives a statemachine definition object
+ * that should have been initialized with statemachine_definition_new() and a
+ * user reference to be used by the caller.
+ *
+ * The user reference is used by the caller to store any instance specific data
+ * the caller may need and is typically used to propagate context information
+ * to the event callbacks. The user pointer can just be set to NULL if the
+ * caller doesn't need it.
+ *
+ * The context is allocated zero-filled, so fields that statemachine_reset()
+ * does not set (such as the error message buffer and the current character)
+ * start out as zero bytes instead of indeterminate values.
+ *
+ * def must not be NULL (checked with assert()); it is referenced, not copied.
+ *
+ * Returns NULL if the allocation fails; nothing is left allocated in that
+ * case.
+ */
+statemachine_ctx *statemachine_new(statemachine_definition *def,
+                                   void *user)
+{
+  statemachine_ctx *ctx;
+
+  assert(def != NULL);
+
+  ctx = CAST(statemachine_ctx *, calloc(1, sizeof(statemachine_ctx)));
+  if (ctx == NULL)
+    return NULL;
+
+  statemachine_reset(ctx);
+
+  ctx->definition = def;
+  ctx->user = user;
+
+  return ctx;
+}
+
+/* Creates a new context that is a duplicate of the statemachine src.
+ *
+ * The statemachine definition and the user pointer have to be supplied by
+ * the caller because the statemachine does not own them; the duplicate only
+ * stores these two pointers (shallow copy).
+ *
+ * src must not be NULL (checked with assert()). Returns the new context, or
+ * NULL if it could not be created.
+ */
+statemachine_ctx *statemachine_duplicate(statemachine_ctx *src,
+                                         statemachine_definition *def,
+                                         void *user)
+{
+  statemachine_ctx *copy;
+
+  assert(src != NULL);
+
+  copy = statemachine_new(def, user);
+  if (copy != NULL)
+    statemachine_copy(copy, src, def, user);
+
+  return copy;
+}
+
+/* Overwrites the statemachine dst with the contents of the statemachine src.
+ *
+ * The whole context is copied byte for byte, after which the definition and
+ * user pointers of dst are replaced by the ones passed in. Both have to be
+ * provided by the caller since the statemachine itself does not own them.
+ */
+void statemachine_copy(statemachine_ctx *dst,
+                       statemachine_ctx *src,
+                       statemachine_definition *def,
+                       void *user)
+{
+  memcpy(dst, src, sizeof(*dst));
+
+  /* Re-point the copy at the caller-supplied references. */
+  dst->user = user;
+  dst->definition = def;
+}
+
+/* Frees a statemachine context.
+ *
+ * ctx must not be NULL (checked with assert()). Only the context itself is
+ * released; the definition and the user pointer it refers to are not.
+ */
+void statemachine_delete(statemachine_ctx *ctx)
+{
+  assert(ctx != NULL);
+
+  free(ctx);
+}
+
+/* Begins recording the input stream into the internal record buffer.
+ *
+ * Any previous recording is discarded: the buffer is emptied and the write
+ * position goes back to the start. The current input character is included
+ * in the recording.
+ *
+ * ctx must not be NULL (checked with assert()).
+ */
+void statemachine_start_record(statemachine_ctx *ctx)
+{
+  assert(ctx != NULL);
+
+  ctx->recording = 1;
+  ctx->record_pos = 0;
+  ctx->record_buffer[0] = '\0';
+}
+
+/* Ends the recording of the input stream.
+ *
+ * The last input character is not included in the recording. The recorded
+ * text is NUL-terminated at the current record position and a pointer to the
+ * internal record buffer is returned; the buffer belongs to ctx.
+ *
+ * ctx must not be NULL and a recording must be in progress (both checked
+ * with assert()).
+ */
+const char *statemachine_stop_record(statemachine_ctx *ctx)
+{
+  char *recorded;
+
+  assert(ctx != NULL);
+  assert(ctx->recording);
+
+  recorded = ctx->record_buffer;
+  recorded[ctx->record_pos] = '\0';
+  ctx->recording = 0;
+
+  return recorded;
+}
+
+/* Gives read access to the internal record buffer of the statemachine.
+ *
+ * The returned pointer refers to storage inside ctx. ctx is dereferenced
+ * without a NULL check.
+ */
+const char *statemachine_record_buffer(statemachine_ctx *ctx)
+{
+  const char *recorded = ctx->record_buffer;
+
+  return recorded;
+}
+
+/* Writes a printable, escaped representation of the character schr into
+ * output as a NUL-terminated string.
+ *
+ *   - a single quote becomes \' and a backslash becomes \\
+ *   - other characters in the range 32..126 are copied as they are
+ *   - newline, carriage return and tab become \n, \r and \t
+ *   - anything else becomes \x followed by two lowercase hex digits
+ *
+ * output is a buffer of len bytes. The result is truncated to fit and is
+ * always NUL-terminated when len is at least 1. Nothing is written when
+ * output is NULL or len is 0.
+ */
+void statemachine_encode_char(char schr, char *output, size_t len)
+{
+  /* Work on the unsigned value so bytes above 127 do not turn negative. */
+  unsigned char chr = schr;
+  const char *escape = NULL;
+
+  /* No room for even the terminator: there is nothing we may write. */
+  if (output == NULL || len == 0)
+    return;
+
+  switch (chr) {
+    case '\'': escape = "\\'";  break;
+    case '\\': escape = "\\\\"; break;
+    case '\n': escape = "\\n";  break;
+    case '\r': escape = "\\r";  break;
+    case '\t': escape = "\\t";  break;
+    default:                    break;
+  }
+
+  /* snprintf() truncates to len and always terminates the result. */
+  if (escape != NULL) {
+    snprintf(output, len, "%s", escape);
+  } else if (chr >= 32 && chr <= 126) {
+    /* Like isprint() but not dependent on locale. */
+    snprintf(output, len, "%c", chr);
+  } else {
+    snprintf(output, len, "\\x%.2x", chr);
+  }
+}
+
+/* Sets the error message in case of a transition error.
+ *
+ * Called from statemachine_parse to set the error message in case of a
+ * transition error.
+ */
+static void statemachine_set_transition_error_message(statemachine_ctx *ctx)
+{
+ char encoded_char[10];
+ statemachine_encode_char(ctx->current_char, encoded_char,
+ sizeof(encoded_char));
+
+ if (ctx->definition->state_names) {
+ snprintf(ctx->error_msg, STATEMACHINE_MAX_STR_ERROR,
+ "Unexpected character '%s' in state '%s'",
+ encoded_char,
+ ctx->definition->state_names[ctx->current_state]);
+ } else {
+ snprintf(ctx->error_msg, STATEMACHINE_MAX_STR_ERROR,
+ "Unexpected character '%s'", encoded_char);
+ }
+
+}
+
+/* Parses the input html stream and returns the finishing state.
+ *
+ * The first size bytes of str are fed through the statemachine one at a
+ * time. For every byte the next state is looked up in the transition table
+ * and then, in this order:
+ *   1. the exit_state event of the current state is fired if the state
+ *      changes,
+ *   2. the enter_state event of the next state is fired if the state
+ *      changes,
+ *   3. the in_state event of the next state is fired,
+ *   4. the byte is appended to the record buffer if recording is on and
+ *      there is room left,
+ *   5. the state, column number and line number are updated.
+ *
+ * ctx, its definition and the transition table must not be NULL, and str
+ * must not be NULL unless size is 0 (checked with assert()).
+ *
+ * Returns STATEMACHINE_ERROR if unable to parse the input, that is if size
+ * is negative or if the transition table yields STATEMACHINE_ERROR for a
+ * character; ctx->error_msg is set in both cases. If statemachine_parse() is
+ * called after an error situation was encountered the behaviour is
+ * unspecified.
+ */
+/* TODO(falmeida): change int size to size_t size */
+int statemachine_parse(statemachine_ctx *ctx, const char *str, int size)
+{
+  int i;
+  const int* const* state_table;
+  statemachine_definition *def;
+
+  /* Validate ctx before anything is read through it. */
+  assert(ctx != NULL &&
+         ctx->definition != NULL &&
+         ctx->definition->transition_table != NULL);
+
+  if (size < 0) {
+    snprintf(ctx->error_msg, STATEMACHINE_MAX_STR_ERROR, "%s",
+             "Negative size in statemachine_parse().");
+    return STATEMACHINE_ERROR;
+  }
+
+  assert(str != NULL || size == 0);
+
+  def = ctx->definition;
+  state_table = def->transition_table;
+
+  for (i = 0; i < size; i++) {
+    ctx->current_char = *str;
+    /* Index with the unsigned value so bytes above 127 stay in range. */
+    ctx->next_state =
+        state_table[ctx->current_state][CAST(unsigned char, *str)];
+    if (ctx->next_state == STATEMACHINE_ERROR) {
+      statemachine_set_transition_error_message(ctx);
+      return STATEMACHINE_ERROR;
+    }
+
+    /* The state-change test is repeated before each event on purpose: an
+     * event callback may modify ctx->next_state. */
+    if (ctx->current_state != ctx->next_state) {
+      if (def->exit_state_events[ctx->current_state])
+        def->exit_state_events[ctx->current_state](ctx,
+                                                   ctx->current_state,
+                                                   *str,
+                                                   ctx->next_state);
+    }
+    if (ctx->current_state != ctx->next_state) {
+      if (def->enter_state_events[ctx->next_state])
+        def->enter_state_events[ctx->next_state](ctx,
+                                                 ctx->current_state,
+                                                 *str,
+                                                 ctx->next_state);
+    }
+
+    if (def->in_state_events[ctx->next_state])
+      def->in_state_events[ctx->next_state](ctx,
+                                            ctx->current_state,
+                                            *str,
+                                            ctx->next_state);
+
+    /* We need two bytes left so we can NUL terminate the string. */
+    if (ctx->recording &&
+        STATEMACHINE_RECORD_BUFFER_SIZE - 1 > ctx->record_pos) {
+      ctx->record_buffer[ctx->record_pos++] = *str;
+      ctx->record_buffer[ctx->record_pos] = '\0';
+    }
+
+/* TODO(falmeida): Should clarify the contract here, since an event can change
+ * ctx->next_state and we need this functionality */
+
+    ctx->current_state = ctx->next_state;
+    ctx->column_number++;
+
+    if (*str == '\n') {
+      ctx->line_number++;
+      ctx->column_number = 1;
+    }
+    str++;
+  }
+
+  return ctx->current_state;
+}