summaryrefslogtreecommitdiff
path: root/streamhtmlparser/statemachine.h
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-07-14 22:30:39 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-07-14 22:30:39 +0200
commit2db452d3d57df4b91375c0176e3a9527dbbc537c (patch)
tree96289944f5195438c83895c31a8e530efbd4045a /streamhtmlparser/statemachine.h
parent3c903a9a1784edc375119f1db7992e94765c0fbf (diff)
downloadcrawler-2db452d3d57df4b91375c0176e3a9527dbbc537c.tar.gz
crawler-2db452d3d57df4b91375c0176e3a9527dbbc537c.tar.bz2
first working crawler
Diffstat (limited to 'streamhtmlparser/statemachine.h')
-rw-r--r--streamhtmlparser/statemachine.h224
1 files changed, 224 insertions, 0 deletions
diff --git a/streamhtmlparser/statemachine.h b/streamhtmlparser/statemachine.h
new file mode 100644
index 0000000..a05ffe7
--- /dev/null
+++ b/streamhtmlparser/statemachine.h
@@ -0,0 +1,224 @@
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Filipe Almeida
+ */
+
+#ifndef STREAMHTMLPARSER_STATEMACHINE_H
+#define STREAMHTMLPARSER_STATEMACHINE_H
+
+#include <stddef.h>  /* size_t, NULL */
+
+/* TODO(falmeida): I'm not sure about these limits, but since right now we only
+ * have 24 states it should be fine */
+
+enum {
+ STATEMACHINE_ERROR = 127 /* Reserved state value denoting the error state;
+ * statemachine_get_error_msg() reports an error
+ * when next_state equals it. */
+};
+
+#define STATEMACHINE_RECORD_BUFFER_SIZE 256 /* Size of ctx record_buffer. */
+
+#define STATEMACHINE_MAX_STR_ERROR 80 /* Size of ctx error_msg. */
+
+struct statemachine_ctx_s;
+/* Signature shared by the in-state, enter-state and exit-state event
+ * callbacks stored in a statemachine_definition.
+ * NOTE(review): the arguments are presumably (context, state being left,
+ * input character, state being entered) -- confirm against the
+ * implementation.
+ */
+typedef void(*state_event_function)(struct statemachine_ctx_s *, int, char,
+ int);
+/* Description of a state machine: its transition table, optional state names
+ * and per-state event callbacks.
+ */
+typedef struct statemachine_definition_s {
+ int num_states; /* Number of states in the machine. */
+ const int* const* transition_table; /* [source][input] -> destination. */
+
+ /* Array containing the name of the states as a C string.
+ * This field is optional and if not in use it should be set to NULL.
+ * The names are used when reporting error messages.
+ */
+ const char* const* state_names;
+ state_event_function *in_state_events; /* In-state event callbacks. */
+ state_event_function *enter_state_events; /* Enter-state event callbacks. */
+ state_event_function *exit_state_events; /* Exit-state event callbacks.
+ * NOTE(review): the three callback arrays are presumably indexed by state
+ * number and filled in by statemachine_in_state(),
+ * statemachine_enter_state() and statemachine_exit_state() -- confirm. */
+} statemachine_definition;
+/* Runtime state of one state machine instance: current and next state, input
+ * position, record buffer, error message and a pointer for the caller.
+ */
+typedef struct statemachine_ctx_s {
+ int current_state; /* NOTE(review): presumably the state before the
+ * transition for current_char -- confirm. */
+ int next_state; /* statemachine_get_error_msg() treats a value of
+ * STATEMACHINE_ERROR here as the error condition. */
+ statemachine_definition *definition; /* Not owned by the context. */
+ char current_char; /* NOTE(review): presumably the input character being
+ * processed -- confirm. */
+
+ /* Current line number. */
+ int line_number;
+
+ /* Current column number. */
+ int column_number;
+ char record_buffer[STATEMACHINE_RECORD_BUFFER_SIZE]; /* Recorded stream. */
+ size_t record_pos; /* NOTE(review): presumably the write position inside
+ * record_buffer; statemachine_record_length() returns
+ * record_pos + 1 -- confirm. */
+
+ /* True if we are recording the stream to record_buffer. */
+ int recording;
+
+ /* In case there was an error (we are in state STATEMACHINE_ERROR), it will
+ * contain a human readable description of the error.
+ */
+ char error_msg[STATEMACHINE_MAX_STR_ERROR];
+
+ /* Storage space for the layer above. Not owned by the context. */
+ void *user;
+} statemachine_ctx;
+
+/* Populates the statemachine definition.
+ *
+ * Receives a transition table and an optional array of state names. It uses
+ * this data to populate the state machine definition def.
+ *
+ * The transition table structure is a list of lists of ints (int **). The
+ * outer list indexes the source state and the inner list contains the
+ * destination state for each of the possible input characters:
+ *
+ * transition_table[source][input] == destination
+ *
+ * The optional argument state_names points to a list of strings containing
+ * human readable state names. These strings are used when reporting error
+ * messages. Pass NULL when state names are not in use.
+ */
+void statemachine_definition_populate(statemachine_definition *def,
+ const int* const* transition_table,
+ const char* const* state_names);
+/* Event callback registration for state st of the definition def.
+ * NOTE(review): presumably statemachine_in_state(), statemachine_enter_state()
+ * and statemachine_exit_state() store func in the in_state_events,
+ * enter_state_events and exit_state_events arrays of def respectively --
+ * confirm against the implementation.
+ */
+void statemachine_in_state(statemachine_definition *def, int st,
+ state_event_function func);
+void statemachine_enter_state(statemachine_definition *def, int st,
+ state_event_function func);
+void statemachine_exit_state(statemachine_definition *def, int st,
+ state_event_function func);
+/* Creation and destruction of a statemachine definition.
+ * NOTE(review): statemachine_definition_new() presumably allocates a
+ * definition for the given number of states and returns NULL on failure;
+ * statemachine_definition_delete() presumably releases it -- confirm.
+ */
+statemachine_definition *statemachine_definition_new(int states);
+void statemachine_definition_delete(statemachine_definition *def);
+/* Read and change the state of the context ctx.
+ * NOTE(review): presumably both operate on ctx->current_state -- confirm.
+ */
+int statemachine_get_state(statemachine_ctx *ctx);
+void statemachine_set_state(statemachine_ctx *ctx, int state);
+/* Recording of the input stream into the record buffer of ctx.
+ * NOTE(review): presumably statemachine_start_record() begins recording,
+ * statemachine_stop_record() ends it and returns the recorded text, and
+ * statemachine_record_buffer() returns the text recorded so far -- confirm.
+ */
+void statemachine_start_record(statemachine_ctx *ctx);
+const char *statemachine_stop_record(statemachine_ctx *ctx);
+const char *statemachine_record_buffer(statemachine_ctx *ctx);
+
+/* Reports the length of the record buffer contents of ctx, computed as
+ * record_pos + 1.
+ * NOTE(review): the extra 1 presumably accounts for a terminating NUL --
+ * confirm against the implementation.
+ */
+static inline size_t statemachine_record_length(statemachine_ctx *ctx) {
+  const size_t pos = ctx->record_pos;
+  return pos + 1;
+}
+
+/* Accessor for the line number stored in the context. */
+static inline int statemachine_get_line_number(statemachine_ctx *ctx) {
+  const int line = ctx->line_number;
+  return line;
+}
+
+/* Overwrite the line number stored in the context with line. */
+static inline void statemachine_set_line_number(statemachine_ctx *ctx,
+                                                int line) {
+  (*ctx).line_number = line;
+}
+
+/* Accessor for the column number stored in the context. */
+static inline int statemachine_get_column_number(statemachine_ctx *ctx) {
+  const int column = ctx->column_number;
+  return column;
+}
+
+/* Overwrite the column number stored in the context with column. */
+static inline void statemachine_set_column_number(statemachine_ctx *ctx,
+                                                  int column) {
+  (*ctx).column_number = column;
+}
+
+
+/* Retrieve a human readable error message in case an error occurred.
+ *
+ * NULL is returned if the parser didn't encounter an error.
+ */
+static inline const char *statemachine_get_error_msg(statemachine_ctx *ctx) {
+ if (ctx->next_state == STATEMACHINE_ERROR) {
+ return ctx->error_msg;
+ } else {
+ return NULL;
+ }
+}
+
+/* Reset the statemachine.
+ *
+ * The state of ctx is set to the initialization values. This includes setting
+ * the state to the default state (0), stopping recording and setting the line
+ * number to 1.
+ */
+void statemachine_reset(statemachine_ctx *ctx);
+
+/* Initializes a new statemachine. Receives a statemachine definition object
+ * that should have been initialized with statemachine_definition_new() and a
+ * user reference to be used by the caller.
+ *
+ * Returns NULL if initialization fails.
+ *
+ * Initialization failure is fatal, and if this function fails it may not
+ * deallocate all previously allocated memory.
+ */
+statemachine_ctx *statemachine_new(statemachine_definition *def,
+ void *user);
+
+/* Returns a pointer to a context which is a duplicate of the statemachine ctx.
+ * The statemachine definition and the user pointer have to be provided since
+ * these references are not owned by the statemachine itself.
+ * NOTE(review): presumably returns NULL on failure and the result is released
+ * with statemachine_delete() -- confirm against the implementation.
+ */
+statemachine_ctx *statemachine_duplicate(statemachine_ctx *ctx,
+ statemachine_definition *def,
+ void *user);
+
+/* Copies the context of the statemachine pointed to by src to the statemachine
+ * pointed to by dst.
+ * The statemachine definition and the user pointer have to be provided since
+ * these references are not owned by the statemachine itself.
+ * NOTE(review): presumably def and user are the values installed in dst --
+ * confirm against the implementation.
+ */
+void statemachine_copy(statemachine_ctx *dst,
+ statemachine_ctx *src,
+ statemachine_definition *def,
+ void *user);
+/* Runs the statemachine ctx over the input str.
+ * NOTE(review): presumably processes the first size characters of str and
+ * returns the resulting state, or STATEMACHINE_ERROR on failure -- confirm
+ * against the implementation.
+ */
+int statemachine_parse(statemachine_ctx *ctx, const char *str, int size);
+/* Destroys the statemachine ctx.
+ * NOTE(review): presumably releases a context obtained from statemachine_new()
+ * or statemachine_duplicate(); the definition and user pointer are not owned
+ * by the context -- confirm what is freed.
+ */
+void statemachine_delete(statemachine_ctx *ctx);
+
+
+/*****
+ * The following functions are only exported for testing purposes and should
+ * be treated as private. */
+
+
+/* Encode the character as an escaped C string.
+ *
+ * Encode the character chr into the string output. Writes at most len
+ * characters to the output string but makes sure output is NUL-terminated.
+ */
+void statemachine_encode_char(char chr, char *output, size_t len);
+
+#endif /* STREAMHTMLPARSER_STATEMACHINE_H */