From 2db452d3d57df4b91375c0176e3a9527dbbc537c Mon Sep 17 00:00:00 2001 From: Andreas Baumann Date: Sat, 14 Jul 2012 22:30:39 +0200 Subject: first working crawler --- streamhtmlparser/GNUmakefile | 2 +- streamhtmlparser/htmlparser.h | 397 ++++++++++++++++++++++++++++++ streamhtmlparser/htmlparser_cpp.h | 322 ++++++++++++++++++++++++ streamhtmlparser/include/htmlparser.h | 397 ------------------------------ streamhtmlparser/include/htmlparser_cpp.h | 322 ------------------------ streamhtmlparser/include/jsparser.h | 163 ------------ streamhtmlparser/include/statemachine.h | 224 ----------------- streamhtmlparser/jsparser.h | 163 ++++++++++++ streamhtmlparser/statemachine.h | 224 +++++++++++++++++ 9 files changed, 1107 insertions(+), 1107 deletions(-) create mode 100644 streamhtmlparser/htmlparser.h create mode 100644 streamhtmlparser/htmlparser_cpp.h delete mode 100644 streamhtmlparser/include/htmlparser.h delete mode 100644 streamhtmlparser/include/htmlparser_cpp.h delete mode 100644 streamhtmlparser/include/jsparser.h delete mode 100644 streamhtmlparser/include/statemachine.h create mode 100644 streamhtmlparser/jsparser.h create mode 100644 streamhtmlparser/statemachine.h (limited to 'streamhtmlparser') diff --git a/streamhtmlparser/GNUmakefile b/streamhtmlparser/GNUmakefile index 100d8b2..ea5380d 100644 --- a/streamhtmlparser/GNUmakefile +++ b/streamhtmlparser/GNUmakefile @@ -9,7 +9,7 @@ INCLUDE_CFLAGS = INCLUDE_LDFLAGS = \ INCLUDE_DIRS = \ - -Iinclude + -I. INCLUDE_LIBS = diff --git a/streamhtmlparser/htmlparser.h b/streamhtmlparser/htmlparser.h new file mode 100644 index 0000000..58db4a5 --- /dev/null +++ b/streamhtmlparser/htmlparser.h @@ -0,0 +1,397 @@ +/* Copyright (c) 2007, Google Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Filipe Almeida + */ + +#ifndef STREAMHTMLPARSER_HTMLPARSER_H +#define STREAMHTMLPARSER_HTMLPARSER_H + +#include "statemachine.h" +#include "jsparser.h" + +/* entity filter */ + +/* String sizes used in htmlparser and entityfilter structures including the + * NULL terminator. 
+ */ +#define HTMLPARSER_MAX_STRING STATEMACHINE_RECORD_BUFFER_SIZE +#define HTMLPARSER_MAX_ENTITY_SIZE 10 + + +enum htmlparser_state_external_enum { + HTMLPARSER_STATE_TEXT, + HTMLPARSER_STATE_TAG, + HTMLPARSER_STATE_ATTR, + HTMLPARSER_STATE_VALUE, + HTMLPARSER_STATE_COMMENT, + HTMLPARSER_STATE_JS_FILE, + HTMLPARSER_STATE_CSS_FILE, + HTMLPARSER_STATE_ERROR +}; + +enum htmlparser_mode { + HTMLPARSER_MODE_HTML, + HTMLPARSER_MODE_JS, + HTMLPARSER_MODE_CSS, + HTMLPARSER_MODE_HTML_IN_TAG +}; + +enum htmlparser_attr_type { + HTMLPARSER_ATTR_NONE, + HTMLPARSER_ATTR_REGULAR, + HTMLPARSER_ATTR_URI, + HTMLPARSER_ATTR_JS, + HTMLPARSER_ATTR_STYLE +}; + + +/* TODO(falmeida): Maybe move some of these declarations to the .c and only keep + * a forward declaration in here, since these structures are private. + */ + +/* entityfilter context structure. + * + * The entity filter collection of routines provide a way to decode html + * entities from an html document in a streaming way. + * + * The entityfilter_process() function receives a character at a time from the + * input stream and returns 0 or more characters which should be appended to the + * resulting decoded document. + * + * Currently this collection of functions are only exported for testing purposes + * and shouldn't be called from outside of htmlparser.c. + * + * Since we really only use these functions with the very specific purpose of + * decoding html entities for javascript attributes, only a small subset of + * entities are supported: &lt;, &gt;, &quot;, &amp;, &apos;, and the numeric + * character references for both decimal and hexadecimal. + */ +typedef struct entityfilter_ctx_s { + + /* Current position into the buffer. */ + int buffer_pos; + + /* True if currently processing an html entity. */ + int in_entity; + + /* Temporary character buffer that is used while processing html entities. + */ + char buffer[HTMLPARSER_MAX_ENTITY_SIZE]; + + /* String buffer returned to the application after we decoded an html + * entity. 
+ */ + char output[HTMLPARSER_MAX_ENTITY_SIZE]; +} entityfilter_ctx; + +/* Resets the entityfilter to its initial state so it can be reused. + */ +void entityfilter_reset(entityfilter_ctx *ctx); + +/* Initializes a new entity filter object. + */ +entityfilter_ctx *entityfilter_new(void); + +/* Deallocates an entity filter object. + */ +void entityfilter_delete(entityfilter_ctx *ctx); + +/* Copies the context of the entityfilter pointed to by src to the entityfilter + * dst. + */ +void entityfilter_copy(entityfilter_ctx *dst, entityfilter_ctx *src); + +/* Processes a character from the input stream and decodes any html entities + * in the accumulated buffer. + * + * Returns a reference to a string that points to an internal buffer. This + * buffer will be changed after every call to entityfilter_process(). As + * such this string should be duplicated before subsequent calls to + * entityfilter_process(). + */ +const char *entityfilter_process(entityfilter_ctx *ctx, char c); + + +/* html parser */ + +/* Stores the context of the html parser. + * If this structure is changed, htmlparser_new(), htmlparser_copy() and + * htmlparser_reset() should be updated accordingly. + */ +typedef struct htmlparser_ctx_s { + + /* Holds a reference to the statemachine context. */ + statemachine_ctx *statemachine; + + /* Holds a reference to the statemachine definition in use. Right now this is + * only used so we can deallocate it at the end. + * + * It should be readonly and contain the same values across jsparser + * instances. + */ + /* TODO(falmeida): Change statemachine_def to const. */ + statemachine_definition *statemachine_def; + + /* Holds a reference to the javascript parser. */ + jsparser_ctx *jsparser; + + /* Holds a reference to the entity filter. Used for decoding html entities + * inside javascript attributes. */ + entityfilter_ctx *entityfilter; + + /* Offset into the current attribute value where 0 is the first character in + * the value. 
+ */ + int value_index; + + /* True if currently processing javascript. */ + int in_js; + + /* Current tag name. */ + char tag[HTMLPARSER_MAX_STRING]; + + /* Current attribute name. */ + char attr[HTMLPARSER_MAX_STRING]; + + /* Contents of the current value capped to HTMLPARSER_MAX_STRING. */ + char value[HTMLPARSER_MAX_STRING]; + +} htmlparser_ctx; + +/* Resets the parser to its initial state and to the default mode, which + * is HTMLPARSER_MODE_HTML. + * + * All internal context like tag name, attribute name or the state of the + * statemachine are reset to their original values as if the object was just + * created. + */ +void htmlparser_reset(htmlparser_ctx *ctx); + +/* Resets the parser to its initial state and changes the parser mode. + * All internal context like tag name, attribute name or the state of the + * statemachine are reset to their original values as if the object was just + * created. + * + * Available modes: + * HTMLPARSER_MODE_HTML - Parses html text + * HTMLPARSER_MODE_JS - Parses javascript files + * HTMLPARSER_MODE_CSS - Parses CSS files. No actual parsing is done + * but htmlparser_in_css() always returns true. + * HTMLPARSER_MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To + * be used in a template expanded in the + * following context: + * + */ +void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode); + +/* Initializes a new htmlparser instance. + * + * Returns a pointer to the new instance or NULL if the initialization fails. + * Initialization failure is fatal, and if this function fails it may not + * deallocate all previously allocated memory. + */ +htmlparser_ctx *htmlparser_new(void); + +/* Copies the context of the htmlparser pointed to by src to the htmlparser dst. + * + * Also copies over the instances of the state machine, the jsparser and the + * entity filter but not the statemachine definition since this one is read + * only. 
+ */ +void htmlparser_copy(htmlparser_ctx *dst, const htmlparser_ctx *src); + +/* Receives an htmlparser context and returns the current html state. + * + * The return value will be one of the states of htmlparser_state_external_enum. + */ +int htmlparser_state(htmlparser_ctx *ctx); + +/* Parses the input html stream and returns the finishing state. + * + * Returns HTMLPARSER_STATE_ERROR if unable to parse the input. If + * htmlparser_parse() is called after an error situation was encountered the + * behaviour is unspecified. At this point, htmlparser_reset() or + * htmlparser_reset_mode() can be called to reset the state. + */ +int htmlparser_parse(htmlparser_ctx *ctx, const char *str, int size); + +/* Returns true if the parser is inside an attribute value and the value is + * surrounded by single or double quotes. */ +int htmlparser_is_attr_quoted(htmlparser_ctx *ctx); + +/* Returns true if the parser is currently in javascript. This can be + * an attribute that takes javascript, a javascript block or the parser + * can just be in HTMLPARSER_MODE_JS. */ +int htmlparser_in_js(htmlparser_ctx *ctx); + +/* Returns the current tag or NULL if not available or we haven't seen the + * entire tag yet. + * + * There is no stack implemented because we currently don't have a need for + * it, which means tag names are tracked only one level deep. + * + * This is better understood by looking at the following example: + * + * + * [tag=b] + * + * [tag=i] + * + * [tag=NULL] + * + * + * The tag is correctly filled inside the tag itself and before any new inner + * tag is closed, at which point the tag will be null. + * + * For our current purposes this is not a problem, but we may implement a tag + * tracking stack in the future for completeness. + * + */ +const char *htmlparser_tag(htmlparser_ctx *ctx); + +/* Returns the current attribute name if after an attribute name or in an + * attribute value. Returns NULL otherwise. 
*/ +const char *htmlparser_attr(htmlparser_ctx *ctx); + +/* Returns the contents of the current attribute value. + * + * Returns NULL if not inside an attribute value. + */ +const char *htmlparser_value(htmlparser_ctx *ctx); + +/* Returns true if the parser is currently inside a CSS construct. + * + * Currently this can be either a STYLE tag, a STYLE attribute or the fact that + * the parser was reset in HTMLPARSER_MODE_CSS using + * htmlparser_reset_mode(). + */ +int htmlparser_in_css(htmlparser_ctx *ctx); + +/* Returns the current state of the javascript state machine. + * + * Currently only present for testing purposes. + */ +int htmlparser_js_state(htmlparser_ctx *ctx); + +/* Returns non-zero if currently inside a javascript string literal and zero + * otherwise. + */ +int htmlparser_is_js_quoted(htmlparser_ctx *ctx); + +/* Returns the index within the current attribute value, where 0 is the first + * character of the value, or -1 if the parser is not inside an attribute value. + */ +int htmlparser_value_index(htmlparser_ctx *ctx); + +/* Returns true if this is the first character of a url inside an attribute. + * + * This function can be used by an html sanitizer or auto escaping system as a + * hint that it should validate the url for a whitelist of protocol handlers and + * for well-formedness, or that it should just escape a component of it. + * + * For attributes that expect a URL, this will return true if we are at the + * first character of the URL, false otherwise. + * For most attributes, this is the same as checking that we are at the first + * character of the attribute value but it also works correctly for the + * "content" attribute of the "meta" tag where the URL follows some earlier + * content. + * e.g: + * + * For any other attributes, the result will always be false. + */ +int htmlparser_is_url_start(htmlparser_ctx *ctx); + +/* Returns the current attribute type. + * + * The attribute type can be one of: + * HTMLPARSER_ATTR_NONE - not inside an attribute. + * HTMLPARSER_ATTR_REGULAR - Inside a normal attribute. 
+ * HTMLPARSER_ATTR_URI - Inside an attribute that accepts a uri. + * HTMLPARSER_ATTR_JS - Inside a javascript attribute. + * HTMLPARSER_ATTR_STYLE - Inside a css style attribute. + */ +int htmlparser_attr_type(htmlparser_ctx *ctx); + +/* Return the current line number. */ +int htmlparser_get_line_number(htmlparser_ctx *ctx); + +/* Set the current line number. */ +void htmlparser_set_line_number(htmlparser_ctx *ctx, int line); + +/* Return the current column number. */ +int htmlparser_get_column_number(htmlparser_ctx *ctx); + +/* Set the current column number. */ +void htmlparser_set_column_number(htmlparser_ctx *ctx, int column); + +/* Retrieve a human readable error message in case an error occurred. + * + * NULL is returned if the parser didn't encounter an error. + */ +const char *htmlparser_get_error_msg(htmlparser_ctx *ctx); + +/* Invoked by the caller when text is expanded by the caller. + * + * Should be invoked when a template directive that expands to content is + * executed but we don't provide this content to the parser itself. This changes + * the current state by following the default rule, ensuring we stay in sync + * with the template. + * + * Returns 1 if template directives are accepted for this state and 0 if they + * are not, which should result in an error condition. + * + * Right now the only case being handled are unquoted attribute values and it + * always returns 1. When insert_text() is called after the equals sign, we + * assume some text was consumed and we are now in the middle of the attribute + * value itself. Example: + * + * + * + * The template calls insert_text() when it encounters $HREF_VALUE. If it didn't + * the parser would only have seen the following html: + * + * + * + * and would interpret alt=alternate_text as the value of the href attribute. + */ +int htmlparser_insert_text(htmlparser_ctx *ctx); + +/* Deallocates an htmlparser context object. 
+ */ +void htmlparser_delete(htmlparser_ctx *ctx); + +#define htmlparser_parse_chr(a,b) htmlparser_parse(a, &(b), 1); +#ifdef __cplusplus +#define htmlparser_parse_str(a,b) htmlparser_parse(a, b, \ + static_cast(strlen(b))); +#else +#define htmlparser_parse_str(a,b) htmlparser_parse(a, b, (int)strlen(b)); +#endif + +#endif /* STREAMHTMLPARSER_HTMLPARSER_H */ diff --git a/streamhtmlparser/htmlparser_cpp.h b/streamhtmlparser/htmlparser_cpp.h new file mode 100644 index 0000000..3802233 --- /dev/null +++ b/streamhtmlparser/htmlparser_cpp.h @@ -0,0 +1,322 @@ +// Copyright (c) 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// --- +// Author: Filipe Almeida +// +// c++ bindings for htmlparser. + +#ifndef STREAMHTMLPARSER_HTMLPARSER_CPP_H__ +#define STREAMHTMLPARSER_HTMLPARSER_CPP_H__ + +#include +#include +extern "C" { + #include "htmlparser.h" + #include "jsparser.h" +} + +namespace streamhtmlparser { + +class JavascriptParser { + public: + enum State { + STATE_TEXT = JSPARSER_STATE_TEXT, + STATE_Q = JSPARSER_STATE_Q, + STATE_DQ = JSPARSER_STATE_DQ, + STATE_REGEXP = JSPARSER_STATE_REGEXP, + STATE_COMMENT = JSPARSER_STATE_COMMENT + }; +}; + +class HtmlParser { + public: + + /* html states */ + enum State { + STATE_TEXT = HTMLPARSER_STATE_TEXT, + STATE_TAG = HTMLPARSER_STATE_TAG, + STATE_ATTR = HTMLPARSER_STATE_ATTR, + STATE_VALUE = HTMLPARSER_STATE_VALUE, + STATE_COMMENT = HTMLPARSER_STATE_COMMENT, + STATE_JS_FILE = HTMLPARSER_STATE_JS_FILE, + STATE_CSS_FILE = HTMLPARSER_STATE_CSS_FILE, + STATE_ERROR = HTMLPARSER_STATE_ERROR + }; + + /* attribute types */ + enum AttributeType { + ATTR_NONE = HTMLPARSER_ATTR_NONE, + ATTR_REGULAR = HTMLPARSER_ATTR_REGULAR, + ATTR_URI = HTMLPARSER_ATTR_URI, + ATTR_JS = HTMLPARSER_ATTR_JS, + ATTR_STYLE = HTMLPARSER_ATTR_STYLE + }; + + /* Parser modes */ + enum Mode { + MODE_HTML = HTMLPARSER_MODE_HTML, + MODE_JS = HTMLPARSER_MODE_JS, + MODE_CSS = HTMLPARSER_MODE_CSS, + MODE_HTML_IN_TAG = HTMLPARSER_MODE_HTML_IN_TAG + }; + + HtmlParser() { + parser_ = htmlparser_new(); + assert(parser_ != NULL); + }; + + /* Parses the 
input html stream and returns the finishing state. + * + * Returns HtmlParser::STATE_ERROR if unable to parse the input. If + * htmlparser_parse() is called after an error situation was encountered + * the behaviour is unspecified. At this point, Reset() or ResetMode() + * can be called to reset the state so it can be used to parse a new file. + */ + int Parse(const char *str, int len) { + return htmlparser_parse(parser_, str, len); + }; + + int Parse(const std::string &str) { + return Parse(str.c_str(), static_cast(str.length())); + }; + + /* Returns the current state the parser is in */ + int state() const { + return htmlparser_state(parser_); + }; + + /* Returns the current tag or NULL if not available. + * + * There is no stack implemented because we currently don't have a need for + * it, which means tag names are tracked only one level deep. + * + * This is better understood by looking at the following example: + * + * + * [tag=b] + * + * [tag=i] + * + * [tag=NULL] + * + * + * The tag is correctly filled inside the tag itself and before any new + * inner tag is closed, at which point the tag will be set to NULL. + * + * For our current purposes this is not a problem, but we may implement a + * tag tracking stack in the future for completeness. + */ + const char *tag() const { + return htmlparser_tag(parser_); + } + + /* Returns the current attribute name if inside an attribute name or an + * attribute value. Returns NULL otherwise. */ + const char *attribute() const { + return htmlparser_attr(parser_); + } + + /* Returns the contents of the current attribute value. */ + const char *value() const { + return htmlparser_value(parser_); + } + + /* Returns true if inside javascript. 
This can be a javascript block, a + * javascript attribute value or the parser may just be in javascript mode + * (HtmlParser::MODE_JS) */ + bool InJavascript() const { + return static_cast(htmlparser_in_js(parser_)); + } + + /* Returns true if the parser is currently inside a CSS construct. + * + * Currently this can be either a STYLE tag, a STYLE attribute or the fact + * that the parser was reset using MODE_CSS using ResetMode(). + */ + bool InCss() const { + return static_cast(htmlparser_in_css(parser_)); + } + + /* Returns true if the current attribute is quoted */ + bool IsAttributeQuoted() const { + return static_cast(htmlparser_is_attr_quoted(parser_)); + } + + /* Returns true if the parser is inside a js string literal. + */ + bool IsJavascriptQuoted() const { + return static_cast(htmlparser_is_js_quoted(parser_)); + } + + /* Returns the index within the current value or -1 if the parser is not + * inside an attribute value */ + int ValueIndex() const { + return htmlparser_value_index(parser_); + } + + /* Returns true if this is the first character of a url inside an attribute. + * + * This function can be used by an html sanitizer or auto escaping system as + * a hint that it should validate the url for a whitelist of protocol + * handlers and for well-formedness, or that it should just escape a + * component of it. + * + * For attributes that expect a url this will return true if we are at the + * first character of the attribute, but for the special case of a meta + * redirect tag some analysis is made in order to verify if we are at the + * start of a url or not. + * + * For any other attributes, the result will always be false. + * + */ + bool IsUrlStart() const { + return htmlparser_is_url_start(parser_); + } + + /* Returns the current attribute type. 
+ * + * The attribute type can be one of: + * ATTR_NONE - not inside an attribute + * ATTR_REGULAR - Inside a normal attribute + * ATTR_URI - Inside an attribute that accepts a uri + * ATTR_JS - Inside a javascript attribute + * ATTR_STYLE - Inside a css style attribute + * */ + int AttributeType() const { + return htmlparser_attr_type(parser_); + } + + /* Return the current line number. */ + int line_number() const { + return htmlparser_get_line_number(parser_); + } + + /* Set the current line number. */ + void set_line_number(int line) { + return htmlparser_set_line_number(parser_, line); + } + + /* Return the current column number. */ + int column_number() const { + return htmlparser_get_column_number(parser_); + } + + /* Set the current column number. */ + void set_column_number(int column) { + return htmlparser_set_column_number(parser_, column); + } + + /* Retrieve a human readable error message in case an error occurred. + * + * NULL is returned if the parser didn't encounter an error. + */ + const char *GetErrorMessage() { + return htmlparser_get_error_msg(parser_); + } + + /* Returns the current state the javascript parser is in. + * + * Should only be used for testing. + */ + int javascript_state() const { + return htmlparser_js_state(parser_); + }; + + /* Resets the parser to its initial state and changes the parser mode. + * + * Internal state (tag name, attribute name, state of statemachine) is + * reset as though the object was just created. + * + * Available modes: + * MODE_HTML - Parses html text + * MODE_JS - Parses javascript files + * MODE_CSS - Parses CSS files. No actual parsing is done + * but InCss() always returns true. + * MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To + * be used in a template expanded in the + * following context: + */ + void ResetMode(enum Mode mode) { + return htmlparser_reset_mode(parser_, mode); + } + + /* Resets the parser to its initial state and to the default mode, which is + * MODE_HTML. 
+ * + * All internal context like tag name, attribute name or the state of the + * statemachine are reset to their original values as if the object was just + * created. + */ + void Reset() { + return htmlparser_reset(parser_); + } + + /* Invoked when text is inserted by the caller. + * + * Should be called before a template directive that expands to content is + * found. This changes the current state by following the default rule, + * ensuring we stay in sync with the template. + * + * Returns true if template directives are accepted for this state and + * false if they are not, which should result in an error condition. + * + * Right now the only case being handled are unquoted attribute values and + * it always returns true. In the future we can handle more cases and + * restrict the states where we allow template directives by returning false + * for those. + */ + bool InsertText() { + return static_cast(htmlparser_insert_text(parser_)); + } + + /* Copies the context of the HtmlParser object referenced in source to the + * current object. + */ + void CopyFrom(const HtmlParser *source) { + assert(this != source); + assert(source != NULL); + htmlparser_copy(parser_, source->parser_); + } + + ~HtmlParser() { + htmlparser_delete(parser_); + }; + + + private: + htmlparser_ctx *parser_; + HtmlParser(const HtmlParser&); // disallow copy + void operator=(const HtmlParser&); // and assign + +}; + +} + +#endif // STREAMHTMLPARSER_HTMLPARSER_CPP_H__ diff --git a/streamhtmlparser/include/htmlparser.h b/streamhtmlparser/include/htmlparser.h deleted file mode 100644 index 58db4a5..0000000 --- a/streamhtmlparser/include/htmlparser.h +++ /dev/null @@ -1,397 +0,0 @@ -/* Copyright (c) 2007, Google Inc. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Google Inc. nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * --- - * Author: Filipe Almeida - */ - -#ifndef STREAMHTMLPARSER_HTMLPARSER_H -#define STREAMHTMLPARSER_HTMLPARSER_H - -#include "statemachine.h" -#include "jsparser.h" - -/* entity filter */ - -/* String sizes used in htmlparser and entityfilter structures including the - * NULL terminator. 
- */ -#define HTMLPARSER_MAX_STRING STATEMACHINE_RECORD_BUFFER_SIZE -#define HTMLPARSER_MAX_ENTITY_SIZE 10 - - -enum htmlparser_state_external_enum { - HTMLPARSER_STATE_TEXT, - HTMLPARSER_STATE_TAG, - HTMLPARSER_STATE_ATTR, - HTMLPARSER_STATE_VALUE, - HTMLPARSER_STATE_COMMENT, - HTMLPARSER_STATE_JS_FILE, - HTMLPARSER_STATE_CSS_FILE, - HTMLPARSER_STATE_ERROR -}; - -enum htmlparser_mode { - HTMLPARSER_MODE_HTML, - HTMLPARSER_MODE_JS, - HTMLPARSER_MODE_CSS, - HTMLPARSER_MODE_HTML_IN_TAG -}; - -enum htmlparser_attr_type { - HTMLPARSER_ATTR_NONE, - HTMLPARSER_ATTR_REGULAR, - HTMLPARSER_ATTR_URI, - HTMLPARSER_ATTR_JS, - HTMLPARSER_ATTR_STYLE -}; - - -/* TODO(falmeida): Maybe move some of these declaration to the .c and only keep - * a forward declaration in here, since these structures are private. - */ - -/* entityfilter context structure. - * - * The entity filter collection of routines provide a way to decode html - * entities from an html document in a streaming way. - * - * The html_process() function receives a character at a time from the input - * stream and returns 0 or more characters which should be appended to the - * resulting decoded document. - * - * Currently this collection of functions are only exported for testing purposes - * and shouldn't be called from outside of htmlparser.c. - * - * Since we really only use these functions with the very specific purpose of - * decoding html entities for javascript attributes, only a small subset of - * entities are supported: <, >, "e;, &, &apos, and the numeric - * character references for both decimal and hexadecimal. - */ -typedef struct entityfilter_ctx_s { - - /* Current position into the buffer. */ - int buffer_pos; - - /* True if currently processing an html entity. */ - int in_entity; - - /* Temporary character buffer that is used while processing html entities. - */ - char buffer[HTMLPARSER_MAX_ENTITY_SIZE]; - - /* String buffer returned to the application after we decoded an html - * entity. 
- */ - char output[HTMLPARSER_MAX_ENTITY_SIZE]; -} entityfilter_ctx; - -/* Resets the entityfilter to its initial state so it can be reused. - */ -void entityfilter_reset(entityfilter_ctx *ctx); - -/* Initializes a new entity filter object. - */ -entityfilter_ctx *entityfilter_new(void); - -/* Deallocates an entity filter object. - */ -void entityfilter_delete(entityfilter_ctx *ctx); - -/* Copies the context of the entityfilter pointed to by src to the entityfilter - * dst. - */ -void entityfilter_copy(entityfilter_ctx *dst, entityfilter_ctx *src); - -/* Processes a character from the input stream and decodes any html entities - * in the accumulated buffer. - * - * Returns a reference to a string that points to an internal buffer. This - * buffer will be changed after every call to entityfilter_process(). As - * such this string should be duplicated before subsequent calls to - * entityfilter_process(). - */ -const char *entityfilter_process(entityfilter_ctx *ctx, char c); - - -/* html parser */ - -/* Stores the context of the html parser. - * If this structure is changed, htmlparser_new(), htmlparser_copy() and - * htmlparser_reset() should be updated accordingly. - */ -typedef struct htmlparser_ctx_s { - - /* Holds a reference to the statemachine context. */ - statemachine_ctx *statemachine; - - /* Holds a reference to the statemachine definition in use. Right now this is - * only used so we can deallocate it at the end. - * - * It should be readonly and contain the same values across jsparser - * instances. - */ - /* TODO(falmeida): Change statemachine_def to const. */ - statemachine_definition *statemachine_def; - - /* Holds a reference to the javascript parser. */ - jsparser_ctx *jsparser; - - /* Holds a reference to the entity filter. Used for decoding html entities - * inside javascript attributes. */ - entityfilter_ctx *entityfilter; - - /* Offset into the current attribute value where 0 is the first character in - * the value. 
*/ - int value_index; - - /* True if currently processing javascript. */ - int in_js; - - /* Current tag name. */ - char tag[HTMLPARSER_MAX_STRING]; - - /* Current attribute name. */ - char attr[HTMLPARSER_MAX_STRING]; - - /* Contents of the current value capped to HTMLPARSER_MAX_STRING. */ - char value[HTMLPARSER_MAX_STRING]; - -} htmlparser_ctx; - -/* Resets the parser to its initial state and to the default mode, which - * is MODE_HTML. - * - * All internal context like tag name, attribute name or the state of the - * statemachine are reset to its original values as if the object was just - * created. - */ -void htmlparser_reset(htmlparser_ctx *ctx); - -/* Resets the parser to its initial state and changes the parser mode. - * All internal context like tag name, attribute name or the state of the - * statemachine are reset to their original values as if the object was just - * created. - * - * Available modes: - * HTMLPARSER_MODE_HTML - Parses html text - * HTMLPARSER_MODE_JS - Parses javascript files - * HTMLPARSER_MODE_CSS - Parses CSS files. No actual parsing is actually done - * but htmlparser_in_css() always returns true. - * HTMLPARSER_MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To - * be used in a template expanded in the - * following context: - * - */ -void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode); - -/* Initializes a new htmlparser instance. - * - * Returns a pointer to the new instance or NULL if the initialization fails. - * Initialization failure is fatal, and if this function fails it may not - * deallocate all previsouly allocated memory. - */ -htmlparser_ctx *htmlparser_new(void); - -/* Copies the context of the htmlparser pointed to by src to the htmlparser dst. - * - * Also copies over the instances of the state machine, the jsparser and the - * entity filter but not the statemachine definition since this one is read - * only. 
- */ -void htmlparser_copy(htmlparser_ctx *dst, const htmlparser_ctx *src); - -/* Receives an htmlparser context and returns the current html state. - * - * The return value will be one of the states of htmlparser_state_external_enum. - */ -int htmlparser_state(htmlparser_ctx *ctx); - -/* Parses the input html stream and returns the finishing state. - * - * Returns HTMLPARSER_ERROR if unable to parse the input. If htmlparser_parse() - * is called after an error situation was encountered the behaviour is - * unspecified. At this point, htmlparser_reset() or htmlparser_reset_mode() - * can be called to reset the state. - */ -int htmlparser_parse(htmlparser_ctx *ctx, const char *str, int size); - -/* Returns true if the parser is inside an attribute value and the value is - * surrounded by single or double quotes. */ -int htmlparser_is_attr_quoted(htmlparser_ctx *ctx); - -/* Returns true if the parser is currently in javascript. This can be a - * an attribute that takes javascript, a javascript block or the parser - * can just be in MODE_JS. */ -int htmlparser_in_js(htmlparser_ctx *ctx); - -/* Returns the current tag or NULL if not available or we haven't seen the - * entire tag yet. - * - * There is no stack implemented because we currently don't have a need for - * it, which means tag names are tracked only one level deep. - * - * This is better understood by looking at the following example: - * - * - * [tag=b] - * - * [tag=i] - * - * [tag=NULL] - * - * - * The tag is correctly filled inside the tag itself and before any new inner - * tag is closed, at which point the tag will be null. - * - * For our current purposes this is not a problem, but we may implement a tag - * tracking stack in the future for completeness. - * - */ -const char *htmlparser_tag(htmlparser_ctx *ctx); - -/* Returns the current attribute name if after an attribute name or in an - * attribute value. Returns NULL otherwise. 
*/ -const char *htmlparser_attr(htmlparser_ctx *ctx); - -/* Returns the contents of the current attribute value. - * - * Returns NULL if not inside an attribute value. - */ -const char *htmlparser_value(htmlparser_ctx *ctx); - -/* Returns true if the parser is currently inside a CSS construct. - * - * Currently this can be either a STYLE tag, a STYLE attribute or the fact that - * the parser was reset in HTMLPARSER_MODE_CSS using - * htmlparser_reset_mode(). - */ -int htmlparser_in_css(htmlparser_ctx *ctx); - -/* Returns the current state of the javascript state machine. - * - * Currently only present for testing purposes. - */ -int htmlparser_js_state(htmlparser_ctx *ctx); - -/* Returns non-zero if currently inside a javascript string literal and zero - * otherwise. - */ -int htmlparser_is_js_quoted(htmlparser_ctx *ctx); - -/* Returns non-zero if currently inside an attribute value and zero otherwise. - */ -int htmlparser_value_index(htmlparser_ctx *ctx); - -/* Returns true if this is the first character of a url inside an attribute. - * - * This function can be used by an html sanitizer or auto escaping system as a - * hint that it should validate the url for a whitelist of protocol handlers and - * for well-formedness, or that it should just escape a component of it. - * - * For attributes that expect a URL, this will return true if we are at the - * first character of the URL, false otherwise. - * For most attributes, this is the same as checking that we are at the first - * character of the attribute value but it also works correctly for the - * "content" attribute of the "meta" tag where the URL follows some earlier - * content. - * e.g: - * - * For any other attributes, the result will always be false. - */ -int htmlparser_is_url_start(htmlparser_ctx *ctx); - -/* Returns the current attribute type. - * - * The attribute type can be one of: - * HTMLPARSER_ATTR_NONE - not inside an attribute. - * HTMLPARSER_ATTR_REGULAR - Inside a normal attribute. 
- * HTMLPARSER_ATTR_URI - Inside an attribute that accepts a uri. - * HTMLPARSER_ATTR_JS - Inside a javascript attribute. - * HTMLPARSER_ATTR_STYLE - Inside a css style attribute. - */ -int htmlparser_attr_type(htmlparser_ctx *ctx); - -/* Return the current line number. */ -int htmlparser_get_line_number(htmlparser_ctx *ctx); - -/* Set the current line number. */ -void htmlparser_set_line_number(htmlparser_ctx *ctx, int line); - -/* Return the current column number. */ -int htmlparser_get_column_number(htmlparser_ctx *ctx); - -/* Set the current column number. */ -void htmlparser_set_column_number(htmlparser_ctx *ctx, int column); - -/* Retrieve a human readable error message in case an error occurred. - * - * NULL is returned if the parser didn't encounter an error. - */ -const char *htmlparser_get_error_msg(htmlparser_ctx *ctx); - -/* Invoked by the caller when text is expanded by the caller. - * - * Should be invoked when a template directive that expands to content is - * executed but we don't provide this content to the parser itself. This changes - * the current state by following the default rule, ensuring we stay in sync - * with the template. - * - * Returns 1 if template directives are accepted for this state and 0 if they - * are not, which should result in an error condition. - * - * Right now the only case being handled are unquoted attribute values and it - * always returns 1. When insert_text() is called after the equals sign, we - * assume some text was consumed and we are now in the middle of the attribute - * value itself. Example: - * - * - * - * The template calls insert_text() when it encounters $HREF_VALUE. If it didn't - * the parser would only have seen the following html: - * - * - * - * and would interpret alt=alternate_text as the value of the href attribute. - */ -int htmlparser_insert_text(htmlparser_ctx *ctx); - -/* Deallocates an htmlparser context object. 
- */ -void htmlparser_delete(htmlparser_ctx *ctx); - -#define htmlparser_parse_chr(a,b) htmlparser_parse(a, &(b), 1); -#ifdef __cplusplus -#define htmlparser_parse_str(a,b) htmlparser_parse(a, b, \ - static_cast<int>(strlen(b))); -#else -#define htmlparser_parse_str(a,b) htmlparser_parse(a, b, (int)strlen(b)); -#endif - -#endif /* STREAMHTMLPARSER_HTMLPARSER_H */ diff --git a/streamhtmlparser/include/htmlparser_cpp.h b/streamhtmlparser/include/htmlparser_cpp.h deleted file mode 100644 index 3802233..0000000 --- a/streamhtmlparser/include/htmlparser_cpp.h +++ /dev/null @@ -1,322 +0,0 @@ -// Copyright (c) 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// --- -// Author: Filipe Almeida -// -// c++ bindings for htmlparser. - -#ifndef STREAMHTMLPARSER_HTMLPARSER_CPP_H__ -#define STREAMHTMLPARSER_HTMLPARSER_CPP_H__ - -#include <string> -#include <assert.h> -extern "C" { - #include "htmlparser.h" - #include "jsparser.h" -} - -namespace streamhtmlparser { - -class JavascriptParser { - public: - enum State { - STATE_TEXT = JSPARSER_STATE_TEXT, - STATE_Q = JSPARSER_STATE_Q, - STATE_DQ = JSPARSER_STATE_DQ, - STATE_REGEXP = JSPARSER_STATE_REGEXP, - STATE_COMMENT = JSPARSER_STATE_COMMENT - }; -}; - -class HtmlParser { - public: - - /* html states */ - enum State { - STATE_TEXT = HTMLPARSER_STATE_TEXT, - STATE_TAG = HTMLPARSER_STATE_TAG, - STATE_ATTR = HTMLPARSER_STATE_ATTR, - STATE_VALUE = HTMLPARSER_STATE_VALUE, - STATE_COMMENT = HTMLPARSER_STATE_COMMENT, - STATE_JS_FILE = HTMLPARSER_STATE_JS_FILE, - STATE_CSS_FILE = HTMLPARSER_STATE_CSS_FILE, - STATE_ERROR = HTMLPARSER_STATE_ERROR - }; - - /* attribute types */ - enum AttributeType { - ATTR_NONE = HTMLPARSER_ATTR_NONE, - ATTR_REGULAR = HTMLPARSER_ATTR_REGULAR, - ATTR_URI = HTMLPARSER_ATTR_URI, - ATTR_JS = HTMLPARSER_ATTR_JS, - ATTR_STYLE = HTMLPARSER_ATTR_STYLE - }; - - /* Parser modes */ - enum Mode { - MODE_HTML = HTMLPARSER_MODE_HTML, - MODE_JS = HTMLPARSER_MODE_JS, - MODE_CSS = HTMLPARSER_MODE_CSS, - MODE_HTML_IN_TAG = HTMLPARSER_MODE_HTML_IN_TAG - }; - - HtmlParser() { - parser_ = htmlparser_new(); - assert(parser_ != NULL); - }; - - /* Parses the 
input html stream and returns the finishing state. - * - * Returns HtmlParser::STATE_ERROR if unable to parse the input. If - * htmlparser_parse() is called after an error situation was encountered - * the behaviour is unspecified. At this point, Reset() or ResetMode() - * can be called to reset the state so it can be used to parse a new file. - */ - int Parse(const char *str, int len) { - return htmlparser_parse(parser_, str, len); - }; - - int Parse(const std::string &str) { - return Parse(str.c_str(), static_cast<int>(str.length())); - }; - - /* Returns the current state the parser is in */ - int state() const { - return htmlparser_state(parser_); - }; - - /* Returns the current tag or NULL if not available. - * - * There is no stack implemented because we currently don't have a need for - * it, which means tag names are tracked only one level deep. - * - * This is better understood by looking at the following example: - * - * - * [tag=b] - * - * [tag=i] - * - * [tag=NULL] - * - * - * The tag is correctly filled inside the tag itself and before any new - * inner tag is closed, at which point the tag will be set to NULL. - * - * For our current purposes this is not a problem, but we may implement a - * tag tracking stack in the future for completeness. - */ - const char *tag() const { - return htmlparser_tag(parser_); - } - - /* Returns the current attribute name if inside an attribute name or an - * attribute value. Returns NULL otherwise. */ - const char *attribute() const { - return htmlparser_attr(parser_); - } - - /* Returns the contents of the current attribute value. */ - const char *value() const { - return htmlparser_value(parser_); - } - - /* Returns true if inside javascript. 
This can be a javascript block, a - * javascript attribute value or the parser may just be in javascript mode - * (HtmlParser::MODE_JS) */ - bool InJavascript() const { - return static_cast<bool>(htmlparser_in_js(parser_)); - } - - /* Returns true if the parser is currently inside a CSS construct. - * - * Currently this can be either a STYLE tag, a STYLE attribute or the fact - * that the parser was reset using MODE_CSS using ResetMode(). - */ - bool InCss() const { - return static_cast<bool>(htmlparser_in_css(parser_)); - } - - /* Returns true if the current attribute is quoted */ - bool IsAttributeQuoted() const { - return static_cast<bool>(htmlparser_is_attr_quoted(parser_)); - } - - /* Returns true if the parser is inside a js string literal. - */ - bool IsJavascriptQuoted() const { - return static_cast<bool>(htmlparser_is_js_quoted(parser_)); - } - - /* Returns the index within the current value or -1 if the parser is not - * inside an attribute value */ - int ValueIndex() const { - return htmlparser_value_index(parser_); - } - - /* Returns true if this is the first character of a url inside an attribute. - * - * This function can be used by an html sanitizer or auto escaping system as - * a hint that it should validate the url for a whitelist of protocol - * handlers and for well-formedness, or that it should just escape a - * component of it. - * - * For attributes that expect a url this will return true if we are at the - * first character of the attribute, but for the special case of a meta - * redirect tag some analysis is made in order to verify if we are at the - * start of a url or not. - * - * For any other attributes, the result will always be false. - * - */ - bool IsUrlStart() const { - return htmlparser_is_url_start(parser_); - } - - /* Returns the current attribute type. 
- * - * The attribute type can be one of: - * ATTR_NONE - not inside an attribute - * ATTR_REGULAR - Inside a normal attribute - * ATTR_URI - Inside an attribute that accepts a uri - * ATTR_JS - Inside a javascript attribute - * ATTR_STYLE - Inside a css style attribute - * */ - int AttributeType() const { - return htmlparser_attr_type(parser_); - } - - /* Return the current line number. */ - int line_number() const { - return htmlparser_get_line_number(parser_); - } - - /* Set the current line number. */ - void set_line_number(int line) { - return htmlparser_set_line_number(parser_, line); - } - - /* Return the current column number. */ - int column_number() const { - return htmlparser_get_column_number(parser_); - } - - /* Set the current line number. */ - void set_column_number(int column) { - return htmlparser_set_column_number(parser_, column); - } - - /* Retrieve a human readable error message in case an error occurred. - * - * NULL is returned if the parser didn't encounter an error. - */ - const char *GetErrorMessage() { - return htmlparser_get_error_msg(parser_); - } - - /* Returns the current state the javascript parser is in. - * - * Should only be used for testing. - */ - int javascript_state() const { - return htmlparser_js_state(parser_); - }; - - /* Resets the parser to it's initial state and changes the parser mode. - * - * Internal state (tag name, attribute name, state of statemachine) is - * reset as * though the object was just created. - * - * Available modes: - * MODE_HTML - Parses html text - * MODE_JS - Parses javascript files - * MODE_CSS - Parses CSS files. No actual parsing is actually done - * but InCss() always returns true. - * MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To - * be used in a template expanded in the - * following context: - */ - void ResetMode(enum Mode mode) { - return htmlparser_reset_mode(parser_, mode); - } - - /* Resets the parser to it's initial state and to the default mode, which is - * MODE_HTML. 
- * - * All internal context like tag name, attribute name or the state of the - * statemachine are reset to their original values as if the object was just - * created. - */ - void Reset() { - return htmlparser_reset(parser_); - } - - /* Invoked when text is inserted by the caller. - * - * Should be called before a template directive that expands to content is - * found. This changes the current state by following the default rule, - * ensuring we stay in sync with the template. - * - * Returns true if template directives are accepted for this state and - * false if they are not, which should result in an error condition. - * - * Right now the only case being handled is unquoted attribute values and - * it always returns true. In the future we can handle more cases and - * restrict the states where we allow template directives by returning false - * for those. - */ - bool InsertText() { - return static_cast<bool>(htmlparser_insert_text(parser_)); - } - - /* Copies the context of the HtmlParser object referenced in source to the - * current object. - */ - void CopyFrom(const HtmlParser *source) { - assert(this != source); - assert(source != NULL); - htmlparser_copy(parser_, source->parser_); - } - - ~HtmlParser() { - htmlparser_delete(parser_); - }; - - - private: - htmlparser_ctx *parser_; - HtmlParser(const HtmlParser&); // disallow copy - void operator=(const HtmlParser&); // and assign - -}; - -} - -#endif // STREAMHTMLPARSER_HTMLPARSER_CPP_H__ diff --git a/streamhtmlparser/include/jsparser.h b/streamhtmlparser/include/jsparser.h deleted file mode 100644 index 4077aa4..0000000 --- a/streamhtmlparser/include/jsparser.h +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright (c) 2007, Google Inc. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Google Inc. nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * --- - * Author: Filipe Almeida - */ - -#ifndef STREAMHTMLPARSER_JSPARSER_H -#define STREAMHTMLPARSER_JSPARSER_H - -#include "statemachine.h" - -/* Size of the ring buffer used to lookup the last token in the javascript - * stream. The size is pretty much arbitrary at this point but must be bigger - * than the biggest token we want to lookup plus 3: Two delimiters plus an empty - * ring buffer slot. 
*/ -#define JSPARSER_RING_BUFFER_SIZE 18 - -enum js_state_external_enum { - JSPARSER_STATE_TEXT, - JSPARSER_STATE_Q, - JSPARSER_STATE_DQ, - JSPARSER_STATE_REGEXP, - JSPARSER_STATE_COMMENT -}; - -/* Stores the context of the javascript parser. - * - * If this structure is changed, jsparser_new(), jsparser_copy() and - * jsparser_reset() should be updated accordingly. - */ -typedef struct jsparser_ctx_s { - - /* Reference to the statemachine context. */ - statemachine_ctx *statemachine; - - /* Reference to the statemachine definition. - * - * It should be readonly and contain the same values across jsparser - * instances. - */ - /* TODO(falmeida): Change statemachine_def to const. */ - statemachine_definition *statemachine_def; - - /* Index to the start of the buffer. */ - int buffer_start; - - /* Index the current writing position (end of the buffer plus one). */ - int buffer_end; - - /* Ring buffer used to lookup the last token. */ - char buffer[JSPARSER_RING_BUFFER_SIZE]; - -} jsparser_ctx; - - -void jsparser_reset(jsparser_ctx *ctx); -jsparser_ctx *jsparser_new(void); - -/* Returns a pointer to a context which is a duplicate of the jsparser src. - */ -jsparser_ctx *jsparser_duplicate(jsparser_ctx *src); - -/* Copies the context of the jsparser pointed to by src to the jsparser dst. - */ -void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src); -int jsparser_state(jsparser_ctx *ctx); -int jsparser_parse(jsparser_ctx *ctx, const char *str, int size); - -void jsparser_delete(jsparser_ctx *ctx); - -/** - * Ring buffer functions. - * - * These functions are only exported for testing and should not be called from - * outside of jsparser.c in production code. - */ - -/* Appends a character to the ring buffer. - * - * Sequences of whitespaces and newlines are folded into one character. - */ -void jsparser_buffer_append_chr(jsparser_ctx *js, char chr); - -/* Appends a string to the ring buffer. - * - * Sequences of whitespaces and newlines are folded into one character. 
- */ -void jsparser_buffer_append_str(jsparser_ctx *js, const char *str); - -/* Returns the last appended character and removes it from the buffer. If the - * buffer is empty, then it returns ASCII 0 ('\0'). - */ -char jsparser_buffer_pop(jsparser_ctx *js); - -/* Returns the value of the character at a certain index in the buffer or an - * ASCII 0 ('\0') character if the index is extends beyond the size of the - * buffer, either because we don't have as many characters in the buffer, or - * because the index points to a place bigger than the size of the buffer.. - * - * Index positions must be negative, where -1 is the last character appended to - * the buffer. - */ -char jsparser_buffer_get(jsparser_ctx *js, int pos); - -/* Sets the value of the character at a certain index in the buffer. Returns - * true if the write was successful or false if there was an attempt to write - * outside of the buffer boundaries. - * - * Index positions are negative, were -1 is the last character appended to the - * buffer. Using positive integers for the index will result in undefined - * behaviour. - */ -int jsparser_buffer_set(jsparser_ctx *js, int pos, char value); - -/* Copies a slice of the buffer to the string pointed to by output. start and - * end are the indexes of the sliced region. If the start argument extends - * beyond the beginning of the buffer, the slice will only contain characters - * starting from beginning of the buffer. - */ -void jsparser_buffer_slice(jsparser_ctx *js, char *buffer, int start, int end); - -/* Copy the last javascript identifier or keyword found in the buffer to the - * string pointed by identifier. 
- */ -int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier); - - -#define jsparser_parse_chr(a,b) jsparser_parse(a, &(b), 1); -#ifdef __cplusplus -#define jsparser_parse_str(a,b) jsparser_parse(a, b, \ - static_cast<int>(strlen(b))); -#else -#define jsparser_parse_str(a,b) jsparser_parse(a, b, (int)strlen(b)); -#endif - -#endif /* STREAMHTMLPARSER_JSPARSER_H */ diff --git a/streamhtmlparser/include/statemachine.h b/streamhtmlparser/include/statemachine.h deleted file mode 100644 index a05ffe7..0000000 --- a/streamhtmlparser/include/statemachine.h +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2007, Google Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Google Inc. nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * --- - * Author: Filipe Almeida - */ - -#ifndef STREAMHTMLPARSER_STATEMACHINE_H -#define STREAMHTMLPARSER_STATEMACHINE_H - -/* TODO(falmeida): I'm not sure about these limits, but since right now we only - * have 24 states it should be fine */ - -enum { - STATEMACHINE_ERROR = 127 -}; - -#define STATEMACHINE_RECORD_BUFFER_SIZE 256 - -#define STATEMACHINE_MAX_STR_ERROR 80 - -struct statemachine_ctx_s; - -typedef void(*state_event_function)(struct statemachine_ctx_s *, int, char, - int); - -typedef struct statemachine_definition_s { - int num_states; - const int* const* transition_table; - - /* Array containing the name of the states as a C string. - * This field is optional and if not in use it should be set to NULL. - */ - const char* const* state_names; - state_event_function *in_state_events; - state_event_function *enter_state_events; - state_event_function *exit_state_events; -} statemachine_definition; - -typedef struct statemachine_ctx_s { - int current_state; - int next_state; - statemachine_definition *definition; - char current_char; - - /* Current line number. */ - int line_number; - - /* Current column number. */ - int column_number; - char record_buffer[STATEMACHINE_RECORD_BUFFER_SIZE]; - size_t record_pos; - - /* True if we are recording the stream to record_buffer. 
*/ - int recording; - - /* In case there was an error (we are in state STATEMACHINE_ERROR), it will - * contain a human readable description of the error. - */ - char error_msg[STATEMACHINE_MAX_STR_ERROR]; - - /* Storage space for the layer above. */ - void *user; -} statemachine_ctx; - -/* Populates the statemachine definition. - * - * Receives a transition table and an optional array of state names. It uses - * this data to populate the state machine definition. - * - * The transition table structure is a list of lists of ints (int **). The - * outer list indexes the source state and the inner list contains the - * destination state for each of the possible input characters: - * - * const int* const* transitions[source][input] == destination. - * - * The optional argument state_names points to a list of strings containing - * human readable state names. These strings are used when reporting error - * messages. - */ -void statemachine_definition_populate(statemachine_definition *def, - const int* const* transition_table, - const char* const* state_names); - -void statemachine_in_state(statemachine_definition *def, int st, - state_event_function func); -void statemachine_enter_state(statemachine_definition *def, int st, - state_event_function func); -void statemachine_exit_state(statemachine_definition *def, int st, - state_event_function func); - -statemachine_definition *statemachine_definition_new(int states); -void statemachine_definition_delete(statemachine_definition *def); - -int statemachine_get_state(statemachine_ctx *ctx); -void statemachine_set_state(statemachine_ctx *ctx, int state); - -void statemachine_start_record(statemachine_ctx *ctx); -const char *statemachine_stop_record(statemachine_ctx *ctx); -const char *statemachine_record_buffer(statemachine_ctx *ctx); - -/* Returns the the number of characters currently stored in the record buffer. 
- */ -static inline size_t statemachine_record_length(statemachine_ctx *ctx) { - return ctx->record_pos + 1; -} - -/* Return the current line number. */ -static inline int statemachine_get_line_number(statemachine_ctx *ctx) { - return ctx->line_number; -} - -/* Set the current line number. */ -static inline void statemachine_set_line_number(statemachine_ctx *ctx, - int line) { - ctx->line_number = line; -} - -/* Return the current column number. */ -static inline int statemachine_get_column_number(statemachine_ctx *ctx) { - return ctx->column_number; -} - -/* Set the current column number. */ -static inline void statemachine_set_column_number(statemachine_ctx *ctx, - int column) { - ctx->column_number = column; -} - - -/* Retrieve a human readable error message in case an error occurred. - * - * NULL is returned if the parser didn't encounter an error. - */ -static inline const char *statemachine_get_error_msg(statemachine_ctx *ctx) { - if (ctx->next_state == STATEMACHINE_ERROR) { - return ctx->error_msg; - } else { - return NULL; - } -} - -/* Reset the statemachine. - * - * The state is set to the initialization values. This includes setting the - * state to the default state (0), stopping recording and setting the line - * number to 1. - */ -void statemachine_reset(statemachine_ctx *ctx); - -/* Initializes a new statemachine. Receives a statemachine definition object - * that should have been initialized with statemachine_definition_new() and a - * user reference to be used by the caller. - * - * Returns NULL if initialization fails. - * - * Initialization failure is fatal, and if this function fails it may not - * deallocate all previsouly allocated memory. - */ -statemachine_ctx *statemachine_new(statemachine_definition *def, - void *user); - -/* Returns a pointer to a context which is a duplicate of the statemachine src. - * The statemachine definition and the user pointer have to be provided since - * these references are not owned by the statemachine itself. 
- */ -statemachine_ctx *statemachine_duplicate(statemachine_ctx *ctx, - statemachine_definition *def, - void *user); - -/* Copies the context of the statemachine pointed to by src to the statemachine - * provided by dst. - * The statemachine definition and the user pointer have to be provided since - * these references are not owned by the statemachine itself. - */ -void statemachine_copy(statemachine_ctx *dst, - statemachine_ctx *src, - statemachine_definition *def, - void *user); - -int statemachine_parse(statemachine_ctx *ctx, const char *str, int size); - -void statemachine_delete(statemachine_ctx *ctx); - - -/***** - * The following functions are only exported for testing purposes and should - * be treated as private. */ - - -/* Encode the character as an escaped C string. - * - * Encode the character chr into the string output. Writes at most len - * characters to the output string but makes sure output is NULL terminated. - */ -void statemachine_encode_char(char chr, char *output, size_t len); - -#endif /* STREAMHTMLPARSER_STATEMACHINE_H */ diff --git a/streamhtmlparser/jsparser.h b/streamhtmlparser/jsparser.h new file mode 100644 index 0000000..4077aa4 --- /dev/null +++ b/streamhtmlparser/jsparser.h @@ -0,0 +1,163 @@ +/* Copyright (c) 2007, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. 
nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Filipe Almeida + */ + +#ifndef STREAMHTMLPARSER_JSPARSER_H +#define STREAMHTMLPARSER_JSPARSER_H + +#include "statemachine.h" + +/* Size of the ring buffer used to lookup the last token in the javascript + * stream. The size is pretty much arbitrary at this point but must be bigger + * than the biggest token we want to lookup plus 3: Two delimiters plus an empty + * ring buffer slot. */ +#define JSPARSER_RING_BUFFER_SIZE 18 + +enum js_state_external_enum { + JSPARSER_STATE_TEXT, + JSPARSER_STATE_Q, + JSPARSER_STATE_DQ, + JSPARSER_STATE_REGEXP, + JSPARSER_STATE_COMMENT +}; + +/* Stores the context of the javascript parser. + * + * If this structure is changed, jsparser_new(), jsparser_copy() and + * jsparser_reset() should be updated accordingly. + */ +typedef struct jsparser_ctx_s { + + /* Reference to the statemachine context. */ + statemachine_ctx *statemachine; + + /* Reference to the statemachine definition. + * + * It should be readonly and contain the same values across jsparser + * instances. 
+ */ + /* TODO(falmeida): Change statemachine_def to const. */ + statemachine_definition *statemachine_def; + + /* Index to the start of the buffer. */ + int buffer_start; + + /* Index of the current writing position (end of the buffer plus one). */ + int buffer_end; + + /* Ring buffer used to lookup the last token. */ + char buffer[JSPARSER_RING_BUFFER_SIZE]; + +} jsparser_ctx; + + +void jsparser_reset(jsparser_ctx *ctx); +jsparser_ctx *jsparser_new(void); + +/* Returns a pointer to a context which is a duplicate of the jsparser src. + */ +jsparser_ctx *jsparser_duplicate(jsparser_ctx *src); + +/* Copies the context of the jsparser pointed to by src to the jsparser dst. + */ +void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src); +int jsparser_state(jsparser_ctx *ctx); +int jsparser_parse(jsparser_ctx *ctx, const char *str, int size); + +void jsparser_delete(jsparser_ctx *ctx); + +/** + * Ring buffer functions. + * + * These functions are only exported for testing and should not be called from + * outside of jsparser.c in production code. + */ + +/* Appends a character to the ring buffer. + * + * Sequences of whitespaces and newlines are folded into one character. + */ +void jsparser_buffer_append_chr(jsparser_ctx *js, char chr); + +/* Appends a string to the ring buffer. + * + * Sequences of whitespaces and newlines are folded into one character. + */ +void jsparser_buffer_append_str(jsparser_ctx *js, const char *str); + +/* Returns the last appended character and removes it from the buffer. If the + * buffer is empty, then it returns ASCII 0 ('\0'). + */ +char jsparser_buffer_pop(jsparser_ctx *js); + +/* Returns the value of the character at a certain index in the buffer or an + * ASCII 0 ('\0') character if the index extends beyond the size of the + * buffer, either because we don't have as many characters in the buffer, or + * because the index points to a place bigger than the size of the buffer.
+ * + * Index positions must be negative, where -1 is the last character appended to + * the buffer. + */ +char jsparser_buffer_get(jsparser_ctx *js, int pos); + +/* Sets the value of the character at a certain index in the buffer. Returns + * true if the write was successful or false if there was an attempt to write + * outside of the buffer boundaries. + * + * Index positions are negative, where -1 is the last character appended to the + * buffer. Using positive integers for the index will result in undefined + * behaviour. + */ +int jsparser_buffer_set(jsparser_ctx *js, int pos, char value); + +/* Copies a slice of the buffer to the string pointed to by buffer. start and + * end are the indexes of the sliced region. If the start argument extends + * beyond the beginning of the buffer, the slice will only contain characters + * starting from the beginning of the buffer. + */ +void jsparser_buffer_slice(jsparser_ctx *js, char *buffer, int start, int end); + +/* Copy the last javascript identifier or keyword found in the buffer to the + * string pointed to by identifier. + */ +int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier); + + +#define jsparser_parse_chr(a,b) jsparser_parse(a, &(b), 1); +#ifdef __cplusplus +#define jsparser_parse_str(a,b) jsparser_parse(a, b, \ + static_cast<int>(strlen(b))); +#else +#define jsparser_parse_str(a,b) jsparser_parse(a, b, (int)strlen(b)); +#endif + +#endif /* STREAMHTMLPARSER_JSPARSER_H */ diff --git a/streamhtmlparser/statemachine.h b/streamhtmlparser/statemachine.h new file mode 100644 index 0000000..a05ffe7 --- /dev/null +++ b/streamhtmlparser/statemachine.h @@ -0,0 +1,224 @@ +/* Copyright (c) 2007, Google Inc. + * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * --- + * Author: Filipe Almeida + */ + +#ifndef STREAMHTMLPARSER_STATEMACHINE_H +#define STREAMHTMLPARSER_STATEMACHINE_H + +/* TODO(falmeida): I'm not sure about these limits, but since right now we only + * have 24 states it should be fine */ + +enum { + STATEMACHINE_ERROR = 127 +}; + +#define STATEMACHINE_RECORD_BUFFER_SIZE 256 + +#define STATEMACHINE_MAX_STR_ERROR 80 + +struct statemachine_ctx_s; + +typedef void(*state_event_function)(struct statemachine_ctx_s *, int, char, + int); + +typedef struct statemachine_definition_s { + int num_states; + const int* const* transition_table; + + /* Array containing the name of the states as a C string. + * This field is optional and if not in use it should be set to NULL. + */ + const char* const* state_names; + state_event_function *in_state_events; + state_event_function *enter_state_events; + state_event_function *exit_state_events; +} statemachine_definition; + +typedef struct statemachine_ctx_s { + int current_state; + int next_state; + statemachine_definition *definition; + char current_char; + + /* Current line number. */ + int line_number; + + /* Current column number. */ + int column_number; + char record_buffer[STATEMACHINE_RECORD_BUFFER_SIZE]; + size_t record_pos; + + /* True if we are recording the stream to record_buffer. */ + int recording; + + /* In case there was an error (we are in state STATEMACHINE_ERROR), it will + * contain a human readable description of the error. + */ + char error_msg[STATEMACHINE_MAX_STR_ERROR]; + + /* Storage space for the layer above. */ + void *user; +} statemachine_ctx; + +/* Populates the statemachine definition. + * + * Receives a transition table and an optional array of state names. It uses + * this data to populate the state machine definition. + * + * The transition table structure is a list of lists of ints (int **). 
The + * outer list indexes the source state and the inner list contains the + * destination state for each of the possible input characters: + * + * const int* const* transitions[source][input] == destination. + * + * The optional argument state_names points to a list of strings containing + * human readable state names. These strings are used when reporting error + * messages. + */ +void statemachine_definition_populate(statemachine_definition *def, + const int* const* transition_table, + const char* const* state_names); + +void statemachine_in_state(statemachine_definition *def, int st, + state_event_function func); +void statemachine_enter_state(statemachine_definition *def, int st, + state_event_function func); +void statemachine_exit_state(statemachine_definition *def, int st, + state_event_function func); + +statemachine_definition *statemachine_definition_new(int states); +void statemachine_definition_delete(statemachine_definition *def); + +int statemachine_get_state(statemachine_ctx *ctx); +void statemachine_set_state(statemachine_ctx *ctx, int state); + +void statemachine_start_record(statemachine_ctx *ctx); +const char *statemachine_stop_record(statemachine_ctx *ctx); +const char *statemachine_record_buffer(statemachine_ctx *ctx); + +/* Returns the number of characters currently stored in the record buffer. + */ +static inline size_t statemachine_record_length(statemachine_ctx *ctx) { + return ctx->record_pos + 1; +} + +/* Return the current line number. */ +static inline int statemachine_get_line_number(statemachine_ctx *ctx) { + return ctx->line_number; +} + +/* Set the current line number. */ +static inline void statemachine_set_line_number(statemachine_ctx *ctx, + int line) { + ctx->line_number = line; +} + +/* Return the current column number. */ +static inline int statemachine_get_column_number(statemachine_ctx *ctx) { + return ctx->column_number; +} + +/* Set the current column number.
*/ +static inline void statemachine_set_column_number(statemachine_ctx *ctx, + int column) { + ctx->column_number = column; +} + + +/* Retrieve a human readable error message in case an error occurred. + * + * NULL is returned if the parser didn't encounter an error. + */ +static inline const char *statemachine_get_error_msg(statemachine_ctx *ctx) { + if (ctx->next_state == STATEMACHINE_ERROR) { + return ctx->error_msg; + } else { + return NULL; + } +} + +/* Reset the statemachine. + * + * The state is set to the initialization values. This includes setting the + * state to the default state (0), stopping recording and setting the line + * number to 1. + */ +void statemachine_reset(statemachine_ctx *ctx); + +/* Initializes a new statemachine. Receives a statemachine definition object + * that should have been initialized with statemachine_definition_new() and a + * user reference to be used by the caller. + * + * Returns NULL if initialization fails. + * + * Initialization failure is fatal, and if this function fails it may not + * deallocate all previously allocated memory. + */ +statemachine_ctx *statemachine_new(statemachine_definition *def, + void *user); + +/* Returns a pointer to a context which is a duplicate of the statemachine src. + * The statemachine definition and the user pointer have to be provided since + * these references are not owned by the statemachine itself. + */ +statemachine_ctx *statemachine_duplicate(statemachine_ctx *ctx, + statemachine_definition *def, + void *user); + +/* Copies the context of the statemachine pointed to by src to the statemachine + * provided by dst. + * The statemachine definition and the user pointer have to be provided since + * these references are not owned by the statemachine itself.
+ */ +void statemachine_copy(statemachine_ctx *dst, + statemachine_ctx *src, + statemachine_definition *def, + void *user); + +int statemachine_parse(statemachine_ctx *ctx, const char *str, int size); + +void statemachine_delete(statemachine_ctx *ctx); + + +/***** + * The following functions are only exported for testing purposes and should + * be treated as private. */ + + +/* Encode the character as an escaped C string. + * + * Encode the character chr into the string output. Writes at most len + * characters to the output string but makes sure output is NULL terminated. + */ +void statemachine_encode_char(char chr, char *output, size_t len); + +#endif /* STREAMHTMLPARSER_STATEMACHINE_H */ -- cgit v1.2.3-54-g00ecf