1 files changed, 397 insertions, 0 deletions
diff --git a/streamhtmlparser/htmlparser.h b/streamhtmlparser/htmlparser.h
new file mode 100644
index 0000000..58db4a5
--- /dev/null
+++ b/streamhtmlparser/htmlparser.h
@@ -0,0 +1,397 @@
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Filipe Almeida
+ */
+
+#ifndef STREAMHTMLPARSER_HTMLPARSER_H
+#define STREAMHTMLPARSER_HTMLPARSER_H
+
+#include "statemachine.h"
+#include "jsparser.h"
+
+/* entity filter */
+
+/* String sizes used in htmlparser and entityfilter structures including the
+ * NULL terminator.
+ */
+#define HTMLPARSER_MAX_STRING STATEMACHINE_RECORD_BUFFER_SIZE
+#define HTMLPARSER_MAX_ENTITY_SIZE 10
+
+
+enum htmlparser_state_external_enum {
+    HTMLPARSER_STATE_TEXT,
+    HTMLPARSER_STATE_TAG,
+    HTMLPARSER_STATE_ATTR,
+    HTMLPARSER_STATE_VALUE,
+    HTMLPARSER_STATE_COMMENT,
+    HTMLPARSER_STATE_JS_FILE,
+    HTMLPARSER_STATE_CSS_FILE,
+    HTMLPARSER_STATE_ERROR
+};
+
+enum htmlparser_mode {
+    HTMLPARSER_MODE_HTML,
+    HTMLPARSER_MODE_JS,
+    HTMLPARSER_MODE_CSS,
+    HTMLPARSER_MODE_HTML_IN_TAG
+};
+
+enum htmlparser_attr_type {
+    HTMLPARSER_ATTR_NONE,
+    HTMLPARSER_ATTR_REGULAR,
+    HTMLPARSER_ATTR_URI,
+    HTMLPARSER_ATTR_JS,
+    HTMLPARSER_ATTR_STYLE
+};
+
+
+/* TODO(falmeida): Maybe move some of these declaration to the .c and only keep
+ * a forward declaration in here, since these structures are private.
+ */
+
+/* entityfilter context structure.
+ *
+ * The entity filter collection of routines provide a way to decode html
+ * entities from an html document in a streaming way.
+ *
+ * The html_process() function receives a character at a time from the input
+ * stream and returns 0 or more characters which should be appended to the
+ * resulting decoded document.
+ *
+ * Currently this collection of functions are only exported for testing purposes
+ * and shouldn't be called from outside of htmlparser.c.
+ *
+ * Since we really only use these functions with the very specific purpose of
+ * decoding html entities for javascript attributes, only a small subset of
+ * entities are supported: &lt;, &gt;, &quote;, &amp;, &apos, and the numeric
+ * character references for both decimal and hexadecimal.
+ */
+typedef struct entityfilter_ctx_s {
+
+    /* Current position into the buffer. */
+    int buffer_pos;
+
+    /* True if currently processing an html entity. */
+    int in_entity;
+
+    /* Temporary character buffer that is used while processing html entities.
+     */
+    char buffer[HTMLPARSER_MAX_ENTITY_SIZE];
+
+    /* String buffer returned to the application after we decoded an html
+     * entity.
+     */
+    char output[HTMLPARSER_MAX_ENTITY_SIZE];
+} entityfilter_ctx;
+
+/* Resets the entityfilter to its initial state so it can be reused.
+ */
+void entityfilter_reset(entityfilter_ctx *ctx);
+
+/* Initializes a new entity filter object.
+ */
+entityfilter_ctx *entityfilter_new(void);
+
+/* Deallocates an entity filter object.
+ */
+void entityfilter_delete(entityfilter_ctx *ctx);
+
+/* Copies the context of the entityfilter pointed to by src to the entityfilter
+ * dst.
+ */
+void entityfilter_copy(entityfilter_ctx *dst, entityfilter_ctx *src);
+
+/* Processes a character from the input stream and decodes any html entities
+ * in the accumulated buffer.
+ *
+ * Returns a reference to a string that points to an internal buffer. This
+ * buffer will be changed after every call to entityfilter_process(). As
+ * such this string should be duplicated before subsequent calls to
+ * entityfilter_process().
+ */
+const char *entityfilter_process(entityfilter_ctx *ctx, char c);
+
+
+/* html parser */
+
+/* Stores the context of the html parser.
+ * If this structure is changed, htmlparser_new(), htmlparser_copy() and
+ * htmlparser_reset() should be updated accordingly.
+ */
+typedef struct htmlparser_ctx_s {
+
+  /* Holds a reference to the statemachine context. */
+  statemachine_ctx *statemachine;
+
+  /* Holds a reference to the statemachine definition in use. Right now this is
+   * only used so we can deallocate it at the end.
+   *
+   * It should be readonly and contain the same values across jsparser
+   * instances.
+   */
+  /* TODO(falmeida): Change statemachine_def to const. */
+  statemachine_definition *statemachine_def;
+
+  /* Holds a reference to the javascript parser. */
+  jsparser_ctx *jsparser;
+
+  /* Holds a reference to the entity filter. Used for decoding html entities
+   * inside javascript attributes. */
+  entityfilter_ctx *entityfilter;
+
+  /* Offset into the current attribute value where 0 is the first character in
+   * the value. */
+  int value_index;
+
+  /* True if currently processing javascript. */
+  int in_js;
+
+  /* Current tag name. */
+  char tag[HTMLPARSER_MAX_STRING];
+
+  /* Current attribute name. */
+  char attr[HTMLPARSER_MAX_STRING];
+
+  /* Contents of the current value capped to HTMLPARSER_MAX_STRING. */
+  char value[HTMLPARSER_MAX_STRING];
+
+} htmlparser_ctx;
+
+/* Resets the parser to its initial state and to the default mode, which
+ * is MODE_HTML.
+ *
+ * All internal context like tag name, attribute name or the state of the
+ * statemachine are reset to its original values as if the object was just
+ * created.
+ */
+void htmlparser_reset(htmlparser_ctx *ctx);
+
+/* Resets the parser to its initial state and changes the parser mode.
+ * All internal context like tag name, attribute name or the state of the
+ * statemachine are reset to their original values as if the object was just
+ * created.
+ *
+ * Available modes:
+ *  HTMLPARSER_MODE_HTML - Parses html text
+ *  HTMLPARSER_MODE_JS - Parses javascript files
+ *  HTMLPARSER_MODE_CSS - Parses CSS files. No actual parsing is actually done
+ *                        but htmlparser_in_css() always returns true.
+ *  HTMLPARSER_MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To
+ *                                be used in a template expanded in the
+ *                                following context: <a $template>
+ *
+ */
+void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode);
+
+/* Initializes a new htmlparser instance.
+ *
+ * Returns a pointer to the new instance or NULL if the initialization fails.
+ * Initialization failure is fatal, and if this function fails it may not
+ * deallocate all previsouly allocated memory.
+ */
+htmlparser_ctx *htmlparser_new(void);
+
+/* Copies the context of the htmlparser pointed to by src to the htmlparser dst.
+ *
+ * Also copies over the instances of the state machine, the jsparser and the
+ * entity filter but not the statemachine definition since this one is read
+ * only.
+ */
+void htmlparser_copy(htmlparser_ctx *dst, const htmlparser_ctx *src);
+
+/* Receives an htmlparser context and returns the current html state.
+ *
+ * The return value will be one of the states of htmlparser_state_external_enum.
+ */
+int htmlparser_state(htmlparser_ctx *ctx);
+
+/* Parses the input html stream and returns the finishing state.
+ *
+ * Returns HTMLPARSER_ERROR if unable to parse the input. If htmlparser_parse()
+ * is called after an error situation was encountered the behaviour is
+ * unspecified. At this point, htmlparser_reset() or htmlparser_reset_mode()
+ * can be called to reset the state.
+ */
+int htmlparser_parse(htmlparser_ctx *ctx, const char *str, int size);
+
+/* Returns true if the parser is inside an attribute value and the value is
+ * surrounded by single or double quotes. */
+int htmlparser_is_attr_quoted(htmlparser_ctx *ctx);
+
+/* Returns true if the parser is currently in javascript. This can be a
+ * an attribute that takes javascript, a javascript block or the parser
+ * can just be in MODE_JS. */
+int htmlparser_in_js(htmlparser_ctx *ctx);
+
+/* Returns the current tag or NULL if not available or we haven't seen the
+ * entire tag yet.
+ *
+ * There is no stack implemented because we currently don't have a need for
+ * it, which means tag names are tracked only one level deep.
+ *
+ * This is better understood by looking at the following example:
+ *
+ * <b [tag=b]>
+ *   [tag=b]
+ *   <i>
+ *    [tag=i]
+ *   </i>
+ *  [tag=NULL]
+ * </b>
+ *
+ * The tag is correctly filled inside the tag itself and before any new inner
+ * tag is closed, at which point the tag will be null.
+ *
+ * For our current purposes this is not a problem, but we may implement a tag
+ * tracking stack in the future for completeness.
+ *
+ */
+const char *htmlparser_tag(htmlparser_ctx *ctx);
+
+/* Returns the current attribute name if after an attribute name or in an
+ * attribute value. Returns NULL otherwise. */
+const char *htmlparser_attr(htmlparser_ctx *ctx);
+
+/* Returns the contents of the current attribute value.
+ *
+ * Returns NULL if not inside an attribute value.
+ */
+const char *htmlparser_value(htmlparser_ctx *ctx);
+
+/* Returns true if the parser is currently inside a CSS construct.
+ *
+ * Currently this can be either a STYLE tag, a STYLE attribute or the fact that
+ * the parser was reset in HTMLPARSER_MODE_CSS using
+ * htmlparser_reset_mode().
+ */
+int htmlparser_in_css(htmlparser_ctx *ctx);
+
+/* Returns the current state of the javascript state machine.
+ *
+ * Currently only present for testing purposes.
+ */
+int htmlparser_js_state(htmlparser_ctx *ctx);
+
+/* Returns non-zero if currently inside a javascript string literal and zero
+ * otherwise.
+ */
+int htmlparser_is_js_quoted(htmlparser_ctx *ctx);
+
+/* Returns non-zero if currently inside an attribute value and zero otherwise.
+ */
+int htmlparser_value_index(htmlparser_ctx *ctx);
+
+/* Returns true if this is the first character of a url inside an attribute.
+ *
+ * This function can be used by an html sanitizer or auto escaping system as a
+ * hint that it should validate the url for a whitelist of protocol handlers and
+ * for well-formedness, or that it should just escape a component of it.
+ *
+ * For attributes that expect a URL, this will return true if we are at the
+ * first character of the URL, false otherwise.
+ * For most attributes, this is the same as checking that we are at the first
+ * character of the attribute value but it also works correctly for the
+ * "content" attribute of the "meta" tag where the URL follows some earlier
+ * content.
+ * e.g: <meta http-equiv="refresh" * content="5; URL=http://bla.">
+ *
+ * For any other attributes, the result will always be false.
+ */
+int htmlparser_is_url_start(htmlparser_ctx *ctx);
+
+/* Returns the current attribute type.
+ *
+ * The attribute type can be one of:
+ *   HTMLPARSER_ATTR_NONE - not inside an attribute.
+ *   HTMLPARSER_ATTR_REGULAR - Inside a normal attribute.
+ *   HTMLPARSER_ATTR_URI - Inside an attribute that accepts a uri.
+ *   HTMLPARSER_ATTR_JS - Inside a javascript attribute.
+ *   HTMLPARSER_ATTR_STYLE - Inside a css style attribute.
+ */
+int htmlparser_attr_type(htmlparser_ctx *ctx);
+
+/* Return the current line number. */
+int htmlparser_get_line_number(htmlparser_ctx *ctx);
+
+/* Set the current line number. */
+void htmlparser_set_line_number(htmlparser_ctx *ctx, int line);
+
+/* Return the current column number. */
+int htmlparser_get_column_number(htmlparser_ctx *ctx);
+
+/* Set the current column number. */
+void htmlparser_set_column_number(htmlparser_ctx *ctx, int column);
+
+/* Retrieve a human readable error message in case an error occurred.
+ *
+ * NULL is returned if the parser didn't encounter an error.
+ */
+const char *htmlparser_get_error_msg(htmlparser_ctx *ctx);
+
+/* Invoked by the caller when text is expanded by the caller.
+ *
+ * Should be invoked when a template directive that expands to content is
+ * executed but we don't provide this content to the parser itself. This changes
+ * the current state by following the default rule, ensuring we stay in sync
+ * with the template.
+ *
+ * Returns 1 if template directives are accepted for this state and 0 if they
+ * are not, which should result in an error condition.
+ *
+ * Right now the only case being handled are unquoted attribute values and it
+ * always returns 1. When insert_text() is called after the equals sign, we
+ * assume some text was consumed and we are now in the middle of the attribute
+ * value itself. Example:
+ *
+ * <a href=$HREF_VALUE alt=alternate_text>
+ *
+ * The template calls insert_text() when it encounters $HREF_VALUE. If it didn't
+ * the parser would only have seen the following html:
+ *
+ * <a href= alt=alternate_text>
+ *
+ * and would interpret alt=alternate_text as the value of the href attribute.
+ */
+int htmlparser_insert_text(htmlparser_ctx *ctx);
+
+/* Deallocates an htmlparser context object.
+ */
+void htmlparser_delete(htmlparser_ctx *ctx);
+
+#define htmlparser_parse_chr(a,b) htmlparser_parse(a, &(b), 1);
+#ifdef __cplusplus
+#define htmlparser_parse_str(a,b) htmlparser_parse(a, b, \
+                                                   static_cast<int>(strlen(b)));
+#else
+#define htmlparser_parse_str(a,b) htmlparser_parse(a, b, (int)strlen(b));
+#endif
+
+#endif /* STREAMHTMLPARSER_HTMLPARSER_H */