From 2db452d3d57df4b91375c0176e3a9527dbbc537c Mon Sep 17 00:00:00 2001
From: Andreas Baumann <abaumann@yahoo.com>
Date: Sat, 14 Jul 2012 22:30:39 +0200
Subject: first working crawler

---
 streamhtmlparser/GNUmakefile              |   2 +-
 streamhtmlparser/htmlparser.h             | 397 ++++++++++++++++++++++++++++++
 streamhtmlparser/htmlparser_cpp.h         | 322 ++++++++++++++++++++++++
 streamhtmlparser/include/htmlparser.h     | 397 ------------------------------
 streamhtmlparser/include/htmlparser_cpp.h | 322 ------------------------
 streamhtmlparser/include/jsparser.h       | 163 ------------
 streamhtmlparser/include/statemachine.h   | 224 -----------------
 streamhtmlparser/jsparser.h               | 163 ++++++++++++
 streamhtmlparser/statemachine.h           | 224 +++++++++++++++++
 9 files changed, 1107 insertions(+), 1107 deletions(-)
 create mode 100644 streamhtmlparser/htmlparser.h
 create mode 100644 streamhtmlparser/htmlparser_cpp.h
 delete mode 100644 streamhtmlparser/include/htmlparser.h
 delete mode 100644 streamhtmlparser/include/htmlparser_cpp.h
 delete mode 100644 streamhtmlparser/include/jsparser.h
 delete mode 100644 streamhtmlparser/include/statemachine.h
 create mode 100644 streamhtmlparser/jsparser.h
 create mode 100644 streamhtmlparser/statemachine.h

(limited to 'streamhtmlparser')

diff --git a/streamhtmlparser/GNUmakefile b/streamhtmlparser/GNUmakefile
index 100d8b2..ea5380d 100644
--- a/streamhtmlparser/GNUmakefile
+++ b/streamhtmlparser/GNUmakefile
@@ -9,7 +9,7 @@ INCLUDE_CFLAGS =
 INCLUDE_LDFLAGS = \
 
 INCLUDE_DIRS = \
-	-Iinclude
+	-I.
 
 INCLUDE_LIBS =
 
diff --git a/streamhtmlparser/htmlparser.h b/streamhtmlparser/htmlparser.h
new file mode 100644
index 0000000..58db4a5
--- /dev/null
+++ b/streamhtmlparser/htmlparser.h
@@ -0,0 +1,397 @@
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Filipe Almeida
+ */
+
+#ifndef STREAMHTMLPARSER_HTMLPARSER_H
+#define STREAMHTMLPARSER_HTMLPARSER_H
+
+#include "statemachine.h"
+#include "jsparser.h"
+
+/* entity filter */
+
+/* String sizes used in htmlparser and entityfilter structures including the
+ * NULL terminator.
+ */
+#define HTMLPARSER_MAX_STRING STATEMACHINE_RECORD_BUFFER_SIZE
+#define HTMLPARSER_MAX_ENTITY_SIZE 10
+
+
+enum htmlparser_state_external_enum {
+    HTMLPARSER_STATE_TEXT,
+    HTMLPARSER_STATE_TAG,
+    HTMLPARSER_STATE_ATTR,
+    HTMLPARSER_STATE_VALUE,
+    HTMLPARSER_STATE_COMMENT,
+    HTMLPARSER_STATE_JS_FILE,
+    HTMLPARSER_STATE_CSS_FILE,
+    HTMLPARSER_STATE_ERROR
+};
+
+enum htmlparser_mode {
+    HTMLPARSER_MODE_HTML,
+    HTMLPARSER_MODE_JS,
+    HTMLPARSER_MODE_CSS,
+    HTMLPARSER_MODE_HTML_IN_TAG
+};
+
+enum htmlparser_attr_type {
+    HTMLPARSER_ATTR_NONE,
+    HTMLPARSER_ATTR_REGULAR,
+    HTMLPARSER_ATTR_URI,
+    HTMLPARSER_ATTR_JS,
+    HTMLPARSER_ATTR_STYLE
+};
+
+
+/* TODO(falmeida): Maybe move some of these declarations to the .c and only keep
+ * a forward declaration in here, since these structures are private.
+ */
+
+/* entityfilter context structure.
+ *
+ * The entity filter collection of routines provides a way to decode html
+ * entities from an html document in a streaming way.
+ *
+ * The entityfilter_process() function receives a character at a time from the
+ * input stream and returns 0 or more characters which should be appended to
+ * the resulting decoded document.
+ *
+ * Currently this collection of functions is only exported for testing purposes
+ * and shouldn't be called from outside of htmlparser.c.
+ *
+ * Since we really only use these functions with the very specific purpose of
+ * decoding html entities for javascript attributes, only a small subset of
+ * entities is supported: &lt;, &gt;, &quot;, &amp;, &apos;, and the numeric
+ * character references for both decimal and hexadecimal.
+ */
+typedef struct entityfilter_ctx_s {
+
+    /* Current position into the buffer. */
+    int buffer_pos;
+
+    /* True if currently processing an html entity. */
+    int in_entity;
+
+    /* Temporary character buffer that is used while processing html entities.
+     */
+    char buffer[HTMLPARSER_MAX_ENTITY_SIZE];
+
+    /* String buffer returned to the application after we decoded an html
+     * entity.
+     */
+    char output[HTMLPARSER_MAX_ENTITY_SIZE];
+} entityfilter_ctx;
+
+/* Resets the entityfilter to its initial state so it can be reused.
+ */
+void entityfilter_reset(entityfilter_ctx *ctx);
+
+/* Initializes a new entity filter object.
+ */
+entityfilter_ctx *entityfilter_new(void);
+
+/* Deallocates an entity filter object.
+ */
+void entityfilter_delete(entityfilter_ctx *ctx);
+
+/* Copies the context of the entityfilter pointed to by src to the entityfilter
+ * dst.
+ */
+void entityfilter_copy(entityfilter_ctx *dst, entityfilter_ctx *src);
+
+/* Processes a character from the input stream and decodes any html entities
+ * in the accumulated buffer.
+ *
+ * Returns a reference to a string that points to an internal buffer. This
+ * buffer will be changed after every call to entityfilter_process(). As
+ * such this string should be duplicated before subsequent calls to
+ * entityfilter_process().
+ */
+const char *entityfilter_process(entityfilter_ctx *ctx, char c);
+
+
+/* html parser */
+
+/* Stores the context of the html parser.
+ * If this structure is changed, htmlparser_new(), htmlparser_copy() and
+ * htmlparser_reset() should be updated accordingly.
+ */
+typedef struct htmlparser_ctx_s {
+
+  /* Holds a reference to the statemachine context. */
+  statemachine_ctx *statemachine;
+
+  /* Holds a reference to the statemachine definition in use. Right now this is
+   * only used so we can deallocate it at the end.
+   *
+   * It should be readonly and contain the same values across jsparser
+   * instances.
+   */
+  /* TODO(falmeida): Change statemachine_def to const. */
+  statemachine_definition *statemachine_def;
+
+  /* Holds a reference to the javascript parser. */
+  jsparser_ctx *jsparser;
+
+  /* Holds a reference to the entity filter. Used for decoding html entities
+   * inside javascript attributes. */
+  entityfilter_ctx *entityfilter;
+
+  /* Offset into the current attribute value where 0 is the first character in
+   * the value. */
+  int value_index;
+
+  /* True if currently processing javascript. */
+  int in_js;
+
+  /* Current tag name. */
+  char tag[HTMLPARSER_MAX_STRING];
+
+  /* Current attribute name. */
+  char attr[HTMLPARSER_MAX_STRING];
+
+  /* Contents of the current value capped to HTMLPARSER_MAX_STRING. */
+  char value[HTMLPARSER_MAX_STRING];
+
+} htmlparser_ctx;
+
+/* Resets the parser to its initial state and to the default mode, which
+ * is MODE_HTML.
+ *
+ * All internal context like tag name, attribute name or the state of the
+ * statemachine are reset to their original values as if the object was just
+ * created.
+ */
+void htmlparser_reset(htmlparser_ctx *ctx);
+
+/* Resets the parser to its initial state and changes the parser mode.
+ * All internal context like tag name, attribute name or the state of the
+ * statemachine are reset to their original values as if the object was just
+ * created.
+ *
+ * Available modes (values of enum htmlparser_mode):
+ *  HTMLPARSER_MODE_HTML - Parses html text
+ *  HTMLPARSER_MODE_JS - Parses javascript files
+ *  HTMLPARSER_MODE_CSS - Parses CSS files. No parsing is actually done
+ *                        but htmlparser_in_css() always returns true.
+ *  HTMLPARSER_MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To
+ *                                be used in a template expanded in the
+ *                                following context: <a $template>
+ *
+ */
+void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode);
+
+/* Initializes a new htmlparser instance.
+ *
+ * Returns a pointer to the new instance or NULL if the initialization fails.
+ * Initialization failure is fatal, and if this function fails it may not
+ * deallocate all previously allocated memory.
+ */
+htmlparser_ctx *htmlparser_new(void);
+
+/* Copies the context of the htmlparser pointed to by src to the htmlparser dst.
+ *
+ * Also copies over the instances of the state machine, the jsparser and the
+ * entity filter but not the statemachine definition since this one is read
+ * only.
+ */
+void htmlparser_copy(htmlparser_ctx *dst, const htmlparser_ctx *src);
+
+/* Receives an htmlparser context and returns the current html state.
+ *
+ * The return value will be one of the states of htmlparser_state_external_enum.
+ */
+int htmlparser_state(htmlparser_ctx *ctx);
+
+/* Parses the input html stream and returns the finishing state.
+ *
+ * Returns HTMLPARSER_STATE_ERROR if unable to parse the input. If
+ * htmlparser_parse() is called after an error situation was encountered the
+ * behaviour is unspecified. At this point, htmlparser_reset() or
+ * htmlparser_reset_mode() can be called to reset the state.
+ */
+int htmlparser_parse(htmlparser_ctx *ctx, const char *str, int size);
+
+/* Returns true if the parser is inside an attribute value and the value is
+ * surrounded by single or double quotes. */
+int htmlparser_is_attr_quoted(htmlparser_ctx *ctx);
+
+/* Returns true if the parser is currently in javascript. This can be
+ * an attribute that takes javascript, a javascript block or the parser
+ * can just be in MODE_JS. */
+int htmlparser_in_js(htmlparser_ctx *ctx);
+
+/* Returns the current tag or NULL if not available or we haven't seen the
+ * entire tag yet.
+ *
+ * There is no stack implemented because we currently don't have a need for
+ * it, which means tag names are tracked only one level deep.
+ *
+ * This is better understood by looking at the following example:
+ *
+ * <b [tag=b]>
+ *   [tag=b]
+ *   <i>
+ *    [tag=i]
+ *   </i>
+ *  [tag=NULL]
+ * </b>
+ *
+ * The tag is correctly filled inside the tag itself and before any new inner
+ * tag is closed, at which point the tag will be null.
+ *
+ * For our current purposes this is not a problem, but we may implement a tag
+ * tracking stack in the future for completeness.
+ *
+ */
+const char *htmlparser_tag(htmlparser_ctx *ctx);
+
+/* Returns the current attribute name if after an attribute name or in an
+ * attribute value. Returns NULL otherwise. */
+const char *htmlparser_attr(htmlparser_ctx *ctx);
+
+/* Returns the contents of the current attribute value.
+ *
+ * Returns NULL if not inside an attribute value.
+ */
+const char *htmlparser_value(htmlparser_ctx *ctx);
+
+/* Returns true if the parser is currently inside a CSS construct.
+ *
+ * Currently this can be either a STYLE tag, a STYLE attribute or the fact that
+ * the parser was reset in HTMLPARSER_MODE_CSS using
+ * htmlparser_reset_mode().
+ */
+int htmlparser_in_css(htmlparser_ctx *ctx);
+
+/* Returns the current state of the javascript state machine.
+ *
+ * Currently only present for testing purposes.
+ */
+int htmlparser_js_state(htmlparser_ctx *ctx);
+
+/* Returns non-zero if currently inside a javascript string literal and zero
+ * otherwise.
+ */
+int htmlparser_is_js_quoted(htmlparser_ctx *ctx);
+
+/* Returns the index within the current attribute value (0 = first character).
+ * NOTE(review): ValueIndex() documents -1 outside a value -- confirm in .c. */
+int htmlparser_value_index(htmlparser_ctx *ctx);
+
+/* Returns true if this is the first character of a url inside an attribute.
+ *
+ * This function can be used by an html sanitizer or auto escaping system as a
+ * hint that it should validate the url for a whitelist of protocol handlers and
+ * for well-formedness, or that it should just escape a component of it.
+ *
+ * For attributes that expect a URL, this will return true if we are at the
+ * first character of the URL, false otherwise.
+ * For most attributes, this is the same as checking that we are at the first
+ * character of the attribute value but it also works correctly for the
+ * "content" attribute of the "meta" tag where the URL follows some earlier
+ * content.
+ * e.g: <meta http-equiv="refresh" content="5; URL=http://bla.">
+ *
+ * For any other attributes, the result will always be false.
+ */
+int htmlparser_is_url_start(htmlparser_ctx *ctx);
+
+/* Returns the current attribute type.
+ *
+ * The attribute type can be one of:
+ *   HTMLPARSER_ATTR_NONE - not inside an attribute.
+ *   HTMLPARSER_ATTR_REGULAR - Inside a normal attribute.
+ *   HTMLPARSER_ATTR_URI - Inside an attribute that accepts a uri.
+ *   HTMLPARSER_ATTR_JS - Inside a javascript attribute.
+ *   HTMLPARSER_ATTR_STYLE - Inside a css style attribute.
+ */
+int htmlparser_attr_type(htmlparser_ctx *ctx);
+
+/* Return the current line number. */
+int htmlparser_get_line_number(htmlparser_ctx *ctx);
+
+/* Set the current line number. */
+void htmlparser_set_line_number(htmlparser_ctx *ctx, int line);
+
+/* Return the current column number. */
+int htmlparser_get_column_number(htmlparser_ctx *ctx);
+
+/* Set the current column number. */
+void htmlparser_set_column_number(htmlparser_ctx *ctx, int column);
+
+/* Retrieve a human readable error message in case an error occurred.
+ *
+ * NULL is returned if the parser didn't encounter an error.
+ */
+const char *htmlparser_get_error_msg(htmlparser_ctx *ctx);
+
+/* Invoked by the caller when it expands text that the parser does not see.
+ *
+ * Should be invoked when a template directive that expands to content is
+ * executed but we don't provide this content to the parser itself. This changes
+ * the current state by following the default rule, ensuring we stay in sync
+ * with the template.
+ *
+ * Returns 1 if template directives are accepted for this state and 0 if they
+ * are not, which should result in an error condition.
+ *
+ * Right now the only case being handled is unquoted attribute values and it
+ * always returns 1. When insert_text() is called after the equals sign, we
+ * assume some text was consumed and we are now in the middle of the attribute
+ * value itself. Example:
+ *
+ * <a href=$HREF_VALUE alt=alternate_text>
+ *
+ * The template calls insert_text() when it encounters $HREF_VALUE. If it didn't
+ * the parser would only have seen the following html:
+ *
+ * <a href= alt=alternate_text>
+ *
+ * and would interpret alt=alternate_text as the value of the href attribute.
+ */
+int htmlparser_insert_text(htmlparser_ctx *ctx);
+
+/* Deallocates an htmlparser context object.
+ */
+void htmlparser_delete(htmlparser_ctx *ctx);
+
+/* Parse one char / a NUL-terminated string (needs <string.h>); no trailing ';'. */
+#define htmlparser_parse_chr(a,b) htmlparser_parse((a), &(b), 1)
+#ifdef __cplusplus
+#define htmlparser_parse_str(a,b) htmlparser_parse((a), (b), static_cast<int>(strlen(b)))
+#else
+#define htmlparser_parse_str(a,b) htmlparser_parse((a), (b), (int)strlen(b))
+#endif
+
+#endif /* STREAMHTMLPARSER_HTMLPARSER_H */
diff --git a/streamhtmlparser/htmlparser_cpp.h b/streamhtmlparser/htmlparser_cpp.h
new file mode 100644
index 0000000..3802233
--- /dev/null
+++ b/streamhtmlparser/htmlparser_cpp.h
@@ -0,0 +1,322 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+// Author: Filipe Almeida
+//
+// c++ bindings for htmlparser.
+
+#ifndef STREAMHTMLPARSER_HTMLPARSER_CPP_H__
+#define STREAMHTMLPARSER_HTMLPARSER_CPP_H__
+
+#include <string>
+#include <assert.h>
+extern "C" {
+  #include "htmlparser.h"
+  #include "jsparser.h"
+}
+
+namespace streamhtmlparser {
+
+class JavascriptParser {  // C++ names for the JSPARSER_STATE_* constants.
+  public:
+    enum State {
+      STATE_TEXT = JSPARSER_STATE_TEXT,
+      STATE_Q = JSPARSER_STATE_Q,
+      STATE_DQ = JSPARSER_STATE_DQ,
+      STATE_REGEXP = JSPARSER_STATE_REGEXP,
+      STATE_COMMENT = JSPARSER_STATE_COMMENT
+    };
+};
+
+class HtmlParser {
+  public:
+    /* C++ wrapper that owns one htmlparser_ctx and forwards to the C API. */
+    /* html states */
+    enum State {
+      STATE_TEXT = HTMLPARSER_STATE_TEXT,
+      STATE_TAG = HTMLPARSER_STATE_TAG,
+      STATE_ATTR = HTMLPARSER_STATE_ATTR,
+      STATE_VALUE = HTMLPARSER_STATE_VALUE,
+      STATE_COMMENT = HTMLPARSER_STATE_COMMENT,
+      STATE_JS_FILE = HTMLPARSER_STATE_JS_FILE,
+      STATE_CSS_FILE = HTMLPARSER_STATE_CSS_FILE,
+      STATE_ERROR = HTMLPARSER_STATE_ERROR
+    };
+
+    /* attribute types */
+    enum AttributeType {
+      ATTR_NONE = HTMLPARSER_ATTR_NONE,
+      ATTR_REGULAR = HTMLPARSER_ATTR_REGULAR,
+      ATTR_URI = HTMLPARSER_ATTR_URI,
+      ATTR_JS = HTMLPARSER_ATTR_JS,
+      ATTR_STYLE = HTMLPARSER_ATTR_STYLE
+    };
+
+    /* Parser modes */
+    enum Mode {
+      MODE_HTML = HTMLPARSER_MODE_HTML,
+      MODE_JS = HTMLPARSER_MODE_JS,
+      MODE_CSS = HTMLPARSER_MODE_CSS,
+      MODE_HTML_IN_TAG = HTMLPARSER_MODE_HTML_IN_TAG
+    };
+
+    HtmlParser() {
+      parser_ = htmlparser_new();
+      assert(parser_ != NULL);
+    };
+
+    /* Parses the input html stream and returns the finishing state.
+     *
+     * Returns HtmlParser::STATE_ERROR if unable to parse the input. If
+     * htmlparser_parse() is called after an error situation was encountered
+     * the behaviour is unspecified. At this point, Reset() or ResetMode()
+     * can be called to reset the state so it can be used to parse a new file.
+     */
+    int Parse(const char *str, int len) {
+      return htmlparser_parse(parser_, str, len);
+    };
+
+    int Parse(const std::string &str) {
+      return Parse(str.c_str(), static_cast<int>(str.length()));
+    };
+
+    /* Returns the current state the parser is in */
+    int state() const {
+      return htmlparser_state(parser_);
+    };
+
+    /* Returns the current tag or NULL if not available.
+     *
+     * There is no stack implemented because we currently don't have a need for
+     * it, which means tag names are tracked only one level deep.
+     *
+     * This is better understood by looking at the following example:
+     *
+     * <b [tag=b]>
+     *   [tag=b]
+     *   <i>
+     *    [tag=i]
+     *   </i>
+     *  [tag=NULL]
+     * </b>
+     *
+     * The tag is correctly filled inside the tag itself and before any new
+     * inner tag is closed, at which point the tag will be set to NULL.
+     *
+     * For our current purposes this is not a problem, but we may implement a
+     * tag tracking stack in the future for completeness.
+     */
+    const char *tag() const {
+      return htmlparser_tag(parser_);
+    }
+
+    /* Returns the current attribute name if inside an attribute name or an
+     * attribute value. Returns NULL otherwise. */
+    const char *attribute() const {
+      return htmlparser_attr(parser_);
+    }
+
+    /* Returns the contents of the current attribute value. */
+    const char *value() const {
+      return htmlparser_value(parser_);
+    }
+
+    /* Returns true if inside javascript. This can be a javascript block, a
+     * javascript attribute value or the parser may just be in javascript mode
+     * (HtmlParser::MODE_JS) */
+    bool InJavascript() const {
+      return static_cast<bool>(htmlparser_in_js(parser_));
+    }
+
+    /* Returns true if the parser is currently inside a CSS construct.
+     *
+     * Currently this can be either a STYLE tag, a STYLE attribute or the fact
+     * that the parser was reset using MODE_CSS using ResetMode().
+     */
+    bool InCss() const {
+      return static_cast<bool>(htmlparser_in_css(parser_));
+    }
+
+    /* Returns true if the current attribute is quoted */
+    bool IsAttributeQuoted() const {
+      return static_cast<bool>(htmlparser_is_attr_quoted(parser_));
+    }
+
+    /* Returns true if the parser is inside a js string literal.
+     */
+    bool IsJavascriptQuoted() const {
+      return static_cast<bool>(htmlparser_is_js_quoted(parser_));
+    }
+
+    /* Returns the index within the current value or -1 if the parser is not
+     * inside an attribute value */
+    int ValueIndex() const {
+      return htmlparser_value_index(parser_);
+    }
+
+    /* Returns true if this is the first character of a url inside an attribute.
+     *
+     * This function can be used by an html sanitizer or auto escaping system as
+     * a hint that it should validate the url for a whitelist of protocol
+     * handlers and for well-formedness, or that it should just escape a
+     * component of it.
+     *
+     * For attributes that expect a url this will return true if we are at the
+     * first character of the attribute, but for the special case of a meta
+     * redirect tag some analysis is made in order to verify if we are at the
+     * start of a url or not.
+     *
+     * For any other attributes, the result will always be false.
+     *
+     */
+    bool IsUrlStart() const {
+      return htmlparser_is_url_start(parser_);
+    }
+
+    /* Returns the current attribute type.
+     *
+     * The attribute type can be one of:
+     *   ATTR_NONE - not inside an attribute
+     *   ATTR_REGULAR - Inside a normal attribute
+     *   ATTR_URI - Inside an attribute that accepts a uri
+     *   ATTR_JS - Inside a javascript attribute
+     *   ATTR_STYLE - Inside a css style attribute
+     * */
+    int AttributeType() const {
+      return htmlparser_attr_type(parser_);
+    }
+
+    /* Return the current line number. */
+    int line_number() const {
+      return htmlparser_get_line_number(parser_);
+    }
+
+    /* Set the current line number. */
+    void set_line_number(int line) {
+      return htmlparser_set_line_number(parser_, line);
+    }
+
+    /* Return the current column number. */
+    int column_number() const {
+      return htmlparser_get_column_number(parser_);
+    }
+
+    /* Set the current column number. */
+    void set_column_number(int column) {
+      return htmlparser_set_column_number(parser_, column);
+    }
+
+    /* Retrieve a human readable error message in case an error occurred.
+     *
+     * NULL is returned if the parser didn't encounter an error.
+     */
+    const char *GetErrorMessage() {
+      return htmlparser_get_error_msg(parser_);
+    }
+
+    /* Returns the current state the javascript parser is in.
+     *
+     * Should only be used for testing.
+     */
+    int javascript_state() const {
+      return htmlparser_js_state(parser_);
+    };
+
+    /* Resets the parser to its initial state and changes the parser mode.
+     *
+     * Internal state (tag name, attribute name, state of statemachine) is
+     * reset as though the object was just created.
+     *
+     * Available modes:
+     *  MODE_HTML - Parses html text
+     *  MODE_JS - Parses javascript files
+     *  MODE_CSS - Parses CSS files. No parsing is actually done
+     *             but InCss() always returns true.
+     *  MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To
+     *                     be used in a template expanded in the
+     *                     following context: <a $template>
+     */
+    void ResetMode(enum Mode mode) {
+      return htmlparser_reset_mode(parser_, mode);
+    }
+
+    /* Resets the parser to its initial state and to the default mode, which is
+     * MODE_HTML.
+     *
+     * All internal context like tag name, attribute name or the state of the
+     * statemachine are reset to their original values as if the object was just
+     * created.
+     */
+    void Reset() {
+      return htmlparser_reset(parser_);
+    }
+
+    /* Invoked when text is inserted by the caller.
+     *
+     * Should be called before a template directive that expands to content is
+     * found. This changes the current state by following the default rule,
+     * ensuring we stay in sync with the template.
+     *
+     * Returns true if template directives are accepted for this state and
+     * false if they are not, which should result in an error condition.
+     *
+     * Right now the only case being handled is unquoted attribute values and
+     * it always returns true. In the future we can handle more cases and
+     * restrict the states where we allow template directives by returning false
+     * for those.
+     */
+    bool InsertText() {
+      return static_cast<bool>(htmlparser_insert_text(parser_));
+    }
+
+    /* Copies the context of the HtmlParser object referenced in source to the
+     * current object.
+     */
+    void CopyFrom(const HtmlParser *source) {
+      assert(this != source);
+      assert(source != NULL);
+      htmlparser_copy(parser_, source->parser_);
+    }
+
+    ~HtmlParser() {
+      htmlparser_delete(parser_);
+    };
+
+
+  private:
+    htmlparser_ctx *parser_;
+    HtmlParser(const HtmlParser&);      // disallow copy
+    void operator=(const HtmlParser&);  // and assign
+
+};
+
+}
+
+#endif  // STREAMHTMLPARSER_HTMLPARSER_CPP_H__
diff --git a/streamhtmlparser/include/htmlparser.h b/streamhtmlparser/include/htmlparser.h
deleted file mode 100644
index 58db4a5..0000000
--- a/streamhtmlparser/include/htmlparser.h
+++ /dev/null
@@ -1,397 +0,0 @@
-/* Copyright (c) 2007, Google Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- *     * Neither the name of Google Inc. nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * ---
- * Author: Filipe Almeida
- */
-
-#ifndef STREAMHTMLPARSER_HTMLPARSER_H
-#define STREAMHTMLPARSER_HTMLPARSER_H
-
-#include "statemachine.h"
-#include "jsparser.h"
-
-/* entity filter */
-
-/* String sizes used in htmlparser and entityfilter structures including the
- * NULL terminator.
- */
-#define HTMLPARSER_MAX_STRING STATEMACHINE_RECORD_BUFFER_SIZE
-#define HTMLPARSER_MAX_ENTITY_SIZE 10
-
-
-enum htmlparser_state_external_enum {
-    HTMLPARSER_STATE_TEXT,
-    HTMLPARSER_STATE_TAG,
-    HTMLPARSER_STATE_ATTR,
-    HTMLPARSER_STATE_VALUE,
-    HTMLPARSER_STATE_COMMENT,
-    HTMLPARSER_STATE_JS_FILE,
-    HTMLPARSER_STATE_CSS_FILE,
-    HTMLPARSER_STATE_ERROR
-};
-
-enum htmlparser_mode {
-    HTMLPARSER_MODE_HTML,
-    HTMLPARSER_MODE_JS,
-    HTMLPARSER_MODE_CSS,
-    HTMLPARSER_MODE_HTML_IN_TAG
-};
-
-enum htmlparser_attr_type {
-    HTMLPARSER_ATTR_NONE,
-    HTMLPARSER_ATTR_REGULAR,
-    HTMLPARSER_ATTR_URI,
-    HTMLPARSER_ATTR_JS,
-    HTMLPARSER_ATTR_STYLE
-};
-
-
-/* TODO(falmeida): Maybe move some of these declaration to the .c and only keep
- * a forward declaration in here, since these structures are private.
- */
-
-/* entityfilter context structure.
- *
- * The entity filter collection of routines provide a way to decode html
- * entities from an html document in a streaming way.
- *
- * The html_process() function receives a character at a time from the input
- * stream and returns 0 or more characters which should be appended to the
- * resulting decoded document.
- *
- * Currently this collection of functions are only exported for testing purposes
- * and shouldn't be called from outside of htmlparser.c.
- *
- * Since we really only use these functions with the very specific purpose of
- * decoding html entities for javascript attributes, only a small subset of
- * entities are supported: &lt;, &gt;, &quote;, &amp;, &apos, and the numeric
- * character references for both decimal and hexadecimal.
- */
-typedef struct entityfilter_ctx_s {
-
-    /* Current position into the buffer. */
-    int buffer_pos;
-
-    /* True if currently processing an html entity. */
-    int in_entity;
-
-    /* Temporary character buffer that is used while processing html entities.
-     */
-    char buffer[HTMLPARSER_MAX_ENTITY_SIZE];
-
-    /* String buffer returned to the application after we decoded an html
-     * entity.
-     */
-    char output[HTMLPARSER_MAX_ENTITY_SIZE];
-} entityfilter_ctx;
-
-/* Resets the entityfilter to its initial state so it can be reused.
- */
-void entityfilter_reset(entityfilter_ctx *ctx);
-
-/* Initializes a new entity filter object.
- */
-entityfilter_ctx *entityfilter_new(void);
-
-/* Deallocates an entity filter object.
- */
-void entityfilter_delete(entityfilter_ctx *ctx);
-
-/* Copies the context of the entityfilter pointed to by src to the entityfilter
- * dst.
- */
-void entityfilter_copy(entityfilter_ctx *dst, entityfilter_ctx *src);
-
-/* Processes a character from the input stream and decodes any html entities
- * in the accumulated buffer.
- *
- * Returns a reference to a string that points to an internal buffer. This
- * buffer will be changed after every call to entityfilter_process(). As
- * such this string should be duplicated before subsequent calls to
- * entityfilter_process().
- */
-const char *entityfilter_process(entityfilter_ctx *ctx, char c);
-
-
-/* html parser */
-
-/* Stores the context of the html parser.
- * If this structure is changed, htmlparser_new(), htmlparser_copy() and
- * htmlparser_reset() should be updated accordingly.
- */
-typedef struct htmlparser_ctx_s {
-
-  /* Holds a reference to the statemachine context. */
-  statemachine_ctx *statemachine;
-
-  /* Holds a reference to the statemachine definition in use. Right now this is
-   * only used so we can deallocate it at the end.
-   *
-   * It should be readonly and contain the same values across jsparser
-   * instances.
-   */
-  /* TODO(falmeida): Change statemachine_def to const. */
-  statemachine_definition *statemachine_def;
-
-  /* Holds a reference to the javascript parser. */
-  jsparser_ctx *jsparser;
-
-  /* Holds a reference to the entity filter. Used for decoding html entities
-   * inside javascript attributes. */
-  entityfilter_ctx *entityfilter;
-
-  /* Offset into the current attribute value where 0 is the first character in
-   * the value. */
-  int value_index;
-
-  /* True if currently processing javascript. */
-  int in_js;
-
-  /* Current tag name. */
-  char tag[HTMLPARSER_MAX_STRING];
-
-  /* Current attribute name. */
-  char attr[HTMLPARSER_MAX_STRING];
-
-  /* Contents of the current value capped to HTMLPARSER_MAX_STRING. */
-  char value[HTMLPARSER_MAX_STRING];
-
-} htmlparser_ctx;
-
-/* Resets the parser to its initial state and to the default mode, which
- * is MODE_HTML.
- *
- * All internal context like tag name, attribute name or the state of the
- * statemachine are reset to its original values as if the object was just
- * created.
- */
-void htmlparser_reset(htmlparser_ctx *ctx);
-
-/* Resets the parser to its initial state and changes the parser mode.
- * All internal context like tag name, attribute name or the state of the
- * statemachine are reset to their original values as if the object was just
- * created.
- *
- * Available modes:
- *  HTMLPARSER_MODE_HTML - Parses html text
- *  HTMLPARSER_MODE_JS - Parses javascript files
- *  HTMLPARSER_MODE_CSS - Parses CSS files. No actual parsing is actually done
- *                        but htmlparser_in_css() always returns true.
- *  HTMLPARSER_MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To
- *                                be used in a template expanded in the
- *                                following context: <a $template>
- *
- */
-void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode);
-
-/* Initializes a new htmlparser instance.
- *
- * Returns a pointer to the new instance or NULL if the initialization fails.
- * Initialization failure is fatal, and if this function fails it may not
- * deallocate all previsouly allocated memory.
- */
-htmlparser_ctx *htmlparser_new(void);
-
-/* Copies the context of the htmlparser pointed to by src to the htmlparser dst.
- *
- * Also copies over the instances of the state machine, the jsparser and the
- * entity filter but not the statemachine definition since this one is read
- * only.
- */
-void htmlparser_copy(htmlparser_ctx *dst, const htmlparser_ctx *src);
-
-/* Receives an htmlparser context and returns the current html state.
- *
- * The return value will be one of the states of htmlparser_state_external_enum.
- */
-int htmlparser_state(htmlparser_ctx *ctx);
-
-/* Parses the input html stream and returns the finishing state.
- *
- * Returns HTMLPARSER_ERROR if unable to parse the input. If htmlparser_parse()
- * is called after an error situation was encountered the behaviour is
- * unspecified. At this point, htmlparser_reset() or htmlparser_reset_mode()
- * can be called to reset the state.
- */
-int htmlparser_parse(htmlparser_ctx *ctx, const char *str, int size);
-
-/* Returns true if the parser is inside an attribute value and the value is
- * surrounded by single or double quotes. */
-int htmlparser_is_attr_quoted(htmlparser_ctx *ctx);
-
-/* Returns true if the parser is currently in javascript. This can be a
- * an attribute that takes javascript, a javascript block or the parser
- * can just be in MODE_JS. */
-int htmlparser_in_js(htmlparser_ctx *ctx);
-
-/* Returns the current tag or NULL if not available or we haven't seen the
- * entire tag yet.
- *
- * There is no stack implemented because we currently don't have a need for
- * it, which means tag names are tracked only one level deep.
- *
- * This is better understood by looking at the following example:
- *
- * <b [tag=b]>
- *   [tag=b]
- *   <i>
- *    [tag=i]
- *   </i>
- *  [tag=NULL]
- * </b>
- *
- * The tag is correctly filled inside the tag itself and before any new inner
- * tag is closed, at which point the tag will be null.
- *
- * For our current purposes this is not a problem, but we may implement a tag
- * tracking stack in the future for completeness.
- *
- */
-const char *htmlparser_tag(htmlparser_ctx *ctx);
-
-/* Returns the current attribute name if after an attribute name or in an
- * attribute value. Returns NULL otherwise. */
-const char *htmlparser_attr(htmlparser_ctx *ctx);
-
-/* Returns the contents of the current attribute value.
- *
- * Returns NULL if not inside an attribute value.
- */
-const char *htmlparser_value(htmlparser_ctx *ctx);
-
-/* Returns true if the parser is currently inside a CSS construct.
- *
- * Currently this can be either a STYLE tag, a STYLE attribute or the fact that
- * the parser was reset in HTMLPARSER_MODE_CSS using
- * htmlparser_reset_mode().
- */
-int htmlparser_in_css(htmlparser_ctx *ctx);
-
-/* Returns the current state of the javascript state machine.
- *
- * Currently only present for testing purposes.
- */
-int htmlparser_js_state(htmlparser_ctx *ctx);
-
-/* Returns non-zero if currently inside a javascript string literal and zero
- * otherwise.
- */
-int htmlparser_is_js_quoted(htmlparser_ctx *ctx);
-
-/* Returns non-zero if currently inside an attribute value and zero otherwise.
- */
-int htmlparser_value_index(htmlparser_ctx *ctx);
-
-/* Returns true if this is the first character of a url inside an attribute.
- *
- * This function can be used by an html sanitizer or auto escaping system as a
- * hint that it should validate the url for a whitelist of protocol handlers and
- * for well-formedness, or that it should just escape a component of it.
- *
- * For attributes that expect a URL, this will return true if we are at the
- * first character of the URL, false otherwise.
- * For most attributes, this is the same as checking that we are at the first
- * character of the attribute value but it also works correctly for the
- * "content" attribute of the "meta" tag where the URL follows some earlier
- * content.
- * e.g: <meta http-equiv="refresh" * content="5; URL=http://bla.">
- *
- * For any other attributes, the result will always be false.
- */
-int htmlparser_is_url_start(htmlparser_ctx *ctx);
-
-/* Returns the current attribute type.
- *
- * The attribute type can be one of:
- *   HTMLPARSER_ATTR_NONE - not inside an attribute.
- *   HTMLPARSER_ATTR_REGULAR - Inside a normal attribute.
- *   HTMLPARSER_ATTR_URI - Inside an attribute that accepts a uri.
- *   HTMLPARSER_ATTR_JS - Inside a javascript attribute.
- *   HTMLPARSER_ATTR_STYLE - Inside a css style attribute.
- */
-int htmlparser_attr_type(htmlparser_ctx *ctx);
-
-/* Return the current line number. */
-int htmlparser_get_line_number(htmlparser_ctx *ctx);
-
-/* Set the current line number. */
-void htmlparser_set_line_number(htmlparser_ctx *ctx, int line);
-
-/* Return the current column number. */
-int htmlparser_get_column_number(htmlparser_ctx *ctx);
-
-/* Set the current column number. */
-void htmlparser_set_column_number(htmlparser_ctx *ctx, int column);
-
-/* Retrieve a human readable error message in case an error occurred.
- *
- * NULL is returned if the parser didn't encounter an error.
- */
-const char *htmlparser_get_error_msg(htmlparser_ctx *ctx);
-
-/* Invoked by the caller when text is expanded by the caller.
- *
- * Should be invoked when a template directive that expands to content is
- * executed but we don't provide this content to the parser itself. This changes
- * the current state by following the default rule, ensuring we stay in sync
- * with the template.
- *
- * Returns 1 if template directives are accepted for this state and 0 if they
- * are not, which should result in an error condition.
- *
- * Right now the only case being handled are unquoted attribute values and it
- * always returns 1. When insert_text() is called after the equals sign, we
- * assume some text was consumed and we are now in the middle of the attribute
- * value itself. Example:
- *
- * <a href=$HREF_VALUE alt=alternate_text>
- *
- * The template calls insert_text() when it encounters $HREF_VALUE. If it didn't
- * the parser would only have seen the following html:
- *
- * <a href= alt=alternate_text>
- *
- * and would interpret alt=alternate_text as the value of the href attribute.
- */
-int htmlparser_insert_text(htmlparser_ctx *ctx);
-
-/* Deallocates an htmlparser context object.
- */
-void htmlparser_delete(htmlparser_ctx *ctx);
-
-#define htmlparser_parse_chr(a,b) htmlparser_parse(a, &(b), 1);
-#ifdef __cplusplus
-#define htmlparser_parse_str(a,b) htmlparser_parse(a, b, \
-                                                   static_cast<int>(strlen(b)));
-#else
-#define htmlparser_parse_str(a,b) htmlparser_parse(a, b, (int)strlen(b));
-#endif
-
-#endif /* STREAMHTMLPARSER_HTMLPARSER_H */
diff --git a/streamhtmlparser/include/htmlparser_cpp.h b/streamhtmlparser/include/htmlparser_cpp.h
deleted file mode 100644
index 3802233..0000000
--- a/streamhtmlparser/include/htmlparser_cpp.h
+++ /dev/null
@@ -1,322 +0,0 @@
-// Copyright (c) 2008, Google Inc.
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// ---
-// Author: Filipe Almeida
-//
-// c++ bindings for htmlparser.
-
-#ifndef STREAMHTMLPARSER_HTMLPARSER_CPP_H__
-#define STREAMHTMLPARSER_HTMLPARSER_CPP_H__
-
-#include <string>
-#include <assert.h>
-extern "C" {
-  #include "htmlparser.h"
-  #include "jsparser.h"
-}
-
-namespace streamhtmlparser {
-
-class JavascriptParser {
-  public:
-    enum State {
-      STATE_TEXT = JSPARSER_STATE_TEXT,
-      STATE_Q = JSPARSER_STATE_Q,
-      STATE_DQ = JSPARSER_STATE_DQ,
-      STATE_REGEXP = JSPARSER_STATE_REGEXP,
-      STATE_COMMENT = JSPARSER_STATE_COMMENT
-    };
-};
-
-class HtmlParser {
-  public:
-
-    /* html states */
-    enum State {
-      STATE_TEXT = HTMLPARSER_STATE_TEXT,
-      STATE_TAG = HTMLPARSER_STATE_TAG,
-      STATE_ATTR = HTMLPARSER_STATE_ATTR,
-      STATE_VALUE = HTMLPARSER_STATE_VALUE,
-      STATE_COMMENT = HTMLPARSER_STATE_COMMENT,
-      STATE_JS_FILE = HTMLPARSER_STATE_JS_FILE,
-      STATE_CSS_FILE = HTMLPARSER_STATE_CSS_FILE,
-      STATE_ERROR = HTMLPARSER_STATE_ERROR
-    };
-
-    /* attribute types */
-    enum AttributeType {
-      ATTR_NONE = HTMLPARSER_ATTR_NONE,
-      ATTR_REGULAR = HTMLPARSER_ATTR_REGULAR,
-      ATTR_URI = HTMLPARSER_ATTR_URI,
-      ATTR_JS = HTMLPARSER_ATTR_JS,
-      ATTR_STYLE = HTMLPARSER_ATTR_STYLE
-    };
-
-    /* Parser modes */
-    enum Mode {
-      MODE_HTML = HTMLPARSER_MODE_HTML,
-      MODE_JS = HTMLPARSER_MODE_JS,
-      MODE_CSS = HTMLPARSER_MODE_CSS,
-      MODE_HTML_IN_TAG = HTMLPARSER_MODE_HTML_IN_TAG
-    };
-
-    HtmlParser() {
-      parser_ = htmlparser_new();
-      assert(parser_ != NULL);
-    };
-
-    /* Parses the input html stream and returns the finishing state.
-     *
-     * Returns HtmlParser::STATE_ERROR if unable to parse the input. If
-     * htmlparser_parse() is called after an error situation was encountered
-     * the behaviour is unspecified. At this point, Reset() or ResetMode()
-     * can be called to reset the state so it can be used to parse a new file.
-     */
-    int Parse(const char *str, int len) {
-      return htmlparser_parse(parser_, str, len);
-    };
-
-    int Parse(const std::string &str) {
-      return Parse(str.c_str(), static_cast<int>(str.length()));
-    };
-
-    /* Returns the current state the parser is in */
-    int state() const {
-      return htmlparser_state(parser_);
-    };
-
-    /* Returns the current tag or NULL if not available.
-     *
-     * There is no stack implemented because we currently don't have a need for
-     * it, which means tag names are tracked only one level deep.
-     *
-     * This is better understood by looking at the following example:
-     *
-     * <b [tag=b]>
-     *   [tag=b]
-     *   <i>
-     *    [tag=i]
-     *   </i>
-     *  [tag=NULL]
-     * </b>
-     *
-     * The tag is correctly filled inside the tag itself and before any new
-     * inner tag is closed, at which point the tag will be set to NULL.
-     *
-     * For our current purposes this is not a problem, but we may implement a
-     * tag tracking stack in the future for completeness.
-     */
-    const char *tag() const {
-      return htmlparser_tag(parser_);
-    }
-
-    /* Returns the current attribute name if inside an attribute name or an
-     * attribute value. Returns NULL otherwise. */
-    const char *attribute() const {
-      return htmlparser_attr(parser_);
-    }
-
-    /* Returns the contents of the current attribute value. */
-    const char *value() const {
-      return htmlparser_value(parser_);
-    }
-
-    /* Returns true if inside javascript. This can be a javascript block, a
-     * javascript attribute value or the parser may just be in javascript mode
-     * (HtmlParser::MODE_JS) */
-    bool InJavascript() const {
-      return static_cast<bool>(htmlparser_in_js(parser_));
-    }
-
-    /* Returns true if the parser is currently inside a CSS construct.
-     *
-     * Currently this can be either a STYLE tag, a STYLE attribute or the fact
-     * that the parser was reset using MODE_CSS using ResetMode().
-     */
-    bool InCss() const {
-      return static_cast<bool>(htmlparser_in_css(parser_));
-    }
-
-    /* Returns true if the current attribute is quoted */
-    bool IsAttributeQuoted() const {
-      return static_cast<bool>(htmlparser_is_attr_quoted(parser_));
-    }
-
-    /* Returns true if the parser is inside a js string literal.
-     */
-    bool IsJavascriptQuoted() const {
-      return static_cast<bool>(htmlparser_is_js_quoted(parser_));
-    }
-
-    /* Returns the index within the current value or -1 if the parser is not
-     * inside an attribute value */
-    int ValueIndex() const {
-      return htmlparser_value_index(parser_);
-    }
-
-    /* Returns true if this is the first character of a url inside an attribute.
-     *
-     * This function can be used by an html sanitizer or auto escaping system as
-     * a hint that it should validate the url for a whitelist of protocol
-     * handlers and for well-formedness, or that it should just escape a
-     * component of it.
-     *
-     * For attributes that expect a url this will return true if we are at the
-     * first character of the attribute, but for the special case of a meta
-     * redirect tag some analysis is made in order to verify if we are at the
-     * start of a url or not.
-     *
-     * For any other attributes, the result will always be false.
-     *
-     */
-    bool IsUrlStart() const {
-      return htmlparser_is_url_start(parser_);
-    }
-
-    /* Returns the current attribute type.
-     *
-     * The attribute type can be one of:
-     *   ATTR_NONE - not inside an attribute
-     *   ATTR_REGULAR - Inside a normal attribute
-     *   ATTR_URI - Inside an attribute that accepts a uri
-     *   ATTR_JS - Inside a javascript attribute
-     *   ATTR_STYLE - Inside a css style attribute
-     * */
-    int AttributeType() const {
-      return htmlparser_attr_type(parser_);
-    }
-
-    /* Return the current line number. */
-    int line_number() const {
-      return htmlparser_get_line_number(parser_);
-    }
-
-    /* Set the current line number. */
-    void set_line_number(int line) {
-      return htmlparser_set_line_number(parser_, line);
-    }
-
-    /* Return the current column number. */
-    int column_number() const {
-      return htmlparser_get_column_number(parser_);
-    }
-
-    /* Set the current line number. */
-    void set_column_number(int column) {
-      return htmlparser_set_column_number(parser_, column);
-    }
-
-    /* Retrieve a human readable error message in case an error occurred.
-     *
-     * NULL is returned if the parser didn't encounter an error.
-     */
-    const char *GetErrorMessage() {
-      return htmlparser_get_error_msg(parser_);
-    }
-
-    /* Returns the current state the javascript parser is in.
-     *
-     * Should only be used for testing.
-     */
-    int javascript_state() const {
-      return htmlparser_js_state(parser_);
-    };
-
-    /* Resets the parser to it's initial state and changes the parser mode.
-     *
-     * Internal state (tag name, attribute name, state of statemachine) is
-     * reset as * though the object was just created.
-     *
-     * Available modes:
-     *  MODE_HTML - Parses html text
-     *  MODE_JS - Parses javascript files
-     *  MODE_CSS - Parses CSS files. No actual parsing is actually done
-     *             but InCss() always returns true.
-     *  MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To
-     *                     be used in a template expanded in the
-     *                     following context: <a $template>
-     */
-    void ResetMode(enum Mode mode) {
-      return htmlparser_reset_mode(parser_, mode);
-    }
-
-    /* Resets the parser to it's initial state and to the default mode, which is
-     * MODE_HTML.
-     *
-     * All internal context like tag name, attribute name or the state of the
-     * statemachine are reset to it's original values as if the object was just
-     * created.
-     */
-    void Reset() {
-      return htmlparser_reset(parser_);
-    }
-
-    /* Invoked when text is inserted by the caller.
-     *
-     * Should be called before a template directive that expands to content is
-     * found. This changes the current state by following the default rule,
-     * ensuring we stay in sync with template.
-     *
-     * Returns true if template directives are accepted for this state and
-     * false if they are not, which should result in an error condition.
-     *
-     * Right now the only case being handled are unquoted attribute values and
-     * it always returns true. In the future we can handle more cases and
-     * restrict the states were we allow template directives by returning false
-     * for those.
-     */
-    bool InsertText() {
-      return static_cast<bool>(htmlparser_insert_text(parser_));
-    }
-
-    /* Copies the context of the HtmlParser object referenced in source to the
-     * current object.
-     */
-    void CopyFrom(const HtmlParser *source) {
-      assert(this != source);
-      assert(source != NULL);
-      htmlparser_copy(parser_, source->parser_);
-    }
-
-    ~HtmlParser() {
-      htmlparser_delete(parser_);
-    };
-
-
-  private:
-    htmlparser_ctx *parser_;
-    HtmlParser(const HtmlParser&);      // disallow copy
-    void operator=(const HtmlParser&);  // and assign
-
-};
-
-}
-
-#endif  // STREAMHTMLPARSER_HTMLPARSER_CPP_H__
diff --git a/streamhtmlparser/include/jsparser.h b/streamhtmlparser/include/jsparser.h
deleted file mode 100644
index 4077aa4..0000000
--- a/streamhtmlparser/include/jsparser.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2007, Google Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- *     * Neither the name of Google Inc. nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * ---
- * Author: Filipe Almeida
- */
-
-#ifndef STREAMHTMLPARSER_JSPARSER_H
-#define STREAMHTMLPARSER_JSPARSER_H
-
-#include "statemachine.h"
-
-/* Size of the ring buffer used to lookup the last token in the javascript
- * stream. The size is pretty much arbitrary at this point but must be bigger
- * than the biggest token we want to lookup plus 3: Two delimiters plus an empty
- * ring buffer slot. */
-#define JSPARSER_RING_BUFFER_SIZE 18
-
-enum js_state_external_enum {
-    JSPARSER_STATE_TEXT,
-    JSPARSER_STATE_Q,
-    JSPARSER_STATE_DQ,
-    JSPARSER_STATE_REGEXP,
-    JSPARSER_STATE_COMMENT
-};
-
-/* Stores the context of the javascript parser.
- *
- * If this structure is changed, jsparser_new(), jsparser_copy() and
- * jsparser_reset() should be updated accordingly.
- */
-typedef struct jsparser_ctx_s {
-
-  /* Reference to the statemachine context. */
-  statemachine_ctx *statemachine;
-
-  /* Reference to the statemachine definition.
-   *
-   * It should be readonly and contain the same values across jsparser
-   * instances.
-   */
-  /* TODO(falmeida): Change statemachine_def to const. */
-  statemachine_definition *statemachine_def;
-
-  /* Index to the start of the buffer. */
-  int buffer_start;
-
-  /* Index the current writing position (end of the buffer plus one). */
-  int buffer_end;
-
-  /* Ring buffer used to lookup the last token. */
-  char buffer[JSPARSER_RING_BUFFER_SIZE];
-
-} jsparser_ctx;
-
-
-void jsparser_reset(jsparser_ctx *ctx);
-jsparser_ctx *jsparser_new(void);
-
-/* Returns a pointer to a context which is a duplicate of the jsparser src.
- */
-jsparser_ctx *jsparser_duplicate(jsparser_ctx *src);
-
-/* Copies the context of the jsparser pointed to by src to the jsparser dst.
- */
-void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src);
-int jsparser_state(jsparser_ctx *ctx);
-int jsparser_parse(jsparser_ctx *ctx, const char *str, int size);
-
-void jsparser_delete(jsparser_ctx *ctx);
-
-/**
- * Ring buffer functions.
- *
- * These functions are only exported for testing and should not be called from
- * outside of jsparser.c in production code.
- */
-
-/* Appends a character to the ring buffer.
- *
- * Sequences of whitespaces and newlines are folded into one character.
- */
-void jsparser_buffer_append_chr(jsparser_ctx *js, char chr);
-
-/* Appends a string to the ring buffer.
- *
- * Sequences of whitespaces and newlines are folded into one character.
- */
-void jsparser_buffer_append_str(jsparser_ctx *js, const char *str);
-
-/* Returns the last appended character and removes it from the buffer. If the
- * buffer is empty, then it returns ASCII 0 ('\0').
- */
-char jsparser_buffer_pop(jsparser_ctx *js);
-
-/* Returns the value of the character at a certain index in the buffer or an
- * ASCII 0 ('\0') character if the index is extends beyond the size of the
- * buffer, either because we don't have as many characters in the buffer, or
- * because the index points to a place bigger than the size of the buffer..
- *
- * Index positions must be negative, where -1 is the last character appended to
- * the buffer.
- */
-char jsparser_buffer_get(jsparser_ctx *js, int pos);
-
-/* Sets the value of the character at a certain index in the buffer. Returns
- * true if the write was successful or false if there was an attempt to write
- * outside of the buffer boundaries.
- *
- * Index positions are negative, were -1 is the last character appended to the
- * buffer. Using positive integers for the index will result in undefined
- * behaviour.
- */
-int jsparser_buffer_set(jsparser_ctx *js, int pos, char value);
-
-/* Copies a slice of the buffer to the string pointed to by output. start and
- * end are the indexes of the sliced region. If the start argument extends
- * beyond the beginning of the buffer, the slice will only contain characters
- * starting from beginning of the buffer.
- */
-void jsparser_buffer_slice(jsparser_ctx *js, char *buffer, int start, int end);
-
-/* Copy the last javascript identifier or keyword found in the buffer to the
- * string pointed by identifier.
- */
-int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier);
-
-
-#define jsparser_parse_chr(a,b) jsparser_parse(a, &(b), 1);
-#ifdef __cplusplus
-#define jsparser_parse_str(a,b) jsparser_parse(a, b, \
-                                               static_cast<int>(strlen(b)));
-#else
-#define jsparser_parse_str(a,b) jsparser_parse(a, b, (int)strlen(b));
-#endif
-
-#endif /* STREAMHTMLPARSER_JSPARSER_H */
diff --git a/streamhtmlparser/include/statemachine.h b/streamhtmlparser/include/statemachine.h
deleted file mode 100644
index a05ffe7..0000000
--- a/streamhtmlparser/include/statemachine.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (c) 2007, Google Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *     * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following disclaimer
- * in the documentation and/or other materials provided with the
- * distribution.
- *     * Neither the name of Google Inc. nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * ---
- * Author: Filipe Almeida
- */
-
-#ifndef STREAMHTMLPARSER_STATEMACHINE_H
-#define STREAMHTMLPARSER_STATEMACHINE_H
-
-/* TODO(falmeida): I'm not sure about these limits, but since right now we only
- * have 24 states it should be fine */
-
-enum {
-    STATEMACHINE_ERROR = 127
-};
-
-#define STATEMACHINE_RECORD_BUFFER_SIZE 256
-
-#define STATEMACHINE_MAX_STR_ERROR 80
-
-struct statemachine_ctx_s;
-
-typedef void(*state_event_function)(struct statemachine_ctx_s *, int, char,
-                                    int);
-
-typedef struct statemachine_definition_s {
-    int num_states;
-    const int* const* transition_table;
-
-    /* Array containing the name of the states as a C string.
-     * This field is optional and if not in use it should be set to NULL.
-     */
-    const char* const* state_names;
-    state_event_function *in_state_events;
-    state_event_function *enter_state_events;
-    state_event_function *exit_state_events;
-} statemachine_definition;
-
-typedef struct statemachine_ctx_s {
-    int current_state;
-    int next_state;
-    statemachine_definition *definition;
-    char current_char;
-
-    /* Current line number. */
-    int line_number;
-
-    /* Current column number. */
-    int column_number;
-    char record_buffer[STATEMACHINE_RECORD_BUFFER_SIZE];
-    size_t record_pos;
-
-    /* True if we are recording the stream to record_buffer. */
-    int recording;
-
-    /* In case there was an error (we are in state STATEMACHINE_ERROR), it will
-     * contain a human readable description of the error.
-     */
-    char error_msg[STATEMACHINE_MAX_STR_ERROR];
-
-    /* Storage space for the layer above. */
-    void *user;
-} statemachine_ctx;
-
-/* Populates the statemachine definition.
- *
- * Receives a transition table and an optional array of state names. It uses
- * this data to populate the state machine definition.
- *
- * The transition table structure is a list of lists of ints (int **). The
- * outer list indexes the source state and the inner list contains the
- * destination state for each of the possible input characters:
- *
- * const int* const* transitions[source][input] == destination.
- *
- * The optional argument state_names points to a list of strings containing
- * human readable state names. These strings are used when reporting error
- * messages.
- */
-void statemachine_definition_populate(statemachine_definition *def,
-                                     const int* const* transition_table,
-                                     const char* const* state_names);
-
-void statemachine_in_state(statemachine_definition *def, int st,
-                           state_event_function func);
-void statemachine_enter_state(statemachine_definition *def, int st,
-                                     state_event_function func);
-void statemachine_exit_state(statemachine_definition *def, int st,
-                                    state_event_function func);
-
-statemachine_definition *statemachine_definition_new(int states);
-void statemachine_definition_delete(statemachine_definition *def);
-
-int statemachine_get_state(statemachine_ctx *ctx);
-void statemachine_set_state(statemachine_ctx *ctx, int state);
-
-void statemachine_start_record(statemachine_ctx *ctx);
-const char *statemachine_stop_record(statemachine_ctx *ctx);
-const char *statemachine_record_buffer(statemachine_ctx *ctx);
-
-/* Returns the the number of characters currently stored in the record buffer.
- */
-static inline size_t statemachine_record_length(statemachine_ctx *ctx) {
-  return ctx->record_pos + 1;
-}
-
-/* Return the current line number. */
-static inline int statemachine_get_line_number(statemachine_ctx *ctx) {
-  return ctx->line_number;
-}
-
-/* Set the current line number. */
-static inline void statemachine_set_line_number(statemachine_ctx *ctx,
-                                                int line) {
-  ctx->line_number = line;
-}
-
-/* Return the current column number. */
-static inline int statemachine_get_column_number(statemachine_ctx *ctx) {
-  return ctx->column_number;
-}
-
-/* Set the current column number. */
-static inline void statemachine_set_column_number(statemachine_ctx *ctx,
-                                                  int column) {
-  ctx->column_number = column;
-}
-
-
-/* Retrieve a human readable error message in case an error occurred.
- *
- * NULL is returned if the parser didn't encounter an error.
- */
-static inline const char *statemachine_get_error_msg(statemachine_ctx *ctx) {
-  if (ctx->next_state == STATEMACHINE_ERROR) {
-    return ctx->error_msg;
-  } else {
-    return NULL;
-  }
-}
-
-/* Reset the statemachine.
- *
- * The state is set to the initialization values. This includes setting the
- * state to the default state (0), stopping recording and setting the line
- * number to 1.
- */
-void statemachine_reset(statemachine_ctx *ctx);
-
-/* Initializes a new statemachine. Receives a statemachine definition object
- * that should have been initialized with statemachine_definition_new() and a
- * user reference to be used by the caller.
- *
- * Returns NULL if initialization fails.
- *
- * Initialization failure is fatal, and if this function fails it may not
- * deallocate all previsouly allocated memory.
- */
-statemachine_ctx *statemachine_new(statemachine_definition *def,
-                                   void *user);
-
-/* Returns a pointer to a context which is a duplicate of the statemachine src.
- * The statemachine definition and the user pointer have to be provided since
- * these references are not owned by the statemachine itself.
- */
-statemachine_ctx *statemachine_duplicate(statemachine_ctx *ctx,
-                                         statemachine_definition *def,
-                                         void *user);
-
-/* Copies the context of the statemachine pointed to by src to the statemachine
- * provided by dst.
- * The statemachine definition and the user pointer have to be provided since
- * these references are not owned by the statemachine itself.
- */
-void statemachine_copy(statemachine_ctx *dst,
-                       statemachine_ctx *src,
-                       statemachine_definition *def,
-                       void *user);
-
-int statemachine_parse(statemachine_ctx *ctx, const char *str, int size);
-
-void statemachine_delete(statemachine_ctx *ctx);
-
-
-/*****
- * The following functions are only exported for testing purposes and should
- * be treated as private. */
-
-
-/* Encode the character as an escaped C string.
- *
- * Encode the character chr into the string output. Writes at most len
- * characters to the output string but makes sure output is NULL terminated.
- */
-void statemachine_encode_char(char chr, char *output, size_t len);
-
-#endif /* STREAMHTMLPARSER_STATEMACHINE_H */
diff --git a/streamhtmlparser/jsparser.h b/streamhtmlparser/jsparser.h
new file mode 100644
index 0000000..4077aa4
--- /dev/null
+++ b/streamhtmlparser/jsparser.h
@@ -0,0 +1,163 @@
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Filipe Almeida
+ */
+
+#ifndef STREAMHTMLPARSER_JSPARSER_H
+#define STREAMHTMLPARSER_JSPARSER_H
+
+#include "statemachine.h"
+
+/* Size of the ring buffer used to lookup the last token in the javascript
+ * stream. The size is pretty much arbitrary at this point but must be bigger
+ * than the biggest token we want to lookup plus 3: Two delimiters plus an empty
+ * ring buffer slot. */
+#define JSPARSER_RING_BUFFER_SIZE 18
+
+enum js_state_external_enum {
+    JSPARSER_STATE_TEXT,
+    JSPARSER_STATE_Q,
+    JSPARSER_STATE_DQ,
+    JSPARSER_STATE_REGEXP,
+    JSPARSER_STATE_COMMENT
+};
+
+/* Stores the context of the javascript parser.
+ *
+ * If this structure is changed, jsparser_new(), jsparser_copy() and
+ * jsparser_reset() should be updated accordingly.
+ */
+typedef struct jsparser_ctx_s {
+
+  /* Reference to the statemachine context. */
+  statemachine_ctx *statemachine;
+
+  /* Reference to the statemachine definition.
+   *
+   * It should be readonly and contain the same values across jsparser
+   * instances.
+   */
+  /* TODO(falmeida): Change statemachine_def to const. */
+  statemachine_definition *statemachine_def;
+
+  /* Index to the start of the buffer. */
+  int buffer_start;
+
+  /* Index of the current writing position (end of the buffer plus one). */
+  int buffer_end;
+
+  /* Ring buffer used to lookup the last token. */
+  char buffer[JSPARSER_RING_BUFFER_SIZE];
+
+} jsparser_ctx;
+
+
+void jsparser_reset(jsparser_ctx *ctx);
+jsparser_ctx *jsparser_new(void);
+
+/* Returns a pointer to a context which is a duplicate of the jsparser src.
+ */
+jsparser_ctx *jsparser_duplicate(jsparser_ctx *src);
+
+/* Copies the context of the jsparser pointed to by src to the jsparser dst.
+ */
+void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src);
+int jsparser_state(jsparser_ctx *ctx);
+int jsparser_parse(jsparser_ctx *ctx, const char *str, int size);
+
+void jsparser_delete(jsparser_ctx *ctx);
+
+/**
+ * Ring buffer functions.
+ *
+ * These functions are only exported for testing and should not be called from
+ * outside of jsparser.c in production code.
+ */
+
+/* Appends a character to the ring buffer.
+ *
+ * Sequences of whitespaces and newlines are folded into one character.
+ */
+void jsparser_buffer_append_chr(jsparser_ctx *js, char chr);
+
+/* Appends a string to the ring buffer.
+ *
+ * Sequences of whitespaces and newlines are folded into one character.
+ */
+void jsparser_buffer_append_str(jsparser_ctx *js, const char *str);
+
+/* Returns the last appended character and removes it from the buffer. If the
+ * buffer is empty, then it returns ASCII 0 ('\0').
+ */
+char jsparser_buffer_pop(jsparser_ctx *js);
+
+/* Returns the value of the character at a certain index in the buffer or an
+ * ASCII 0 ('\0') character if the index extends beyond the size of the
+ * buffer, either because we don't have as many characters in the buffer, or
+ * because the index points to a place bigger than the size of the buffer.
+ *
+ * Index positions must be negative, where -1 is the last character appended to
+ * the buffer.
+ */
+char jsparser_buffer_get(jsparser_ctx *js, int pos);
+
+/* Sets the value of the character at a certain index in the buffer. Returns
+ * true if the write was successful or false if there was an attempt to write
+ * outside of the buffer boundaries.
+ *
+ * Index positions are negative, where -1 is the last character appended to
+ * the buffer. Using positive integers for the index will result in undefined
+ * behaviour.
+ */
+int jsparser_buffer_set(jsparser_ctx *js, int pos, char value);
+
+/* Copies a slice of the ring buffer to the string pointed to by the buffer
+ * argument. start and end are the indexes of the sliced region. If start
+ * extends beyond the beginning of the ring buffer, the slice will only
+ * contain characters starting from the beginning of the ring buffer.
+ */
+void jsparser_buffer_slice(jsparser_ctx *js, char *buffer, int start, int end);
+
+/* Copy the last javascript identifier or keyword found in the buffer to the
+ * string pointed by identifier.
+ */
+int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier);
+
+/* Wrappers around jsparser_parse(); each expands to a plain expression. */
+#define jsparser_parse_chr(a,b) jsparser_parse((a), &(b), 1)
+#ifdef __cplusplus
+#define jsparser_parse_str(a,b) jsparser_parse((a), (b), \
+                                               static_cast<int>(strlen(b)))
+#else
+#define jsparser_parse_str(a,b) jsparser_parse((a), (b), (int)strlen(b))
+#endif
+
+#endif /* STREAMHTMLPARSER_JSPARSER_H */
diff --git a/streamhtmlparser/statemachine.h b/streamhtmlparser/statemachine.h
new file mode 100644
index 0000000..a05ffe7
--- /dev/null
+++ b/streamhtmlparser/statemachine.h
@@ -0,0 +1,224 @@
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Filipe Almeida
+ */
+
+#ifndef STREAMHTMLPARSER_STATEMACHINE_H
+#define STREAMHTMLPARSER_STATEMACHINE_H
+
+/* TODO(falmeida): I'm not sure about these limits, but since right now we only
+ * have 24 states it should be fine */
+
+enum {
+    STATEMACHINE_ERROR = 127
+};
+
+#define STATEMACHINE_RECORD_BUFFER_SIZE 256
+
+#define STATEMACHINE_MAX_STR_ERROR 80
+
+struct statemachine_ctx_s;
+
+typedef void(*state_event_function)(struct statemachine_ctx_s *, int, char,
+                                    int);
+
+typedef struct statemachine_definition_s {
+    int num_states;
+    const int* const* transition_table;
+
+    /* Array containing the name of the states as a C string.
+     * This field is optional and if not in use it should be set to NULL.
+     */
+    const char* const* state_names;
+    state_event_function *in_state_events;
+    state_event_function *enter_state_events;
+    state_event_function *exit_state_events;
+} statemachine_definition;
+
+typedef struct statemachine_ctx_s {
+    int current_state;
+    int next_state;
+    statemachine_definition *definition;
+    char current_char;
+
+    /* Current line number. */
+    int line_number;
+
+    /* Current column number. */
+    int column_number;
+    char record_buffer[STATEMACHINE_RECORD_BUFFER_SIZE];
+    size_t record_pos;
+
+    /* True if we are recording the stream to record_buffer. */
+    int recording;
+
+    /* In case there was an error (we are in state STATEMACHINE_ERROR), it will
+     * contain a human readable description of the error.
+     */
+    char error_msg[STATEMACHINE_MAX_STR_ERROR];
+
+    /* Storage space for the layer above. */
+    void *user;
+} statemachine_ctx;
+
+/* Populates the statemachine definition.
+ *
+ * Receives a transition table and an optional array of state names. It uses
+ * this data to populate the state machine definition.
+ *
+ * The transition table structure is a list of lists of ints (int **). The
+ * outer list indexes the source state and the inner list contains the
+ * destination state for each of the possible input characters:
+ *
+ * const int* const* transitions[source][input] == destination.
+ *
+ * The optional argument state_names points to a list of strings containing
+ * human readable state names. These strings are used when reporting error
+ * messages.
+ */
+void statemachine_definition_populate(statemachine_definition *def,
+                                     const int* const* transition_table,
+                                     const char* const* state_names);
+
+void statemachine_in_state(statemachine_definition *def, int st,
+                           state_event_function func);
+void statemachine_enter_state(statemachine_definition *def, int st,
+                                     state_event_function func);
+void statemachine_exit_state(statemachine_definition *def, int st,
+                                    state_event_function func);
+
+statemachine_definition *statemachine_definition_new(int states);
+void statemachine_definition_delete(statemachine_definition *def);
+
+int statemachine_get_state(statemachine_ctx *ctx);
+void statemachine_set_state(statemachine_ctx *ctx, int state);
+
+void statemachine_start_record(statemachine_ctx *ctx);
+const char *statemachine_stop_record(statemachine_ctx *ctx);
+const char *statemachine_record_buffer(statemachine_ctx *ctx);
+
+/* Returns the number of characters currently stored in the record buffer,
+ * computed as record_pos + 1. NOTE(review): confirm the +1 is intended. */
+static inline size_t statemachine_record_length(statemachine_ctx *ctx) {
+  return ctx->record_pos + 1;
+}
+
+/* Return the current line number. */
+static inline int statemachine_get_line_number(statemachine_ctx *ctx) {
+  return ctx->line_number;
+}
+
+/* Set the current line number. */
+static inline void statemachine_set_line_number(statemachine_ctx *ctx,
+                                                int line) {
+  ctx->line_number = line;
+}
+
+/* Return the current column number. */
+static inline int statemachine_get_column_number(statemachine_ctx *ctx) {
+  return ctx->column_number;
+}
+
+/* Set the current column number. */
+static inline void statemachine_set_column_number(statemachine_ctx *ctx,
+                                                  int column) {
+  ctx->column_number = column;
+}
+
+
+/* Retrieve a human readable description of the error, if there is one.
+ *
+ * Returns ctx->error_msg if next_state is STATEMACHINE_ERROR, NULL otherwise.
+ */
+static inline const char *statemachine_get_error_msg(statemachine_ctx *ctx) {
+  if (ctx->next_state == STATEMACHINE_ERROR) {
+    return ctx->error_msg;
+  } else {
+    return NULL;
+  }
+}
+
+/* Reset the statemachine.
+ *
+ * The state is set to the initialization values. This includes setting the
+ * state to the default state (0), stopping recording and setting the line
+ * number to 1.
+ */
+void statemachine_reset(statemachine_ctx *ctx);
+
+/* Initializes a new statemachine. Receives a statemachine definition object
+ * that should have been initialized with statemachine_definition_new() and a
+ * user reference to be used by the caller.
+ *
+ * Returns NULL if initialization fails.
+ *
+ * Initialization failure is fatal, and if this function fails it may not
+ * deallocate all previously allocated memory.
+ */
+statemachine_ctx *statemachine_new(statemachine_definition *def,
+                                   void *user);
+
+/* Returns a pointer to a context which is a duplicate of the statemachine ctx.
+ * The statemachine definition and the user pointer have to be provided since
+ * these references are not owned by the statemachine itself.
+ */
+statemachine_ctx *statemachine_duplicate(statemachine_ctx *ctx,
+                                         statemachine_definition *def,
+                                         void *user);
+
+/* Copies the context of the statemachine pointed to by src to the statemachine
+ * provided by dst.
+ * The statemachine definition and the user pointer have to be provided since
+ * these references are not owned by the statemachine itself.
+ */
+void statemachine_copy(statemachine_ctx *dst,
+                       statemachine_ctx *src,
+                       statemachine_definition *def,
+                       void *user);
+
+int statemachine_parse(statemachine_ctx *ctx, const char *str, int size);
+
+void statemachine_delete(statemachine_ctx *ctx);
+
+
+/*****
+ * The following functions are only exported for testing purposes and should
+ * be treated as private. */
+
+
+/* Encode the character as an escaped C string.
+ *
+ * Encode the character chr into the string output. Writes at most len
+ * characters to the output string but makes sure output is NULL terminated.
+ */
+void statemachine_encode_char(char chr, char *output, size_t len);
+
+#endif /* STREAMHTMLPARSER_STATEMACHINE_H */
-- 
cgit v1.2.3-54-g00ecf