summaryrefslogtreecommitdiff
path: root/streamhtmlparser/include/htmlparser_cpp.h
diff options
context:
space:
mode:
Diffstat (limited to 'streamhtmlparser/include/htmlparser_cpp.h')
-rw-r--r--streamhtmlparser/include/htmlparser_cpp.h322
1 files changed, 322 insertions, 0 deletions
diff --git a/streamhtmlparser/include/htmlparser_cpp.h b/streamhtmlparser/include/htmlparser_cpp.h
new file mode 100644
index 0000000..3802233
--- /dev/null
+++ b/streamhtmlparser/include/htmlparser_cpp.h
@@ -0,0 +1,322 @@
+// Copyright (c) 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ---
+// Author: Filipe Almeida
+//
+// c++ bindings for htmlparser.
+
+#ifndef STREAMHTMLPARSER_HTMLPARSER_CPP_H__
+#define STREAMHTMLPARSER_HTMLPARSER_CPP_H__
+
+#include <string>
+#include <assert.h>
+extern "C" {
+ #include "htmlparser.h"
+ #include "jsparser.h"
+}
+
+namespace streamhtmlparser {
+
+class JavascriptParser {
+ public:
+ enum State {
+ STATE_TEXT = JSPARSER_STATE_TEXT,
+ STATE_Q = JSPARSER_STATE_Q,
+ STATE_DQ = JSPARSER_STATE_DQ,
+ STATE_REGEXP = JSPARSER_STATE_REGEXP,
+ STATE_COMMENT = JSPARSER_STATE_COMMENT
+ };
+};
+
+class HtmlParser {
+ public:
+
+ /* html states */
+ enum State {
+ STATE_TEXT = HTMLPARSER_STATE_TEXT,
+ STATE_TAG = HTMLPARSER_STATE_TAG,
+ STATE_ATTR = HTMLPARSER_STATE_ATTR,
+ STATE_VALUE = HTMLPARSER_STATE_VALUE,
+ STATE_COMMENT = HTMLPARSER_STATE_COMMENT,
+ STATE_JS_FILE = HTMLPARSER_STATE_JS_FILE,
+ STATE_CSS_FILE = HTMLPARSER_STATE_CSS_FILE,
+ STATE_ERROR = HTMLPARSER_STATE_ERROR
+ };
+
+ /* attribute types */
+ enum AttributeType {
+ ATTR_NONE = HTMLPARSER_ATTR_NONE,
+ ATTR_REGULAR = HTMLPARSER_ATTR_REGULAR,
+ ATTR_URI = HTMLPARSER_ATTR_URI,
+ ATTR_JS = HTMLPARSER_ATTR_JS,
+ ATTR_STYLE = HTMLPARSER_ATTR_STYLE
+ };
+
+ /* Parser modes */
+ enum Mode {
+ MODE_HTML = HTMLPARSER_MODE_HTML,
+ MODE_JS = HTMLPARSER_MODE_JS,
+ MODE_CSS = HTMLPARSER_MODE_CSS,
+ MODE_HTML_IN_TAG = HTMLPARSER_MODE_HTML_IN_TAG
+ };
+
+ HtmlParser() {
+ parser_ = htmlparser_new();
+ assert(parser_ != NULL);
+ };
+
+ /* Parses the input html stream and returns the finishing state.
+ *
+ * Returns HtmlParser::STATE_ERROR if unable to parse the input. If
+ * htmlparser_parse() is called after an error situation was encountered
+ * the behaviour is unspecified. At this point, Reset() or ResetMode()
+ * can be called to reset the state so it can be used to parse a new file.
+ */
+ int Parse(const char *str, int len) {
+ return htmlparser_parse(parser_, str, len);
+ };
+
+ int Parse(const std::string &str) {
+ return Parse(str.c_str(), static_cast<int>(str.length()));
+ };
+
+ /* Returns the current state the parser is in */
+ int state() const {
+ return htmlparser_state(parser_);
+ };
+
+ /* Returns the current tag or NULL if not available.
+ *
+ * There is no stack implemented because we currently don't have a need for
+ * it, which means tag names are tracked only one level deep.
+ *
+ * This is better understood by looking at the following example:
+ *
+ * <b [tag=b]>
+ * [tag=b]
+ * <i>
+ * [tag=i]
+ * </i>
+ * [tag=NULL]
+ * </b>
+ *
+ * The tag is correctly filled inside the tag itself and before any new
+ * inner tag is closed, at which point the tag will be set to NULL.
+ *
+ * For our current purposes this is not a problem, but we may implement a
+ * tag tracking stack in the future for completeness.
+ */
+ const char *tag() const {
+ return htmlparser_tag(parser_);
+ }
+
+ /* Returns the current attribute name if inside an attribute name or an
+ * attribute value. Returns NULL otherwise. */
+ const char *attribute() const {
+ return htmlparser_attr(parser_);
+ }
+
+ /* Returns the contents of the current attribute value. */
+ const char *value() const {
+ return htmlparser_value(parser_);
+ }
+
+ /* Returns true if inside javascript. This can be a javascript block, a
+ * javascript attribute value or the parser may just be in javascript mode
+ * (HtmlParser::MODE_JS) */
+ bool InJavascript() const {
+ return static_cast<bool>(htmlparser_in_js(parser_));
+ }
+
+ /* Returns true if the parser is currently inside a CSS construct.
+ *
+ * Currently this can be either a STYLE tag, a STYLE attribute or the fact
+ * that the parser was reset using MODE_CSS using ResetMode().
+ */
+ bool InCss() const {
+ return static_cast<bool>(htmlparser_in_css(parser_));
+ }
+
+ /* Returns true if the current attribute is quoted */
+ bool IsAttributeQuoted() const {
+ return static_cast<bool>(htmlparser_is_attr_quoted(parser_));
+ }
+
+ /* Returns true if the parser is inside a js string literal.
+ */
+ bool IsJavascriptQuoted() const {
+ return static_cast<bool>(htmlparser_is_js_quoted(parser_));
+ }
+
+ /* Returns the index within the current value or -1 if the parser is not
+ * inside an attribute value */
+ int ValueIndex() const {
+ return htmlparser_value_index(parser_);
+ }
+
+ /* Returns true if this is the first character of a url inside an attribute.
+ *
+ * This function can be used by an html sanitizer or auto escaping system as
+ * a hint that it should validate the url for a whitelist of protocol
+ * handlers and for well-formedness, or that it should just escape a
+ * component of it.
+ *
+ * For attributes that expect a url this will return true if we are at the
+ * first character of the attribute, but for the special case of a meta
+ * redirect tag some analysis is made in order to verify if we are at the
+ * start of a url or not.
+ *
+ * For any other attributes, the result will always be false.
+ *
+ */
+ bool IsUrlStart() const {
+ return htmlparser_is_url_start(parser_);
+ }
+
+ /* Returns the current attribute type.
+ *
+ * The attribute type can be one of:
+ * ATTR_NONE - not inside an attribute
+ * ATTR_REGULAR - Inside a normal attribute
+ * ATTR_URI - Inside an attribute that accepts a uri
+ * ATTR_JS - Inside a javascript attribute
+ * ATTR_STYLE - Inside a css style attribute
+ * */
+ int AttributeType() const {
+ return htmlparser_attr_type(parser_);
+ }
+
+ /* Return the current line number. */
+ int line_number() const {
+ return htmlparser_get_line_number(parser_);
+ }
+
+ /* Set the current line number. */
+ void set_line_number(int line) {
+ return htmlparser_set_line_number(parser_, line);
+ }
+
+ /* Return the current column number. */
+ int column_number() const {
+ return htmlparser_get_column_number(parser_);
+ }
+
+ /* Set the current line number. */
+ void set_column_number(int column) {
+ return htmlparser_set_column_number(parser_, column);
+ }
+
+ /* Retrieve a human readable error message in case an error occurred.
+ *
+ * NULL is returned if the parser didn't encounter an error.
+ */
+ const char *GetErrorMessage() {
+ return htmlparser_get_error_msg(parser_);
+ }
+
+ /* Returns the current state the javascript parser is in.
+ *
+ * Should only be used for testing.
+ */
+ int javascript_state() const {
+ return htmlparser_js_state(parser_);
+ };
+
+ /* Resets the parser to it's initial state and changes the parser mode.
+ *
+ * Internal state (tag name, attribute name, state of statemachine) is
+ * reset as * though the object was just created.
+ *
+ * Available modes:
+ * MODE_HTML - Parses html text
+ * MODE_JS - Parses javascript files
+ * MODE_CSS - Parses CSS files. No actual parsing is actually done
+ * but InCss() always returns true.
+ * MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To
+ * be used in a template expanded in the
+ * following context: <a $template>
+ */
+ void ResetMode(enum Mode mode) {
+ return htmlparser_reset_mode(parser_, mode);
+ }
+
+ /* Resets the parser to it's initial state and to the default mode, which is
+ * MODE_HTML.
+ *
+ * All internal context like tag name, attribute name or the state of the
+ * statemachine are reset to it's original values as if the object was just
+ * created.
+ */
+ void Reset() {
+ return htmlparser_reset(parser_);
+ }
+
+ /* Invoked when text is inserted by the caller.
+ *
+ * Should be called before a template directive that expands to content is
+ * found. This changes the current state by following the default rule,
+ * ensuring we stay in sync with template.
+ *
+ * Returns true if template directives are accepted for this state and
+ * false if they are not, which should result in an error condition.
+ *
+ * Right now the only case being handled are unquoted attribute values and
+ * it always returns true. In the future we can handle more cases and
+ * restrict the states were we allow template directives by returning false
+ * for those.
+ */
+ bool InsertText() {
+ return static_cast<bool>(htmlparser_insert_text(parser_));
+ }
+
+ /* Copies the context of the HtmlParser object referenced in source to the
+ * current object.
+ */
+ void CopyFrom(const HtmlParser *source) {
+ assert(this != source);
+ assert(source != NULL);
+ htmlparser_copy(parser_, source->parser_);
+ }
+
+ ~HtmlParser() {
+ htmlparser_delete(parser_);
+ };
+
+
+ private:
+ htmlparser_ctx *parser_;
+ HtmlParser(const HtmlParser&); // disallow copy
+ void operator=(const HtmlParser&); // and assign
+
+};
+
+}
+
+#endif // STREAMHTMLPARSER_HTMLPARSER_CPP_H__