diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-07-14 22:30:39 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-07-14 22:30:39 +0200 |
commit | 2db452d3d57df4b91375c0176e3a9527dbbc537c (patch) | |
tree | 96289944f5195438c83895c31a8e530efbd4045a /streamhtmlparser/htmlparser_cpp.h | |
parent | 3c903a9a1784edc375119f1db7992e94765c0fbf (diff) | |
download | crawler-2db452d3d57df4b91375c0176e3a9527dbbc537c.tar.gz crawler-2db452d3d57df4b91375c0176e3a9527dbbc537c.tar.bz2 |
first working crawler
Diffstat (limited to 'streamhtmlparser/htmlparser_cpp.h')
-rw-r--r-- | streamhtmlparser/htmlparser_cpp.h | 322 |
1 files changed, 322 insertions, 0 deletions
diff --git a/streamhtmlparser/htmlparser_cpp.h b/streamhtmlparser/htmlparser_cpp.h new file mode 100644 index 0000000..3802233 --- /dev/null +++ b/streamhtmlparser/htmlparser_cpp.h @@ -0,0 +1,322 @@ +// Copyright (c) 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// --- +// Author: Filipe Almeida +// +// c++ bindings for htmlparser. + +#ifndef STREAMHTMLPARSER_HTMLPARSER_CPP_H__ +#define STREAMHTMLPARSER_HTMLPARSER_CPP_H__ + +#include <string> +#include <assert.h> +extern "C" { + #include "htmlparser.h" + #include "jsparser.h" +} + +namespace streamhtmlparser { + +class JavascriptParser { + public: + enum State { + STATE_TEXT = JSPARSER_STATE_TEXT, + STATE_Q = JSPARSER_STATE_Q, + STATE_DQ = JSPARSER_STATE_DQ, + STATE_REGEXP = JSPARSER_STATE_REGEXP, + STATE_COMMENT = JSPARSER_STATE_COMMENT + }; +}; + +class HtmlParser { + public: + + /* html states */ + enum State { + STATE_TEXT = HTMLPARSER_STATE_TEXT, + STATE_TAG = HTMLPARSER_STATE_TAG, + STATE_ATTR = HTMLPARSER_STATE_ATTR, + STATE_VALUE = HTMLPARSER_STATE_VALUE, + STATE_COMMENT = HTMLPARSER_STATE_COMMENT, + STATE_JS_FILE = HTMLPARSER_STATE_JS_FILE, + STATE_CSS_FILE = HTMLPARSER_STATE_CSS_FILE, + STATE_ERROR = HTMLPARSER_STATE_ERROR + }; + + /* attribute types */ + enum AttributeType { + ATTR_NONE = HTMLPARSER_ATTR_NONE, + ATTR_REGULAR = HTMLPARSER_ATTR_REGULAR, + ATTR_URI = HTMLPARSER_ATTR_URI, + ATTR_JS = HTMLPARSER_ATTR_JS, + ATTR_STYLE = HTMLPARSER_ATTR_STYLE + }; + + /* Parser modes */ + enum Mode { + MODE_HTML = HTMLPARSER_MODE_HTML, + MODE_JS = HTMLPARSER_MODE_JS, + MODE_CSS = HTMLPARSER_MODE_CSS, + MODE_HTML_IN_TAG = HTMLPARSER_MODE_HTML_IN_TAG + }; + + HtmlParser() { + parser_ = htmlparser_new(); + assert(parser_ != NULL); + }; + + /* Parses the input html stream and returns the finishing state. + * + * Returns HtmlParser::STATE_ERROR if unable to parse the input. If + * htmlparser_parse() is called after an error situation was encountered + * the behaviour is unspecified. At this point, Reset() or ResetMode() + * can be called to reset the state so it can be used to parse a new file. + */ + int Parse(const char *str, int len) { + return htmlparser_parse(parser_, str, len); + }; + + int Parse(const std::string &str) { + return Parse(str.c_str(), static_cast<int>(str.length())); + }; + + /* Returns the current state the parser is in */ + int state() const { + return htmlparser_state(parser_); + }; + + /* Returns the current tag or NULL if not available. + * + * There is no stack implemented because we currently don't have a need for + * it, which means tag names are tracked only one level deep. + * + * This is better understood by looking at the following example: + * + * <b [tag=b]> + * [tag=b] + * <i> + * [tag=i] + * </i> + * [tag=NULL] + * </b> + * + * The tag is correctly filled inside the tag itself and before any new + * inner tag is closed, at which point the tag will be set to NULL. + * + * For our current purposes this is not a problem, but we may implement a + * tag tracking stack in the future for completeness. + */ + const char *tag() const { + return htmlparser_tag(parser_); + } + + /* Returns the current attribute name if inside an attribute name or an + * attribute value. Returns NULL otherwise. */ + const char *attribute() const { + return htmlparser_attr(parser_); + } + + /* Returns the contents of the current attribute value. */ + const char *value() const { + return htmlparser_value(parser_); + } + + /* Returns true if inside javascript. This can be a javascript block, a + * javascript attribute value or the parser may just be in javascript mode + * (HtmlParser::MODE_JS) */ + bool InJavascript() const { + return static_cast<bool>(htmlparser_in_js(parser_)); + } + + /* Returns true if the parser is currently inside a CSS construct. + * + * Currently this can be either a STYLE tag, a STYLE attribute or the fact + * that the parser was reset using MODE_CSS using ResetMode(). + */ + bool InCss() const { + return static_cast<bool>(htmlparser_in_css(parser_)); + } + + /* Returns true if the current attribute is quoted */ + bool IsAttributeQuoted() const { + return static_cast<bool>(htmlparser_is_attr_quoted(parser_)); + } + + /* Returns true if the parser is inside a js string literal. + */ + bool IsJavascriptQuoted() const { + return static_cast<bool>(htmlparser_is_js_quoted(parser_)); + } + + /* Returns the index within the current value or -1 if the parser is not + * inside an attribute value */ + int ValueIndex() const { + return htmlparser_value_index(parser_); + } + + /* Returns true if this is the first character of a url inside an attribute. + * + * This function can be used by an html sanitizer or auto escaping system as + * a hint that it should validate the url for a whitelist of protocol + * handlers and for well-formedness, or that it should just escape a + * component of it. + * + * For attributes that expect a url this will return true if we are at the + * first character of the attribute, but for the special case of a meta + * redirect tag some analysis is made in order to verify if we are at the + * start of a url or not. + * + * For any other attributes, the result will always be false. + * + */ + bool IsUrlStart() const { + return htmlparser_is_url_start(parser_); + } + + /* Returns the current attribute type. + * + * The attribute type can be one of: + * ATTR_NONE - not inside an attribute + * ATTR_REGULAR - Inside a normal attribute + * ATTR_URI - Inside an attribute that accepts a uri + * ATTR_JS - Inside a javascript attribute + * ATTR_STYLE - Inside a css style attribute + * */ + int AttributeType() const { + return htmlparser_attr_type(parser_); + } + + /* Return the current line number. */ + int line_number() const { + return htmlparser_get_line_number(parser_); + } + + /* Set the current line number. */ + void set_line_number(int line) { + return htmlparser_set_line_number(parser_, line); + } + + /* Return the current column number. */ + int column_number() const { + return htmlparser_get_column_number(parser_); + } + + /* Set the current line number. */ + void set_column_number(int column) { + return htmlparser_set_column_number(parser_, column); + } + + /* Retrieve a human readable error message in case an error occurred. + * + * NULL is returned if the parser didn't encounter an error. + */ + const char *GetErrorMessage() { + return htmlparser_get_error_msg(parser_); + } + + /* Returns the current state the javascript parser is in. + * + * Should only be used for testing. + */ + int javascript_state() const { + return htmlparser_js_state(parser_); + }; + + /* Resets the parser to it's initial state and changes the parser mode. + * + * Internal state (tag name, attribute name, state of statemachine) is + * reset as * though the object was just created. + * + * Available modes: + * MODE_HTML - Parses html text + * MODE_JS - Parses javascript files + * MODE_CSS - Parses CSS files. No actual parsing is actually done + * but InCss() always returns true. + * MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To + * be used in a template expanded in the + * following context: <a $template> + */ + void ResetMode(enum Mode mode) { + return htmlparser_reset_mode(parser_, mode); + } + + /* Resets the parser to it's initial state and to the default mode, which is + * MODE_HTML. + * + * All internal context like tag name, attribute name or the state of the + * statemachine are reset to it's original values as if the object was just + * created. + */ + void Reset() { + return htmlparser_reset(parser_); + } + + /* Invoked when text is inserted by the caller. + * + * Should be called before a template directive that expands to content is + * found. This changes the current state by following the default rule, + * ensuring we stay in sync with template. + * + * Returns true if template directives are accepted for this state and + * false if they are not, which should result in an error condition. + * + * Right now the only case being handled are unquoted attribute values and + * it always returns true. In the future we can handle more cases and + * restrict the states were we allow template directives by returning false + * for those. + */ + bool InsertText() { + return static_cast<bool>(htmlparser_insert_text(parser_)); + } + + /* Copies the context of the HtmlParser object referenced in source to the + * current object. + */ + void CopyFrom(const HtmlParser *source) { + assert(this != source); + assert(source != NULL); + htmlparser_copy(parser_, source->parser_); + } + + ~HtmlParser() { + htmlparser_delete(parser_); + }; + + + private: + htmlparser_ctx *parser_; + HtmlParser(const HtmlParser&); // disallow copy + void operator=(const HtmlParser&); // and assign + +}; + +} + +#endif // STREAMHTMLPARSER_HTMLPARSER_CPP_H__ |