/* Copyright (c) 2007, Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * --- * Author: Filipe Almeida */ #ifndef STREAMHTMLPARSER_HTMLPARSER_H #define STREAMHTMLPARSER_HTMLPARSER_H #include "statemachine.h" #include "jsparser.h" /* entity filter */ /* String sizes used in htmlparser and entityfilter structures including the * NULL terminator. */ #define HTMLPARSER_MAX_STRING STATEMACHINE_RECORD_BUFFER_SIZE #define HTMLPARSER_MAX_ENTITY_SIZE 10 enum htmlparser_state_external_enum { HTMLPARSER_STATE_TEXT, HTMLPARSER_STATE_TAG, HTMLPARSER_STATE_ATTR, HTMLPARSER_STATE_VALUE, HTMLPARSER_STATE_COMMENT, HTMLPARSER_STATE_JS_FILE, HTMLPARSER_STATE_CSS_FILE, HTMLPARSER_STATE_ERROR }; enum htmlparser_mode { HTMLPARSER_MODE_HTML, HTMLPARSER_MODE_JS, HTMLPARSER_MODE_CSS, HTMLPARSER_MODE_HTML_IN_TAG }; enum htmlparser_attr_type { HTMLPARSER_ATTR_NONE, HTMLPARSER_ATTR_REGULAR, HTMLPARSER_ATTR_URI, HTMLPARSER_ATTR_JS, HTMLPARSER_ATTR_STYLE }; /* TODO(falmeida): Maybe move some of these declaration to the .c and only keep * a forward declaration in here, since these structures are private. */ /* entityfilter context structure. * * The entity filter collection of routines provide a way to decode html * entities from an html document in a streaming way. * * The html_process() function receives a character at a time from the input * stream and returns 0 or more characters which should be appended to the * resulting decoded document. * * Currently this collection of functions are only exported for testing purposes * and shouldn't be called from outside of htmlparser.c. * * Since we really only use these functions with the very specific purpose of * decoding html entities for javascript attributes, only a small subset of * entities are supported: <, >, "e;, &, &apos, and the numeric * character references for both decimal and hexadecimal. */ typedef struct entityfilter_ctx_s { /* Current position into the buffer. */ int buffer_pos; /* True if currently processing an html entity. */ int in_entity; /* Temporary character buffer that is used while processing html entities. */ char buffer[HTMLPARSER_MAX_ENTITY_SIZE]; /* String buffer returned to the application after we decoded an html * entity. */ char output[HTMLPARSER_MAX_ENTITY_SIZE]; } entityfilter_ctx; /* Resets the entityfilter to its initial state so it can be reused. */ void entityfilter_reset(entityfilter_ctx *ctx); /* Initializes a new entity filter object. */ entityfilter_ctx *entityfilter_new(void); /* Deallocates an entity filter object. */ void entityfilter_delete(entityfilter_ctx *ctx); /* Copies the context of the entityfilter pointed to by src to the entityfilter * dst. */ void entityfilter_copy(entityfilter_ctx *dst, entityfilter_ctx *src); /* Processes a character from the input stream and decodes any html entities * in the accumulated buffer. * * Returns a reference to a string that points to an internal buffer. This * buffer will be changed after every call to entityfilter_process(). As * such this string should be duplicated before subsequent calls to * entityfilter_process(). */ const char *entityfilter_process(entityfilter_ctx *ctx, char c); /* html parser */ /* Stores the context of the html parser. * If this structure is changed, htmlparser_new(), htmlparser_copy() and * htmlparser_reset() should be updated accordingly. */ typedef struct htmlparser_ctx_s { /* Holds a reference to the statemachine context. */ statemachine_ctx *statemachine; /* Holds a reference to the statemachine definition in use. Right now this is * only used so we can deallocate it at the end. * * It should be readonly and contain the same values across jsparser * instances. */ /* TODO(falmeida): Change statemachine_def to const. */ statemachine_definition *statemachine_def; /* Holds a reference to the javascript parser. */ jsparser_ctx *jsparser; /* Holds a reference to the entity filter. Used for decoding html entities * inside javascript attributes. */ entityfilter_ctx *entityfilter; /* Offset into the current attribute value where 0 is the first character in * the value. */ int value_index; /* True if currently processing javascript. */ int in_js; /* Current tag name. */ char tag[HTMLPARSER_MAX_STRING]; /* Current attribute name. */ char attr[HTMLPARSER_MAX_STRING]; /* Contents of the current value capped to HTMLPARSER_MAX_STRING. */ char value[HTMLPARSER_MAX_STRING]; } htmlparser_ctx; /* Resets the parser to its initial state and to the default mode, which * is MODE_HTML. * * All internal context like tag name, attribute name or the state of the * statemachine are reset to its original values as if the object was just * created. */ void htmlparser_reset(htmlparser_ctx *ctx); /* Resets the parser to its initial state and changes the parser mode. * All internal context like tag name, attribute name or the state of the * statemachine are reset to their original values as if the object was just * created. * * Available modes: * HTMLPARSER_MODE_HTML - Parses html text * HTMLPARSER_MODE_JS - Parses javascript files * HTMLPARSER_MODE_CSS - Parses CSS files. No actual parsing is actually done * but htmlparser_in_css() always returns true. * HTMLPARSER_MODE_HTML_IN_TAG - Parses an attribute list inside a tag. To * be used in a template expanded in the * following context: * */ void htmlparser_reset_mode(htmlparser_ctx *ctx, int mode); /* Initializes a new htmlparser instance. * * Returns a pointer to the new instance or NULL if the initialization fails. * Initialization failure is fatal, and if this function fails it may not * deallocate all previsouly allocated memory. */ htmlparser_ctx *htmlparser_new(void); /* Copies the context of the htmlparser pointed to by src to the htmlparser dst. * * Also copies over the instances of the state machine, the jsparser and the * entity filter but not the statemachine definition since this one is read * only. */ void htmlparser_copy(htmlparser_ctx *dst, const htmlparser_ctx *src); /* Receives an htmlparser context and returns the current html state. * * The return value will be one of the states of htmlparser_state_external_enum. */ int htmlparser_state(htmlparser_ctx *ctx); /* Parses the input html stream and returns the finishing state. * * Returns HTMLPARSER_ERROR if unable to parse the input. If htmlparser_parse() * is called after an error situation was encountered the behaviour is * unspecified. At this point, htmlparser_reset() or htmlparser_reset_mode() * can be called to reset the state. */ int htmlparser_parse(htmlparser_ctx *ctx, const char *str, int size); /* Returns true if the parser is inside an attribute value and the value is * surrounded by single or double quotes. */ int htmlparser_is_attr_quoted(htmlparser_ctx *ctx); /* Returns true if the parser is currently in javascript. This can be a * an attribute that takes javascript, a javascript block or the parser * can just be in MODE_JS. */ int htmlparser_in_js(htmlparser_ctx *ctx); /* Returns the current tag or NULL if not available or we haven't seen the * entire tag yet. * * There is no stack implemented because we currently don't have a need for * it, which means tag names are tracked only one level deep. * * This is better understood by looking at the following example: * * * [tag=b] * * [tag=i] * * [tag=NULL] * * * The tag is correctly filled inside the tag itself and before any new inner * tag is closed, at which point the tag will be null. * * For our current purposes this is not a problem, but we may implement a tag * tracking stack in the future for completeness. * */ const char *htmlparser_tag(htmlparser_ctx *ctx); /* Returns the current attribute name if after an attribute name or in an * attribute value. Returns NULL otherwise. */ const char *htmlparser_attr(htmlparser_ctx *ctx); /* Returns the contents of the current attribute value. * * Returns NULL if not inside an attribute value. */ const char *htmlparser_value(htmlparser_ctx *ctx); /* Returns true if the parser is currently inside a CSS construct. * * Currently this can be either a STYLE tag, a STYLE attribute or the fact that * the parser was reset in HTMLPARSER_MODE_CSS using * htmlparser_reset_mode(). */ int htmlparser_in_css(htmlparser_ctx *ctx); /* Returns the current state of the javascript state machine. * * Currently only present for testing purposes. */ int htmlparser_js_state(htmlparser_ctx *ctx); /* Returns non-zero if currently inside a javascript string literal and zero * otherwise. */ int htmlparser_is_js_quoted(htmlparser_ctx *ctx); /* Returns non-zero if currently inside an attribute value and zero otherwise. */ int htmlparser_value_index(htmlparser_ctx *ctx); /* Returns true if this is the first character of a url inside an attribute. * * This function can be used by an html sanitizer or auto escaping system as a * hint that it should validate the url for a whitelist of protocol handlers and * for well-formedness, or that it should just escape a component of it. * * For attributes that expect a URL, this will return true if we are at the * first character of the URL, false otherwise. * For most attributes, this is the same as checking that we are at the first * character of the attribute value but it also works correctly for the * "content" attribute of the "meta" tag where the URL follows some earlier * content. * e.g: * * For any other attributes, the result will always be false. */ int htmlparser_is_url_start(htmlparser_ctx *ctx); /* Returns the current attribute type. * * The attribute type can be one of: * HTMLPARSER_ATTR_NONE - not inside an attribute. * HTMLPARSER_ATTR_REGULAR - Inside a normal attribute. * HTMLPARSER_ATTR_URI - Inside an attribute that accepts a uri. * HTMLPARSER_ATTR_JS - Inside a javascript attribute. * HTMLPARSER_ATTR_STYLE - Inside a css style attribute. */ int htmlparser_attr_type(htmlparser_ctx *ctx); /* Return the current line number. */ int htmlparser_get_line_number(htmlparser_ctx *ctx); /* Set the current line number. */ void htmlparser_set_line_number(htmlparser_ctx *ctx, int line); /* Return the current column number. */ int htmlparser_get_column_number(htmlparser_ctx *ctx); /* Set the current column number. */ void htmlparser_set_column_number(htmlparser_ctx *ctx, int column); /* Retrieve a human readable error message in case an error occurred. * * NULL is returned if the parser didn't encounter an error. */ const char *htmlparser_get_error_msg(htmlparser_ctx *ctx); /* Invoked by the caller when text is expanded by the caller. * * Should be invoked when a template directive that expands to content is * executed but we don't provide this content to the parser itself. This changes * the current state by following the default rule, ensuring we stay in sync * with the template. * * Returns 1 if template directives are accepted for this state and 0 if they * are not, which should result in an error condition. * * Right now the only case being handled are unquoted attribute values and it * always returns 1. When insert_text() is called after the equals sign, we * assume some text was consumed and we are now in the middle of the attribute * value itself. Example: * * * * The template calls insert_text() when it encounters $HREF_VALUE. If it didn't * the parser would only have seen the following html: * * * * and would interpret alt=alternate_text as the value of the href attribute. */ int htmlparser_insert_text(htmlparser_ctx *ctx); /* Deallocates an htmlparser context object. */ void htmlparser_delete(htmlparser_ctx *ctx); #define htmlparser_parse_chr(a,b) htmlparser_parse(a, &(b), 1); #ifdef __cplusplus #define htmlparser_parse_str(a,b) htmlparser_parse(a, b, \ static_cast(strlen(b))); #else #define htmlparser_parse_str(a,b) htmlparser_parse(a, b, (int)strlen(b)); #endif #endif /* STREAMHTMLPARSER_HTMLPARSER_H */