diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-07-14 17:16:21 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-07-14 17:16:21 +0200 |
commit | 54cce110784d33d658b5f78286a98bee244a9eeb (patch) | |
tree | 9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /streamhtmlparser/jsparser.c | |
parent | fcb682cb1955d362390665330fdf476cab7dc10b (diff) | |
download | crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2 |
added streamhtmlparser
Diffstat (limited to 'streamhtmlparser/jsparser.c')
-rw-r--r-- | streamhtmlparser/jsparser.c | 662 |
1 files changed, 662 insertions, 0 deletions
diff --git a/streamhtmlparser/jsparser.c b/streamhtmlparser/jsparser.c new file mode 100644 index 0000000..9d71c74 --- /dev/null +++ b/streamhtmlparser/jsparser.c @@ -0,0 +1,662 @@ +/* Copyright (c) 2007, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * --- + * Author: Filipe Almeida + */ + +#include "config.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include "statemachine.h" +#include "jsparser.h" + +/* So we can support both C and C++ compilers, we use the CAST() macro instead + * of using C style casts or static_cast<>() directly. + */ +#ifdef __cplusplus + #define CAST(type, expression) (static_cast<type>(expression)) +#else + #define CAST(type, expression) ((type)(expression)) +#endif + +/* Generated state machine definition. */ +#include "jsparser_fsm.h" + +/* List of keywords that can precede a regular expression literal. Taken from: + * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html + * + * 'void' was added to this list. + * The list is used as input to a binary search, so it must be kept in a sorted + * form. + * There are a large number of keywords in here that don't exist in + * ecmascript 3, either because they are reserved or because they are part of + * ecmascript 4. However they weren't removed in order to keep the list in sync + * with the previous document. + */ +static const char *regexp_token_prefix[] = { + "abstract", + "break", + "case", + "catch", + "class", + "const", + "continue", + "debugger", + "default", + "delete", + "do", + "else", + "enum", + "eval", + "export", + "extends", + "field", + "final", + "finally", + "for", + "function", + "goto", + "if", + "implements", + "import", + "in", + "instanceof", + "native", + "new", + "package", + "private", + "protected", + "public", + "return", + "static", + "switch", + "synchronized", + "throw", + "throws", + "transient", + "try", + "typeof", + "var", + "void", + "volatile", + "while", + "with" +}; + +/* Utility functions */ + +/* Converts the internal state into the external superstate. + */ +static inline int state_external(int state) +{ + assert(state < JSPARSER_NUM_STATES); + assert(state >= 0); + return jsparser_states_external[state]; +} + +/* Returns true if the character is an ecmascript whitespace or line terminator + * character. Includes the characters in sections 7.2 and 7.3 of ECMAScript 3 + * with the exception of unicode space and line terminators: + * http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf + */ +static inline int js_is_whitespace(char c) +{ + return c == '\t' || /* Tab 0x09 */ + c == '\v' || /* Vertical Tab 0x0B */ + c == '\f' || /* Form Feed 0x0C */ + c == ' ' || /* Space 0x20 */ + c == '\xa0' || /* No-Break Space 0xA0 */ + c == '\n' || /* line Feed 0x0A */ + c == '\r'; /* Carriage Return 0x0D */ +} + +/* Returns true if the character is part of a javascript identifier. The rules + * are pretty relaxed here since we don't accept unicode and don't care if the + * first character doesn't contain numbers or not. + * + * For more detail on the limitations of having this relaxed set of characters + * please see the comments in_state_js_text(). + */ +static inline int js_is_identifier(char c) { + return (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || + c == '_' || + c == '$'; +} + +/* Appends a character to the ring buffer. Sequences of whitespace and newlines + * are folded into one. + * + * js->buffer_start points to the first character in the buffer and + * js->buffer_end points to the position of the next character to be written, + * or one plus the last character written. If the buffer is full there will be + * an empty slot position so we can diferentiate an empty buffer from a full + * buffer. + * + * If the buffer is empty, then: + * js->buffer_start == js->buffer_end. + * If the buffer is full, then: + * (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE == js->buffer_start. + * + */ +void jsparser_buffer_append_chr(jsparser_ctx *js, char chr) +{ + /* Fold whitespace so we have enough space in the buffer. */ + if (js_is_whitespace(chr) && + js_is_whitespace(jsparser_buffer_get(js, -1))) { + return; + } + + js->buffer[js->buffer_end] = chr; + js->buffer_end = (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE; + if (js->buffer_end == js->buffer_start) { + js->buffer_start = (js->buffer_end + 1) % + JSPARSER_RING_BUFFER_SIZE; + } +} + +/* Appends a string to the ring buffer. Sequences of whitespace and newlines + * are folded into one. + */ +void jsparser_buffer_append_str(jsparser_ctx *js, const char *str) +{ + assert(js != NULL); + assert(str != NULL); + + while(*str != '\0') { + jsparser_buffer_append_chr(js, *str++); + } +} + +/* Returns the position relative to the start of the buffer or -1 if past the + * size of the buffer.. + */ +static inline int jsparser_buffer_absolute_pos(jsparser_ctx *js, int pos) +{ + int absolute_pos; + int buffer_len; + assert(pos < 0); + + if(pos <= -JSPARSER_RING_BUFFER_SIZE) { + return -1; + } + + buffer_len = js->buffer_end - js->buffer_start; + if (buffer_len < 0) { + buffer_len += JSPARSER_RING_BUFFER_SIZE; + } + + if (pos < -buffer_len) { + return -1; + } + + absolute_pos = (pos + js->buffer_end) % JSPARSER_RING_BUFFER_SIZE; + if (absolute_pos < 0) { + absolute_pos += JSPARSER_RING_BUFFER_SIZE; + } + + return absolute_pos; +} + +/* Returns the last appended character and removes it from the buffer. If the + * buffer is empty, then it returns ASCII 0 ('\0'). + */ +char jsparser_buffer_pop(jsparser_ctx *js) +{ + if (js->buffer_start == js->buffer_end) { + return '\0'; + } + + js->buffer_end--; + if (js->buffer_end < 0) { + js->buffer_end += JSPARSER_RING_BUFFER_SIZE; + } + + return js->buffer[js->buffer_end]; +} + +/* Returns the value of the character at a certain index in the buffer or an + * ASCII 0 ('\0') character if the index is outside the buffer boundaries. + * + * Index positions are negative, were -1 is the last character appended to the + * buffer. Using positive integers for the index will result in undefined + * behaviour. + */ +char jsparser_buffer_get(jsparser_ctx *js, int pos) +{ + int absolute_pos; + assert(pos < 0); + + absolute_pos = jsparser_buffer_absolute_pos(js, pos); + if (absolute_pos < 0) { + return '\0'; + } + + return js->buffer[absolute_pos]; +} + +/* Sets the value of the character at a certain index in the buffer. Returns + * true if the write was successful or false if there was an attempt to write + * outside of the buffer boundaries. + * + * Index positions are negative, were -1 is the last character appended to the + * buffer. Using positive integers for the index will result in undefined + * behaviour. + */ +int jsparser_buffer_set(jsparser_ctx *js, int pos, char value) +{ + int absolute_pos; + assert(pos < 0); + + absolute_pos = jsparser_buffer_absolute_pos(js, pos); + if (absolute_pos < 0) { + return 0; + } + + js->buffer[absolute_pos] = value; + return 1; +} + +/* Copies a slice of the buffer to the string pointed to by output. start and + * end are the indexes of the sliced region. If start extends beyond the + * beginning of the buffer, the slice will only contain character from the + * beginning of the buffer. + */ +void jsparser_buffer_slice(jsparser_ctx *js, char *output, int start, int end) +{ + int pos; + + assert(start <= end); + assert(start < 0); + assert(end < 0); + + for (pos = start; pos <= end; ++pos) { + char c; + c = jsparser_buffer_get(js, pos); + if (c != '\0') { + *output++ = jsparser_buffer_get(js, pos); + } + } + *output++ = '\0'; +} + +/* Copy the last javascript identifier or keyword found in the buffer to the + * string pointed by identifier. + * + * For simplicity, we consider an identifier to be a sequence of alphanumeric + * characters (as opposed to a digit followed by an alphanumeric character. + * + * Returns 0 if no identifier was matched, in which case the identifier + * argument is replaced with an empty string, or non 0 if the identifier was + * found. + */ +int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier) +{ + int end; + int pos; + + assert(identifier != NULL); + + end = -1; + /* Ignore the optional whitespace delimiter */ + if (js_is_whitespace(jsparser_buffer_get(js, -1))) { + --end; + } + + /* Find the beginning of the identifier. This loop ends either when we find a + * character that doesn't belong to an identifier, or when we find a '\0' + * character, which means we reached the end of the buffer. */ + for(pos = end; js_is_identifier(jsparser_buffer_get(js, pos)); --pos) { + } + if (pos + 1 >= end) { + identifier[0] = '\0'; + return 0; + } + + jsparser_buffer_slice(js, identifier, pos + 1, end); + return 1; +} + +/* Callback used in bsearch() for comparing a string against an array of + * strings. + */ +static int bsearch_strcmp(const void *a, const void *b) +{ + return strcmp(CAST(const char*, a), *CAST(const char * const *, b)); +} + +/* Returns true if the token argument can be a token prefix to a javascript + * regular expression. + * + * The token argument is compared against a list of identifiers that can + * precede a regular expression in the javascript grammar, and returns true if + * the argument is found on that list. + */ +static inline int is_regexp_token_prefix(char *token) +{ + assert(token != NULL); + + return bsearch(token, + regexp_token_prefix, + sizeof(regexp_token_prefix) / sizeof(char *), + sizeof(char *), bsearch_strcmp) != NULL; +} + +/* Called for every character in state text. + * + * We copy every character we find when we are in state text to the ring + * buffer. This has the side effect of also pushing slash characters that are + * part of comments into the buffer, although for parsing purposes these should + * be treated as whitespace. This issue is addressed in + * enter_state_js_comment_ml_after(). + */ +static void in_state_js_text(statemachine_ctx *ctx, int start, char chr, + int end) +{ + jsparser_ctx *js = CAST(jsparser_ctx *, ctx->user); + assert(js != NULL); + + (void)start; + (void)end; + + jsparser_buffer_append_chr(js, chr); +} + +/* This function is called every time we find a slash ('/') character in the + * javascript text (except for slashes that close comments or regexp literals). + * + * Implements the logic to figure out if this slash character is a division + * operator or if it opens a regular expression literal. This is heavily + * inspired by the syntactic resynchronization for javascript 2.0: + * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html + * + * When we receive a '/', we look at the previous non space character to figure + * out if it's the ending of a punctuator that can precede a regexp literal, in + * which case we assume the current '/' is part of a regular expression literal + * (or the opening of a javascript comment, but that part is dealt with in the + * state machine). The exceptions to this are unary operators, so we look back + * a second character to rule out '++' and '--'. Although it is not + * straightforward to figure out if the binary operator is a postfix of the + * previous expression or a prefix of the regular expression, we rule out the + * later as it is an uncommon practice. + * + * If we ruled out the previous token to be a valid regexp preceding + * punctuator, we extract the last identifier in the buffer and match against a + * list of keywords that are known to precede expressions in the grammar. If we + * get a match on any of these keywords, then we are opening a regular + * expression, if not, then we have a division operator. + * + * Known cases that are accepted by the grammar but we handle differently, + * although I don't believe there is a legitimate usage for those: + * + * Division of a regular expression: + * var result = /test/ / 5; + * + * Prefix unary increment of a regular expression: + * var result = ++/test/; + * + * Division of an object literal: + * { a: 1 } /x/.exec('x'); + * + * We only support ascii right now, so unicode characters in identifiers will + * be treated as delimiters, effectively breaking the identifier name where + * they appear, and this may cause issues in very specific situations. Namely, + * if we have a unicode character in an identifier directly preceding a suffix + * that matches one of the keywords in regexp_token_prefix[], if this + * identifier precedes a / (slash) character: + * + * var x = test<unicode char>return / 5; + * + * We will interpret that slash as the start of a regular expression, when in + * reality it is a division operator. + */ +static void enter_state_js_slash(statemachine_ctx *ctx, int start, char chr, + int end) +{ + jsparser_ctx *js; + char buffer[JSPARSER_RING_BUFFER_SIZE]; + int pos; + + (void)start; + (void)end; + + assert(ctx != NULL); + assert(ctx->user != NULL); + + js = CAST(jsparser_ctx *, ctx->user); + assert(js != NULL); + + pos = -1; + /* Consume the last whitespace. */ + if (js_is_whitespace(jsparser_buffer_get(js, pos))) { + --pos; + } + + switch (jsparser_buffer_get(js, pos)) { + /* Ignore unary increment */ + case '+': + if (jsparser_buffer_get(js, pos - 1) != '+') { + ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH; + } + break; + + /* Ignore unary decrement */ + case '-': + if (jsparser_buffer_get(js, pos - 1) != '-') { + ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH; + } + break; + + /* List of punctuator endings except ), ], }, + and - */ + case '=': + case '<': + case '>': + case '&': + case '|': + case '!': + case '%': + case '*': + case '/': + case ',': + case ';': + case '?': + case ':': + case '^': + case '~': + case '{': + case '(': + case '[': + case '}': + case '\0': + ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH; + break; + + default: + if (jsparser_buffer_last_identifier(js, buffer) && + is_regexp_token_prefix(buffer)) { + ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH; + } + } + + jsparser_buffer_append_chr(js, chr); +} + +/* Called at the end of a javascript comment. + * + * When we open a comment, the initial '/' was inserted into the ring buffer, + * but it is not a token and should be considered whitespace for parsing + * purposes. + * + * When we first saw the '/' character, we didn't yet know if it was the + * beginning of a comment, a division operator, or a regexp. + * + * In this function we just replace the inital '/' with a whitespace character, + * unless we had a preceding whitespace character, in which case we just remove + * the '/'. This is needed to ensure all spaces in the buffer are correctly + * folded. + */ +static void enter_state_js_comment_after(statemachine_ctx *ctx, int start, + char chr, int end) +{ + jsparser_ctx *js; + + assert(ctx != NULL); + assert(ctx->user != NULL); + + (void)start; + (void)chr; + (void)end; + + js = CAST(jsparser_ctx *, ctx->user); + + if (js_is_whitespace(jsparser_buffer_get(js, -2))) { + (void)jsparser_buffer_pop(js); + } else { + jsparser_buffer_set(js, -1, ' '); + } +} + +static statemachine_definition *create_statemachine_definition(void) +{ + statemachine_definition *def; + def = statemachine_definition_new(JSPARSER_NUM_STATES); + if (def == NULL) + return NULL; + + /* TODO(falmeida): Check return value. */ + statemachine_definition_populate(def, jsparser_state_transitions, + jsparser_states_internal_names); + + statemachine_in_state(def, JSPARSER_STATE_INT_JS_TEXT, + in_state_js_text); + statemachine_enter_state(def, JSPARSER_STATE_INT_JS_SLASH, + enter_state_js_slash); + statemachine_enter_state(def, JSPARSER_STATE_INT_JS_COMMENT_AFTER, + enter_state_js_comment_after); + + return def; +} + + +/* Initializes a new jsparser instance. + * + * Returns a pointer to the new instance or NULL if the initialization + * fails. + * + * Initialization failure is fatal, and if this function fails it may not + * deallocate all previsouly allocated memory. + */ + +jsparser_ctx *jsparser_new(void) +{ + jsparser_ctx *js; + + js = CAST(jsparser_ctx *, calloc(1, sizeof(jsparser_ctx))); + if (js == NULL) + return NULL; + + js->statemachine_def = create_statemachine_definition(); + if (js->statemachine_def == NULL) + return NULL; + + js->statemachine = statemachine_new(js->statemachine_def, js); + if (js->statemachine == NULL) + return NULL; + + jsparser_reset(js); + + return js; +} + +/* Returns a pointer to a context which is a duplicate of the jsparser src. + */ +jsparser_ctx *jsparser_duplicate(jsparser_ctx *src) +{ + jsparser_ctx *dst; + assert(src != NULL); + + dst = jsparser_new(); + if (dst == NULL) + return NULL; + + jsparser_copy(dst, src); + + return dst; +} + +/* Copies the context of the jsparser pointed to by src to the jsparser dst. + * + * The state machine definition is preserved since it is read only. + */ +void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src) +{ + + dst->buffer_start = src->buffer_start; + dst->buffer_end = src->buffer_end; + memcpy(dst->buffer, src->buffer, sizeof(src->buffer)); + + statemachine_copy(dst->statemachine, + src->statemachine, + dst->statemachine_def, + dst); + +} + +void jsparser_reset(jsparser_ctx *ctx) +{ + assert(ctx != NULL); + ctx->statemachine->current_state = 0; + ctx->buffer_start = 0; + ctx->buffer_end = 0; +} + +int jsparser_state(jsparser_ctx *ctx) +{ + return state_external(ctx->statemachine->current_state); +} + +int jsparser_parse(jsparser_ctx *ctx, const char *str, int size) +{ + int internal_state; + internal_state = statemachine_parse(ctx->statemachine, str, size); + return state_external(internal_state); +} + +void jsparser_delete(jsparser_ctx *ctx) +{ + assert(ctx != NULL); + statemachine_delete(ctx->statemachine); + statemachine_definition_delete(ctx->statemachine_def); + free(ctx); +} |