summaryrefslogtreecommitdiff
path: root/streamhtmlparser/jsparser.c
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-07-14 17:16:21 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-07-14 17:16:21 +0200
commit54cce110784d33d658b5f78286a98bee244a9eeb (patch)
tree9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /streamhtmlparser/jsparser.c
parentfcb682cb1955d362390665330fdf476cab7dc10b (diff)
downloadcrawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz
crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2
added streamhtmlparser
Diffstat (limited to 'streamhtmlparser/jsparser.c')
-rw-r--r--streamhtmlparser/jsparser.c662
1 files changed, 662 insertions, 0 deletions
diff --git a/streamhtmlparser/jsparser.c b/streamhtmlparser/jsparser.c
new file mode 100644
index 0000000..9d71c74
--- /dev/null
+++ b/streamhtmlparser/jsparser.c
@@ -0,0 +1,662 @@
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Filipe Almeida
+ */
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "statemachine.h"
+#include "jsparser.h"
+
+/* So we can support both C and C++ compilers, we use the CAST() macro instead
+ * of using C style casts or static_cast<>() directly.
+ */
+#ifdef __cplusplus
+ #define CAST(type, expression) (static_cast<type>(expression))
+#else
+ #define CAST(type, expression) ((type)(expression))
+#endif
+
+/* Generated state machine definition. */
+#include "jsparser_fsm.h"
+
+/* List of keywords that can precede a regular expression literal. Taken from:
+ * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
+ *
+ * 'void' was added to this list.
+ * The list is used as input to a binary search, so it must be kept in a sorted
+ * form.
+ * There are a large number of keywords in here that don't exist in
+ * ecmascript 3, either because they are reserved or because they are part of
+ * ecmascript 4. However they weren't removed in order to keep the list in sync
+ * with the previous document.
+ */
+static const char *regexp_token_prefix[] = {
+ "abstract",
+ "break",
+ "case",
+ "catch",
+ "class",
+ "const",
+ "continue",
+ "debugger",
+ "default",
+ "delete",
+ "do",
+ "else",
+ "enum",
+ "eval",
+ "export",
+ "extends",
+ "field",
+ "final",
+ "finally",
+ "for",
+ "function",
+ "goto",
+ "if",
+ "implements",
+ "import",
+ "in",
+ "instanceof",
+ "native",
+ "new",
+ "package",
+ "private",
+ "protected",
+ "public",
+ "return",
+ "static",
+ "switch",
+ "synchronized",
+ "throw",
+ "throws",
+ "transient",
+ "try",
+ "typeof",
+ "var",
+ "void",
+ "volatile",
+ "while",
+ "with"
+};
+
+/* Utility functions */
+
+/* Converts the internal state into the external superstate.
+ */
+static inline int state_external(int state)
+{
+ assert(state < JSPARSER_NUM_STATES);
+ assert(state >= 0);
+ return jsparser_states_external[state];
+}
+
+/* Returns true if the character is an ecmascript whitespace or line terminator
+ * character. Includes the characters in sections 7.2 and 7.3 of ECMAScript 3
+ * with the exception of unicode space and line terminators:
+ * http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
+ */
+static inline int js_is_whitespace(char c)
+{
+ return c == '\t' || /* Tab 0x09 */
+ c == '\v' || /* Vertical Tab 0x0B */
+ c == '\f' || /* Form Feed 0x0C */
+ c == ' ' || /* Space 0x20 */
+ c == '\xa0' || /* No-Break Space 0xA0 */
+ c == '\n' || /* line Feed 0x0A */
+ c == '\r'; /* Carriage Return 0x0D */
+}
+
+/* Returns true if the character is part of a javascript identifier. The rules
+ * are pretty relaxed here since we don't accept unicode and don't care if the
+ * first character doesn't contain numbers or not.
+ *
+ * For more detail on the limitations of having this relaxed set of characters
+ * please see the comments in_state_js_text().
+ */
+static inline int js_is_identifier(char c) {
+ return (c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= '0' && c <= '9') ||
+ c == '_' ||
+ c == '$';
+}
+
+/* Appends a character to the ring buffer. Sequences of whitespace and newlines
+ * are folded into one.
+ *
+ * js->buffer_start points to the first character in the buffer and
+ * js->buffer_end points to the position of the next character to be written,
+ * or one plus the last character written. If the buffer is full there will be
+ * an empty slot position so we can diferentiate an empty buffer from a full
+ * buffer.
+ *
+ * If the buffer is empty, then:
+ * js->buffer_start == js->buffer_end.
+ * If the buffer is full, then:
+ * (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE == js->buffer_start.
+ *
+ */
+void jsparser_buffer_append_chr(jsparser_ctx *js, char chr)
+{
+ /* Fold whitespace so we have enough space in the buffer. */
+ if (js_is_whitespace(chr) &&
+ js_is_whitespace(jsparser_buffer_get(js, -1))) {
+ return;
+ }
+
+ js->buffer[js->buffer_end] = chr;
+ js->buffer_end = (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE;
+ if (js->buffer_end == js->buffer_start) {
+ js->buffer_start = (js->buffer_end + 1) %
+ JSPARSER_RING_BUFFER_SIZE;
+ }
+}
+
+/* Appends a string to the ring buffer. Sequences of whitespace and newlines
+ * are folded into one.
+ */
+void jsparser_buffer_append_str(jsparser_ctx *js, const char *str)
+{
+ assert(js != NULL);
+ assert(str != NULL);
+
+ while(*str != '\0') {
+ jsparser_buffer_append_chr(js, *str++);
+ }
+}
+
+/* Returns the position relative to the start of the buffer or -1 if past the
+ * size of the buffer..
+ */
+static inline int jsparser_buffer_absolute_pos(jsparser_ctx *js, int pos)
+{
+ int absolute_pos;
+ int buffer_len;
+ assert(pos < 0);
+
+ if(pos <= -JSPARSER_RING_BUFFER_SIZE) {
+ return -1;
+ }
+
+ buffer_len = js->buffer_end - js->buffer_start;
+ if (buffer_len < 0) {
+ buffer_len += JSPARSER_RING_BUFFER_SIZE;
+ }
+
+ if (pos < -buffer_len) {
+ return -1;
+ }
+
+ absolute_pos = (pos + js->buffer_end) % JSPARSER_RING_BUFFER_SIZE;
+ if (absolute_pos < 0) {
+ absolute_pos += JSPARSER_RING_BUFFER_SIZE;
+ }
+
+ return absolute_pos;
+}
+
+/* Returns the last appended character and removes it from the buffer. If the
+ * buffer is empty, then it returns ASCII 0 ('\0').
+ */
+char jsparser_buffer_pop(jsparser_ctx *js)
+{
+ if (js->buffer_start == js->buffer_end) {
+ return '\0';
+ }
+
+ js->buffer_end--;
+ if (js->buffer_end < 0) {
+ js->buffer_end += JSPARSER_RING_BUFFER_SIZE;
+ }
+
+ return js->buffer[js->buffer_end];
+}
+
+/* Returns the value of the character at a certain index in the buffer or an
+ * ASCII 0 ('\0') character if the index is outside the buffer boundaries.
+ *
+ * Index positions are negative, were -1 is the last character appended to the
+ * buffer. Using positive integers for the index will result in undefined
+ * behaviour.
+ */
+char jsparser_buffer_get(jsparser_ctx *js, int pos)
+{
+ int absolute_pos;
+ assert(pos < 0);
+
+ absolute_pos = jsparser_buffer_absolute_pos(js, pos);
+ if (absolute_pos < 0) {
+ return '\0';
+ }
+
+ return js->buffer[absolute_pos];
+}
+
+/* Sets the value of the character at a certain index in the buffer. Returns
+ * true if the write was successful or false if there was an attempt to write
+ * outside of the buffer boundaries.
+ *
+ * Index positions are negative, were -1 is the last character appended to the
+ * buffer. Using positive integers for the index will result in undefined
+ * behaviour.
+ */
+int jsparser_buffer_set(jsparser_ctx *js, int pos, char value)
+{
+ int absolute_pos;
+ assert(pos < 0);
+
+ absolute_pos = jsparser_buffer_absolute_pos(js, pos);
+ if (absolute_pos < 0) {
+ return 0;
+ }
+
+ js->buffer[absolute_pos] = value;
+ return 1;
+}
+
+/* Copies a slice of the buffer to the string pointed to by output. start and
+ * end are the indexes of the sliced region. If start extends beyond the
+ * beginning of the buffer, the slice will only contain character from the
+ * beginning of the buffer.
+ */
+void jsparser_buffer_slice(jsparser_ctx *js, char *output, int start, int end)
+{
+ int pos;
+
+ assert(start <= end);
+ assert(start < 0);
+ assert(end < 0);
+
+ for (pos = start; pos <= end; ++pos) {
+ char c;
+ c = jsparser_buffer_get(js, pos);
+ if (c != '\0') {
+ *output++ = jsparser_buffer_get(js, pos);
+ }
+ }
+ *output++ = '\0';
+}
+
+/* Copy the last javascript identifier or keyword found in the buffer to the
+ * string pointed by identifier.
+ *
+ * For simplicity, we consider an identifier to be a sequence of alphanumeric
+ * characters (as opposed to a digit followed by an alphanumeric character.
+ *
+ * Returns 0 if no identifier was matched, in which case the identifier
+ * argument is replaced with an empty string, or non 0 if the identifier was
+ * found.
+ */
+int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier)
+{
+ int end;
+ int pos;
+
+ assert(identifier != NULL);
+
+ end = -1;
+ /* Ignore the optional whitespace delimiter */
+ if (js_is_whitespace(jsparser_buffer_get(js, -1))) {
+ --end;
+ }
+
+ /* Find the beginning of the identifier. This loop ends either when we find a
+ * character that doesn't belong to an identifier, or when we find a '\0'
+ * character, which means we reached the end of the buffer. */
+ for(pos = end; js_is_identifier(jsparser_buffer_get(js, pos)); --pos) {
+ }
+ if (pos + 1 >= end) {
+ identifier[0] = '\0';
+ return 0;
+ }
+
+ jsparser_buffer_slice(js, identifier, pos + 1, end);
+ return 1;
+}
+
+/* Callback used in bsearch() for comparing a string against an array of
+ * strings.
+ */
+static int bsearch_strcmp(const void *a, const void *b)
+{
+ return strcmp(CAST(const char*, a), *CAST(const char * const *, b));
+}
+
+/* Returns true if the token argument can be a token prefix to a javascript
+ * regular expression.
+ *
+ * The token argument is compared against a list of identifiers that can
+ * precede a regular expression in the javascript grammar, and returns true if
+ * the argument is found on that list.
+ */
+static inline int is_regexp_token_prefix(char *token)
+{
+ assert(token != NULL);
+
+ return bsearch(token,
+ regexp_token_prefix,
+ sizeof(regexp_token_prefix) / sizeof(char *),
+ sizeof(char *), bsearch_strcmp) != NULL;
+}
+
+/* Called for every character in state text.
+ *
+ * We copy every character we find when we are in state text to the ring
+ * buffer. This has the side effect of also pushing slash characters that are
+ * part of comments into the buffer, although for parsing purposes these should
+ * be treated as whitespace. This issue is addressed in
+ * enter_state_js_comment_ml_after().
+ */
+static void in_state_js_text(statemachine_ctx *ctx, int start, char chr,
+ int end)
+{
+ jsparser_ctx *js = CAST(jsparser_ctx *, ctx->user);
+ assert(js != NULL);
+
+ (void)start;
+ (void)end;
+
+ jsparser_buffer_append_chr(js, chr);
+}
+
+/* This function is called every time we find a slash ('/') character in the
+ * javascript text (except for slashes that close comments or regexp literals).
+ *
+ * Implements the logic to figure out if this slash character is a division
+ * operator or if it opens a regular expression literal. This is heavily
+ * inspired by the syntactic resynchronization for javascript 2.0:
+ * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
+ *
+ * When we receive a '/', we look at the previous non space character to figure
+ * out if it's the ending of a punctuator that can precede a regexp literal, in
+ * which case we assume the current '/' is part of a regular expression literal
+ * (or the opening of a javascript comment, but that part is dealt with in the
+ * state machine). The exceptions to this are unary operators, so we look back
+ * a second character to rule out '++' and '--'. Although it is not
+ * straightforward to figure out if the binary operator is a postfix of the
+ * previous expression or a prefix of the regular expression, we rule out the
+ * later as it is an uncommon practice.
+ *
+ * If we ruled out the previous token to be a valid regexp preceding
+ * punctuator, we extract the last identifier in the buffer and match against a
+ * list of keywords that are known to precede expressions in the grammar. If we
+ * get a match on any of these keywords, then we are opening a regular
+ * expression, if not, then we have a division operator.
+ *
+ * Known cases that are accepted by the grammar but we handle differently,
+ * although I don't believe there is a legitimate usage for those:
+ *
+ * Division of a regular expression:
+ * var result = /test/ / 5;
+ *
+ * Prefix unary increment of a regular expression:
+ * var result = ++/test/;
+ *
+ * Division of an object literal:
+ * { a: 1 } /x/.exec('x');
+ *
+ * We only support ascii right now, so unicode characters in identifiers will
+ * be treated as delimiters, effectively breaking the identifier name where
+ * they appear, and this may cause issues in very specific situations. Namely,
+ * if we have a unicode character in an identifier directly preceding a suffix
+ * that matches one of the keywords in regexp_token_prefix[], if this
+ * identifier precedes a / (slash) character:
+ *
+ * var x = test<unicode char>return / 5;
+ *
+ * We will interpret that slash as the start of a regular expression, when in
+ * reality it is a division operator.
+ */
+static void enter_state_js_slash(statemachine_ctx *ctx, int start, char chr,
+ int end)
+{
+ jsparser_ctx *js;
+ char buffer[JSPARSER_RING_BUFFER_SIZE];
+ int pos;
+
+ (void)start;
+ (void)end;
+
+ assert(ctx != NULL);
+ assert(ctx->user != NULL);
+
+ js = CAST(jsparser_ctx *, ctx->user);
+ assert(js != NULL);
+
+ pos = -1;
+ /* Consume the last whitespace. */
+ if (js_is_whitespace(jsparser_buffer_get(js, pos))) {
+ --pos;
+ }
+
+ switch (jsparser_buffer_get(js, pos)) {
+ /* Ignore unary increment */
+ case '+':
+ if (jsparser_buffer_get(js, pos - 1) != '+') {
+ ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
+ }
+ break;
+
+ /* Ignore unary decrement */
+ case '-':
+ if (jsparser_buffer_get(js, pos - 1) != '-') {
+ ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
+ }
+ break;
+
+ /* List of punctuator endings except ), ], }, + and - */
+ case '=':
+ case '<':
+ case '>':
+ case '&':
+ case '|':
+ case '!':
+ case '%':
+ case '*':
+ case '/':
+ case ',':
+ case ';':
+ case '?':
+ case ':':
+ case '^':
+ case '~':
+ case '{':
+ case '(':
+ case '[':
+ case '}':
+ case '\0':
+ ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
+ break;
+
+ default:
+ if (jsparser_buffer_last_identifier(js, buffer) &&
+ is_regexp_token_prefix(buffer)) {
+ ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
+ }
+ }
+
+ jsparser_buffer_append_chr(js, chr);
+}
+
+/* Called at the end of a javascript comment.
+ *
+ * When we open a comment, the initial '/' was inserted into the ring buffer,
+ * but it is not a token and should be considered whitespace for parsing
+ * purposes.
+ *
+ * When we first saw the '/' character, we didn't yet know if it was the
+ * beginning of a comment, a division operator, or a regexp.
+ *
+ * In this function we just replace the inital '/' with a whitespace character,
+ * unless we had a preceding whitespace character, in which case we just remove
+ * the '/'. This is needed to ensure all spaces in the buffer are correctly
+ * folded.
+ */
+static void enter_state_js_comment_after(statemachine_ctx *ctx, int start,
+ char chr, int end)
+{
+ jsparser_ctx *js;
+
+ assert(ctx != NULL);
+ assert(ctx->user != NULL);
+
+ (void)start;
+ (void)chr;
+ (void)end;
+
+ js = CAST(jsparser_ctx *, ctx->user);
+
+ if (js_is_whitespace(jsparser_buffer_get(js, -2))) {
+ (void)jsparser_buffer_pop(js);
+ } else {
+ jsparser_buffer_set(js, -1, ' ');
+ }
+}
+
+static statemachine_definition *create_statemachine_definition(void)
+{
+ statemachine_definition *def;
+ def = statemachine_definition_new(JSPARSER_NUM_STATES);
+ if (def == NULL)
+ return NULL;
+
+ /* TODO(falmeida): Check return value. */
+ statemachine_definition_populate(def, jsparser_state_transitions,
+ jsparser_states_internal_names);
+
+ statemachine_in_state(def, JSPARSER_STATE_INT_JS_TEXT,
+ in_state_js_text);
+ statemachine_enter_state(def, JSPARSER_STATE_INT_JS_SLASH,
+ enter_state_js_slash);
+ statemachine_enter_state(def, JSPARSER_STATE_INT_JS_COMMENT_AFTER,
+ enter_state_js_comment_after);
+
+ return def;
+}
+
+
+/* Initializes a new jsparser instance.
+ *
+ * Returns a pointer to the new instance or NULL if the initialization
+ * fails.
+ *
+ * Initialization failure is fatal, and if this function fails it may not
+ * deallocate all previsouly allocated memory.
+ */
+
+jsparser_ctx *jsparser_new(void)
+{
+ jsparser_ctx *js;
+
+ js = CAST(jsparser_ctx *, calloc(1, sizeof(jsparser_ctx)));
+ if (js == NULL)
+ return NULL;
+
+ js->statemachine_def = create_statemachine_definition();
+ if (js->statemachine_def == NULL)
+ return NULL;
+
+ js->statemachine = statemachine_new(js->statemachine_def, js);
+ if (js->statemachine == NULL)
+ return NULL;
+
+ jsparser_reset(js);
+
+ return js;
+}
+
+/* Returns a pointer to a context which is a duplicate of the jsparser src.
+ */
+jsparser_ctx *jsparser_duplicate(jsparser_ctx *src)
+{
+ jsparser_ctx *dst;
+ assert(src != NULL);
+
+ dst = jsparser_new();
+ if (dst == NULL)
+ return NULL;
+
+ jsparser_copy(dst, src);
+
+ return dst;
+}
+
+/* Copies the context of the jsparser pointed to by src to the jsparser dst.
+ *
+ * The state machine definition is preserved since it is read only.
+ */
+void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src)
+{
+
+ dst->buffer_start = src->buffer_start;
+ dst->buffer_end = src->buffer_end;
+ memcpy(dst->buffer, src->buffer, sizeof(src->buffer));
+
+ statemachine_copy(dst->statemachine,
+ src->statemachine,
+ dst->statemachine_def,
+ dst);
+
+}
+
+void jsparser_reset(jsparser_ctx *ctx)
+{
+ assert(ctx != NULL);
+ ctx->statemachine->current_state = 0;
+ ctx->buffer_start = 0;
+ ctx->buffer_end = 0;
+}
+
+int jsparser_state(jsparser_ctx *ctx)
+{
+ return state_external(ctx->statemachine->current_state);
+}
+
+int jsparser_parse(jsparser_ctx *ctx, const char *str, int size)
+{
+ int internal_state;
+ internal_state = statemachine_parse(ctx->statemachine, str, size);
+ return state_external(internal_state);
+}
+
+void jsparser_delete(jsparser_ctx *ctx)
+{
+ assert(ctx != NULL);
+ statemachine_delete(ctx->statemachine);
+ statemachine_definition_delete(ctx->statemachine_def);
+ free(ctx);
+}