added streamhtmlparser

author: Andreas Baumann <abaumann@yahoo.com> 2012-07-14 17:16:21 +0200
committer: Andreas Baumann <abaumann@yahoo.com> 2012-07-14 17:16:21 +0200
commit: 54cce110784d33d658b5f78286a98bee244a9eeb (patch)
tree: 9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /streamhtmlparser/jsparser.c
parent: fcb682cb1955d362390665330fdf476cab7dc10b (diff)
download: crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz
crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2
1 files changed, 662 insertions, 0 deletions
diff --git a/streamhtmlparser/jsparser.c b/streamhtmlparser/jsparser.c
new file mode 100644
index 0000000..9d71c74
--- /dev/null
+++ b/streamhtmlparser/jsparser.c
@@ -0,0 +1,662 @@
+/* Copyright (c) 2007, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Filipe Almeida
+ */
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "statemachine.h"
+#include "jsparser.h"
+
+/* So we can support both C and C++ compilers, we use the CAST() macro instead
+ * of using C style casts or static_cast<>() directly.
+ */
+#ifdef __cplusplus
+  #define CAST(type, expression) (static_cast<type>(expression))
+#else
+  #define CAST(type, expression) ((type)(expression))
+#endif
+
+/* Generated state machine definition. */
+#include "jsparser_fsm.h"
+
+/* List of keywords that can precede a regular expression literal. Taken from:
+ * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
+ *
+ * 'void' was added to this list.
+ * The list is used as input to a binary search, so it must be kept in a sorted
+ * form.
+ * There are a large number of keywords in here that don't exist in
+ * ecmascript 3, either because they are reserved or because they are part of
+ * ecmascript 4. However they weren't removed in order to keep the list in sync
+ * with the previous document.
+ */
+static const char *regexp_token_prefix[] = {
+  "abstract",
+  "break",
+  "case",
+  "catch",
+  "class",
+  "const",
+  "continue",
+  "debugger",
+  "default",
+  "delete",
+  "do",
+  "else",
+  "enum",
+  "eval",
+  "export",
+  "extends",
+  "field",
+  "final",
+  "finally",
+  "for",
+  "function",
+  "goto",
+  "if",
+  "implements",
+  "import",
+  "in",
+  "instanceof",
+  "native",
+  "new",
+  "package",
+  "private",
+  "protected",
+  "public",
+  "return",
+  "static",
+  "switch",
+  "synchronized",
+  "throw",
+  "throws",
+  "transient",
+  "try",
+  "typeof",
+  "var",
+  "void",
+  "volatile",
+  "while",
+  "with"
+};
+
+/* Utility functions */
+
+/* Converts the internal state into the external superstate.
+ */
+static inline int state_external(int state)
+{
+  assert(state < JSPARSER_NUM_STATES);
+  assert(state >= 0);
+  return jsparser_states_external[state];
+}
+
+/* Returns true if the character is an ecmascript whitespace or line terminator
+ * character. Includes the characters in sections 7.2 and 7.3 of ECMAScript 3
+ * with the exception of unicode space and line terminators:
+ * http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
+ */
+static inline int js_is_whitespace(char c)
+{
+  return c == '\t' ||   /* Tab 0x09 */
+         c == '\v' ||   /* Vertical Tab 0x0B */
+         c == '\f' ||   /* Form Feed 0x0C */
+         c == ' ' ||    /* Space 0x20 */
+         c == '\xa0' || /* No-Break Space 0xA0 */
+         c == '\n' ||   /* line Feed 0x0A */
+         c == '\r';     /* Carriage Return 0x0D */
+}
+
+/* Returns true if the character is part of a javascript identifier. The rules
+ * are pretty relaxed here since we don't accept unicode and don't care if the
+ * first character doesn't contain numbers or not.
+ *
+ * For more detail on the limitations of having this relaxed set of characters
+ * please see the comments in_state_js_text().
+ */
+static inline int js_is_identifier(char c) {
+  return (c >= 'a' && c <= 'z') ||
+         (c >= 'A' && c <= 'Z') ||
+         (c >= '0' && c <= '9') ||
+         c == '_' ||
+         c == '$';
+}
+
+/* Appends a character to the ring buffer. Sequences of whitespace and newlines
+ * are folded into one.
+ *
+ * js->buffer_start points to the first character in the buffer and
+ * js->buffer_end points to the position of the next character to be written,
+ * or one plus the last character written. If the buffer is full there will be
+ * an empty slot position so we can diferentiate an empty buffer from a full
+ * buffer.
+ *
+ * If the buffer is empty, then:
+ *   js->buffer_start == js->buffer_end.
+ * If the buffer is full, then:
+ *   (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE == js->buffer_start.
+ *
+ */
+void jsparser_buffer_append_chr(jsparser_ctx *js, char chr)
+{
+  /* Fold whitespace so we have enough space in the buffer. */
+  if (js_is_whitespace(chr) &&
+      js_is_whitespace(jsparser_buffer_get(js, -1))) {
+    return;
+  }
+
+  js->buffer[js->buffer_end] = chr;
+  js->buffer_end = (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE;
+  if (js->buffer_end == js->buffer_start) {
+    js->buffer_start = (js->buffer_end + 1) %
+        JSPARSER_RING_BUFFER_SIZE;
+  }
+}
+
+/* Appends a string to the ring buffer. Sequences of whitespace and newlines
+ * are folded into one.
+ */
+void jsparser_buffer_append_str(jsparser_ctx *js, const char *str)
+{
+  assert(js != NULL);
+  assert(str != NULL);
+
+  while(*str != '\0') {
+    jsparser_buffer_append_chr(js, *str++);
+  }
+}
+
+/* Returns the position relative to the start of the buffer or -1 if past the
+ * size of the buffer..
+ */
+static inline int jsparser_buffer_absolute_pos(jsparser_ctx *js, int pos)
+{
+  int absolute_pos;
+  int buffer_len;
+  assert(pos < 0);
+
+  if(pos <= -JSPARSER_RING_BUFFER_SIZE) {
+    return -1;
+  }
+
+  buffer_len = js->buffer_end - js->buffer_start;
+  if (buffer_len < 0) {
+    buffer_len += JSPARSER_RING_BUFFER_SIZE;
+  }
+
+  if (pos < -buffer_len) {
+    return -1;
+  }
+
+  absolute_pos = (pos + js->buffer_end) % JSPARSER_RING_BUFFER_SIZE;
+  if (absolute_pos < 0) {
+    absolute_pos += JSPARSER_RING_BUFFER_SIZE;
+  }
+
+  return absolute_pos;
+}
+
+/* Returns the last appended character and removes it from the buffer. If the
+ * buffer is empty, then it returns ASCII 0 ('\0').
+ */
+char jsparser_buffer_pop(jsparser_ctx *js)
+{
+  if (js->buffer_start == js->buffer_end) {
+    return '\0';
+  }
+
+  js->buffer_end--;
+  if (js->buffer_end < 0) {
+    js->buffer_end += JSPARSER_RING_BUFFER_SIZE;
+  }
+
+  return js->buffer[js->buffer_end];
+}
+
+/* Returns the value of the character at a certain index in the buffer or an
+ * ASCII 0 ('\0') character if the index is outside the buffer boundaries.
+ *
+ * Index positions are negative, were -1 is the last character appended to the
+ * buffer. Using positive integers for the index will result in undefined
+ * behaviour.
+ */
+char jsparser_buffer_get(jsparser_ctx *js, int pos)
+{
+  int absolute_pos;
+  assert(pos < 0);
+
+  absolute_pos = jsparser_buffer_absolute_pos(js, pos);
+  if (absolute_pos < 0) {
+    return '\0';
+  }
+
+  return js->buffer[absolute_pos];
+}
+
+/* Sets the value of the character at a certain index in the buffer. Returns
+ * true if the write was successful or false if there was an attempt to write
+ * outside of the buffer boundaries.
+ *
+ * Index positions are negative, were -1 is the last character appended to the
+ * buffer. Using positive integers for the index will result in undefined
+ * behaviour.
+ */
+int jsparser_buffer_set(jsparser_ctx *js, int pos, char value)
+{
+  int absolute_pos;
+  assert(pos < 0);
+
+  absolute_pos = jsparser_buffer_absolute_pos(js, pos);
+  if (absolute_pos < 0) {
+    return 0;
+  }
+
+  js->buffer[absolute_pos] = value;
+  return 1;
+}
+
+/* Copies a slice of the buffer to the string pointed to by output. start and
+ * end are the indexes of the sliced region. If start extends beyond the
+ * beginning of the buffer, the slice will only contain character from the
+ * beginning of the buffer.
+ */
+void jsparser_buffer_slice(jsparser_ctx *js, char *output, int start, int end)
+{
+  int pos;
+
+  assert(start <= end);
+  assert(start < 0);
+  assert(end < 0);
+
+  for (pos = start; pos <= end; ++pos) {
+    char c;
+    c = jsparser_buffer_get(js, pos);
+    if (c != '\0') {
+      *output++ = jsparser_buffer_get(js, pos);
+    }
+  }
+  *output++ = '\0';
+}
+
+/* Copy the last javascript identifier or keyword found in the buffer to the
+ * string pointed by identifier.
+ *
+ * For simplicity, we consider an identifier to be a sequence of alphanumeric
+ * characters (as opposed to a digit followed by an alphanumeric character.
+ *
+ * Returns 0 if no identifier was matched, in which case the identifier
+ * argument is replaced with an empty string, or non 0 if the identifier was
+ * found.
+ */
+int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier)
+{
+  int end;
+  int pos;
+
+  assert(identifier != NULL);
+
+  end = -1;
+  /* Ignore the optional whitespace delimiter */
+  if (js_is_whitespace(jsparser_buffer_get(js, -1))) {
+    --end;
+  }
+
+  /* Find the beginning of the identifier. This loop ends either when we find a
+   * character that doesn't belong to an identifier, or when we find a '\0'
+   * character, which means we reached the end of the buffer. */
+  for(pos = end; js_is_identifier(jsparser_buffer_get(js, pos)); --pos) {
+  }
+  if (pos + 1 >= end) {
+    identifier[0] = '\0';
+    return 0;
+  }
+
+  jsparser_buffer_slice(js, identifier, pos + 1, end);
+  return 1;
+}
+
+/* Callback used in bsearch() for comparing a string against an array of
+ * strings.
+ */
+static int bsearch_strcmp(const void *a, const void *b)
+{
+  return strcmp(CAST(const char*, a), *CAST(const char * const *, b));
+}
+
+/* Returns true if the token argument can be a token prefix to a javascript
+ * regular expression.
+ *
+ * The token argument is compared against a list of identifiers that can
+ * precede a regular expression in the javascript grammar, and returns true if
+ * the argument is found on that list.
+ */
+static inline int is_regexp_token_prefix(char *token)
+{
+  assert(token != NULL);
+
+  return bsearch(token,
+                 regexp_token_prefix,
+                 sizeof(regexp_token_prefix) / sizeof(char *),
+                 sizeof(char *), bsearch_strcmp) != NULL;
+}
+
+/* Called for every character in state text.
+ *
+ * We copy every character we find when we are in state text to the ring
+ * buffer. This has the side effect of also pushing slash characters that are
+ * part of comments into the buffer, although for parsing purposes these should
+ * be treated as whitespace. This issue is addressed in
+ * enter_state_js_comment_ml_after().
+ */
+static void in_state_js_text(statemachine_ctx *ctx, int start, char chr,
+                             int end)
+{
+  jsparser_ctx *js = CAST(jsparser_ctx *, ctx->user);
+  assert(js != NULL);
+
+  (void)start;
+  (void)end;
+
+  jsparser_buffer_append_chr(js, chr);
+}
+
+/* This function is called every time we find a slash ('/') character in the
+ * javascript text (except for slashes that close comments or regexp literals).
+ *
+ * Implements the logic to figure out if this slash character is a division
+ * operator or if it opens a regular expression literal. This is heavily
+ * inspired by the syntactic resynchronization for javascript 2.0:
+ * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
+ *
+ * When we receive a '/', we look at the previous non space character to figure
+ * out if it's the ending of a punctuator that can precede a regexp literal, in
+ * which case we assume the current '/' is part of a regular expression literal
+ * (or the opening of a javascript comment, but that part is dealt with in the
+ * state machine). The exceptions to this are unary operators, so we look back
+ * a second character to rule out '++' and '--'. Although it is not
+ * straightforward to figure out if the binary operator is a postfix of the
+ * previous expression or a prefix of the regular expression, we rule out the
+ * later as it is an uncommon practice.
+ *
+ * If we ruled out the previous token to be a valid regexp preceding
+ * punctuator, we extract the last identifier in the buffer and match against a
+ * list of keywords that are known to precede expressions in the grammar. If we
+ * get a match on any of these keywords, then we are opening a regular
+ * expression, if not, then we have a division operator.
+ *
+ * Known cases that are accepted by the grammar but we handle differently,
+ * although I don't believe there is a legitimate usage for those:
+ *
+ * Division of a regular expression:
+ * var result = /test/ / 5;
+ *
+ * Prefix unary increment of a regular expression:
+ * var result = ++/test/;
+ *
+ * Division of an object literal:
+ * { a: 1 } /x/.exec('x');
+ *
+ * We only support ascii right now, so unicode characters in identifiers will
+ * be treated as delimiters, effectively breaking the identifier name where
+ * they appear, and this may cause issues in very specific situations. Namely,
+ * if we have a unicode character in an identifier directly preceding a suffix
+ * that matches one of the keywords in regexp_token_prefix[], if this
+ * identifier precedes a / (slash) character:
+ *
+ * var x = test<unicode char>return / 5;
+ *
+ * We will interpret that slash as the start of a regular expression, when in
+ * reality it is a division operator.
+ */
+static void enter_state_js_slash(statemachine_ctx *ctx, int start, char chr,
+                                 int end)
+{
+  jsparser_ctx *js;
+  char buffer[JSPARSER_RING_BUFFER_SIZE];
+  int pos;
+
+  (void)start;
+  (void)end;
+
+  assert(ctx != NULL);
+  assert(ctx->user != NULL);
+
+  js = CAST(jsparser_ctx *, ctx->user);
+  assert(js != NULL);
+
+  pos = -1;
+  /* Consume the last whitespace. */
+  if (js_is_whitespace(jsparser_buffer_get(js, pos))) {
+    --pos;
+  }
+
+  switch (jsparser_buffer_get(js, pos)) {
+    /* Ignore unary increment */
+    case '+':
+      if (jsparser_buffer_get(js, pos - 1) != '+') {
+        ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
+      }
+      break;
+
+    /* Ignore unary decrement */
+    case '-':
+      if (jsparser_buffer_get(js, pos - 1) != '-') {
+        ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
+      }
+      break;
+
+    /* List of punctuator endings except ), ], }, + and - */
+    case '=':
+    case '<':
+    case '>':
+    case '&':
+    case '|':
+    case '!':
+    case '%':
+    case '*':
+    case '/':
+    case ',':
+    case ';':
+    case '?':
+    case ':':
+    case '^':
+    case '~':
+    case '{':
+    case '(':
+    case '[':
+    case '}':
+    case '\0':
+      ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
+      break;
+
+    default:
+      if (jsparser_buffer_last_identifier(js, buffer) &&
+          is_regexp_token_prefix(buffer)) {
+        ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
+      }
+  }
+
+  jsparser_buffer_append_chr(js, chr);
+}
+
+/* Called at the end of a javascript comment.
+ *
+ * When we open a comment, the initial '/' was inserted into the ring buffer,
+ * but it is not a token and should be considered whitespace for parsing
+ * purposes.
+ *
+ * When we first saw the '/' character, we didn't yet know if it was the
+ * beginning of a comment, a division operator, or a regexp.
+ *
+ * In this function we just replace the inital '/' with a whitespace character,
+ * unless we had a preceding whitespace character, in which case we just remove
+ * the '/'. This is needed to ensure all spaces in the buffer are correctly
+ * folded.
+ */
+static void enter_state_js_comment_after(statemachine_ctx *ctx, int start,
+                                         char chr, int end)
+{
+  jsparser_ctx *js;
+
+  assert(ctx != NULL);
+  assert(ctx->user != NULL);
+
+  (void)start;
+  (void)chr;
+  (void)end;
+  
+  js = CAST(jsparser_ctx *, ctx->user);
+
+  if (js_is_whitespace(jsparser_buffer_get(js, -2))) {
+    (void)jsparser_buffer_pop(js);
+  } else {
+    jsparser_buffer_set(js, -1, ' ');
+  }
+}
+
+static statemachine_definition *create_statemachine_definition(void)
+{
+  statemachine_definition *def;
+  def = statemachine_definition_new(JSPARSER_NUM_STATES);
+  if (def == NULL)
+    return NULL;
+
+  /* TODO(falmeida): Check return value. */
+  statemachine_definition_populate(def, jsparser_state_transitions,
+                                   jsparser_states_internal_names);
+
+  statemachine_in_state(def, JSPARSER_STATE_INT_JS_TEXT,
+                        in_state_js_text);
+  statemachine_enter_state(def, JSPARSER_STATE_INT_JS_SLASH,
+                           enter_state_js_slash);
+  statemachine_enter_state(def, JSPARSER_STATE_INT_JS_COMMENT_AFTER,
+                           enter_state_js_comment_after);
+
+  return def;
+}
+
+
+/* Initializes a new jsparser instance.
+ *
+ * Returns a pointer to the new instance or NULL if the initialization
+ * fails.
+ *
+ * Initialization failure is fatal, and if this function fails it may not
+ * deallocate all previsouly allocated memory.
+ */
+
+jsparser_ctx *jsparser_new(void)
+{
+    jsparser_ctx *js;
+
+    js = CAST(jsparser_ctx *, calloc(1, sizeof(jsparser_ctx)));
+    if (js == NULL)
+      return NULL;
+
+    js->statemachine_def = create_statemachine_definition();
+    if (js->statemachine_def == NULL)
+      return NULL;
+
+    js->statemachine = statemachine_new(js->statemachine_def, js);
+    if (js->statemachine == NULL)
+      return NULL;
+
+    jsparser_reset(js);
+
+    return js;
+}
+
+/* Returns a pointer to a context which is a duplicate of the jsparser src.
+ */
+jsparser_ctx *jsparser_duplicate(jsparser_ctx *src)
+{
+  jsparser_ctx *dst;
+  assert(src != NULL);
+
+  dst = jsparser_new();
+  if (dst == NULL)
+    return NULL;
+
+  jsparser_copy(dst, src);
+
+  return dst;
+}
+
+/* Copies the context of the jsparser pointed to by src to the jsparser dst.
+ *
+ * The state machine definition is preserved since it is read only.
+ */
+void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src)
+{
+
+  dst->buffer_start = src->buffer_start;
+  dst->buffer_end = src->buffer_end;
+  memcpy(dst->buffer, src->buffer, sizeof(src->buffer));
+
+  statemachine_copy(dst->statemachine,
+                    src->statemachine,
+                    dst->statemachine_def,
+                    dst);
+
+}
+
+void jsparser_reset(jsparser_ctx *ctx)
+{
+  assert(ctx != NULL);
+  ctx->statemachine->current_state = 0;
+  ctx->buffer_start = 0;
+  ctx->buffer_end = 0;
+}
+
+int jsparser_state(jsparser_ctx *ctx)
+{
+  return state_external(ctx->statemachine->current_state);
+}
+
+int jsparser_parse(jsparser_ctx *ctx, const char *str, int size)
+{
+    int internal_state;
+    internal_state = statemachine_parse(ctx->statemachine, str, size);
+    return state_external(internal_state);
+}
+
+void jsparser_delete(jsparser_ctx *ctx)
+{
+    assert(ctx != NULL);
+    statemachine_delete(ctx->statemachine);
+    statemachine_definition_delete(ctx->statemachine_def);
+    free(ctx);
+}
author	Andreas Baumann <abaumann@yahoo.com>	2012-07-14 17:16:21 +0200
committer	Andreas Baumann <abaumann@yahoo.com>	2012-07-14 17:16:21 +0200
commit	54cce110784d33d658b5f78286a98bee244a9eeb (patch)
tree	9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /streamhtmlparser/jsparser.c
parent	fcb682cb1955d362390665330fdf476cab7dc10b (diff)
download	crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2