added streamhtmlparser

author: Andreas Baumann <abaumann@yahoo.com> 2012-07-14 17:16:21 +0200
committer: Andreas Baumann <abaumann@yahoo.com> 2012-07-14 17:16:21 +0200
commit: 54cce110784d33d658b5f78286a98bee244a9eeb (patch)
tree: 9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /streamhtmlparser/htmlparser_fsm.config
parent: fcb682cb1955d362390665330fdf476cab7dc10b (diff)
download: crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz
crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2
1 files changed, 336 insertions, 0 deletions
diff --git a/streamhtmlparser/htmlparser_fsm.config b/streamhtmlparser/htmlparser_fsm.config
new file mode 100644
index 0000000..e80d055
--- /dev/null
+++ b/streamhtmlparser/htmlparser_fsm.config
@@ -0,0 +1,336 @@
+# Copyright 2008 Google Inc. All Rights Reserved.
+# Author: falmeida@google.com (Filipe Almeida)
+
+# TODO(falmeida): Add more descriptive names to the states and drop the
+# abbreviations.
+# TODO(falmeida): Reorder the states so that it's easier to read.
+# TODO(falmeida): Support CDATA blocks in the form: <![CDATA[.
+
+name = 'htmlparser'
+
+comment = 'Definition of a finite state machine for a subset of HTML 4.01'
+
+condition('<', '<')
+condition('>', '>')
+condition('=', '=')
+
+# TODO(falmeida): This is not the correct expression. Tag and attribute names
+# can only consist of alpha characters.
+condition('id', 'A-Za-z0-9_:-')
+condition('idtag', 'A-Za-z0-9/_:-')
+
+# Whitespace according to: http://www.w3.org/TR/html401/struct/text.html#h-9.1
+condition('space', ' \t\n\r')
+condition('!', '!')
+condition('q', '\'')
+condition('dq', '\"')
+condition('/', '/')
+condition('*', '*')
+condition('-', '-')
+condition('?', '?')
+condition('lf', '\n')
+condition('quote', '\\')
+
+# TODO(falmeida): This default rule is a hack and shouldn't be here.
+condition('default', '[:default:]')
+
+state(name = 'text',
+      external = 'text',
+      transitions = [
+        ['<', 'tag_start'],
+        ['default', 'text']
+      ])
+
+# When we found the < character in text.
+# Tag opening is defined in the HTML5 draft here:
+# http://www.whatwg.org/specs/web-apps/current-work/#tag-open-state
+# We don't exactly follow this and are much looser in order to mimic the way
+# the major browsers behave.
+state(name = 'tag_start',
+      external = 'tag',
+      transitions = [
+        ['idtag', 'tag_name'],
+        ['?', 'pi'],
+        ['!', 'declaration_start'],
+        ['<', 'tag_start'],
+        ['default', 'text']
+      ])
+
+# Name of the tag. Includes the closing tag character '/'.
+state(name = 'tag_name',
+      external = 'tag',
+      transitions = [
+        ['idtag', 'tag_name'],
+        ['space', 'tag_space'],
+        ['>', 'tag_close']
+      ])
+
+# HTML declaration and comment parsing
+#
+# We don't expose declaration state because at this point we only want to
+# ensure that we are parsing them correctly so we don't get out of sync.
+# This is specifically made for DOCTYPE declarations and won't work if DTDs
+# are defined inside the declaration.
+# The HTML5 spec says we should specifically look for the string '<!DOCTYPE HTML'
+# but that will add a lot of unnecessary states, and unless we build a simple
+# declarative way to unfold a string match into multiple states, I don't
+# think it's worth worrying about for now.
+
+# Got '<!'. The next character will decide if we open a declaration or a
+# comment.
+state(name = 'declaration_start',
+      external = 'text',
+      transitions = [
+        ['-', 'comment_open'],
+        ['>', 'text'],
+        ['default', 'declaration_body']
+      ])
+
+# Inside a declaration, e.g. <!DOCTYPE. We close when we see a '>'.
+state(name = 'declaration_body',
+      external = 'text',
+      transitions = [
+        ['>', 'text'],
+        ['default', 'declaration_body']
+      ])
+
+# Got '<!-'.
+state(name = 'comment_open',
+      external = 'text',
+      transitions = [
+        ['-', 'comment_body'],
+        ['default', 'text']
+      ])
+
+# Inside a comment. We only close when we see '-->'
+state(name = 'comment_body',
+      external = 'comment',
+      transitions = [
+        ['-', 'comment_dash'],
+        ['default', 'comment_body']
+      ])
+
+# Got '-' inside a comment.
+state(name = 'comment_dash',
+      external = 'comment',
+      transitions = [
+        ['-', 'comment_dash_dash'],
+        ['default', 'comment_body']
+      ])
+
+# Got '--' inside a comment.
+state(name = 'comment_dash_dash',
+      external = 'comment',
+      transitions = [
+        ['-', 'comment_dash_dash'],
+        ['>', 'text'],
+        ['default', 'comment_body']
+      ])
+
+# XML Processing instruction parsing according to:
+# http://www.w3.org/TR/REC-xml/#sec-pi
+#
+# Everything between the characters <? and ?> is considered to be part of the
+# processing instruction.
+state(name = 'pi',
+      external = 'text',
+      transitions = [
+        ['?', 'pi_may_end'],
+        ['default', 'pi']
+      ])
+
+state(name = 'pi_may_end',
+      external = 'text',
+      transitions = [
+        ['>', 'text'],
+        ['default', 'pi']
+      ])
+
+# Whitespace between the tag name and attributes.
+state(name = 'tag_space',
+      external = 'tag',
+      transitions = [
+        ['>', 'tag_close'],
+        ['space', 'tag_space'],
+        ['id', 'attr'],
+        ['/', 'tag_space']
+      ])
+
+state(name = 'tag_close',
+      external = 'text',
+      transitions = [
+        ['<', 'tag_start'],
+        ['default', 'text']
+      ])
+
+# Name of the attribute.
+state(name = 'attr',
+      external = 'attr',
+      transitions = [
+        ['id', 'attr'],
+        ['>', 'tag_close'],
+        ['/', 'tag_space'],
+        ['=', 'value'],
+        ['space', 'attr_space']
+      ])
+
+# After the attribute name.
+state(name = 'attr_space',
+      external = 'attr',
+      transitions = [
+        ['>', 'tag_close'],
+        ['space', 'attr_space'],
+        ['id', 'attr'],
+        ['/', 'tag_space'],
+        ['=', 'value']
+      ])
+
+# Expecting a value, after attribute=
+state(name = 'value',
+      external = 'value',
+      transitions = [
+        ['q', 'value_q_start'],
+        ['dq', 'value_dq_start'],
+        ['space', 'value'],
+        ['>', 'tag_close'],
+        ['default', 'value_text']
+      ])
+
+# Unquoted attribute value.
+state(name = 'value_text',
+      external = 'value',
+      transitions = [
+        ['>', 'tag_close'],
+        ['space', 'tag_space'],
+        ['default', 'value_text']
+      ])
+
+# First character of a single quoted attribute value.
+state(name = 'value_q_start',
+      external = 'value',
+      transitions = [
+        ['q', 'tag_space'],
+        ['default', 'value_q']
+      ])
+
+# In the middle of a single quoted attribute value.
+state(name = 'value_q',
+      external = 'value',
+      transitions = [
+        ['q', 'tag_space'],
+        ['default', 'value_q']
+      ])
+
+# First character of a double quoted attribute value.
+state(name = 'value_dq_start',
+      external = 'value',
+      transitions = [
+        ['dq', 'tag_space'],
+        ['default', 'value_dq']
+      ])
+
+# In the middle of a double quoted attribute value.
+state(name = 'value_dq',
+      external = 'value',
+      transitions = [
+        ['dq', 'tag_space'],
+        ['default', 'value_dq']
+      ])
+
+# CDATA escaping text spans.
+# TODO(falmeida): These states should go after cdata_text.
+
+# Got '<!'
+state(name = 'cdata_comment_start',
+      external = 'text',
+      transitions = [
+        ['-', 'cdata_comment_start_dash'],
+        ['default', 'cdata_text'],
+      ])
+
+# Got '<!-'.
+state(name = 'cdata_comment_start_dash',
+      external = 'text',
+      transitions = [
+        ['-', 'cdata_comment_body'],
+        ['default', 'cdata_text']
+      ])
+
+# Inside a comment
+state(name = 'cdata_comment_body',
+      external = 'text',
+      transitions = [
+        ['-', 'cdata_comment_dash'],
+        ['default', 'cdata_comment_body']
+      ])
+
+# Got '-' inside a comment.
+state(name = 'cdata_comment_dash',
+      external = 'text',
+      transitions = [
+        ['-', 'cdata_comment_dash_dash'],
+        ['default', 'cdata_comment_body']
+      ])
+
+# Got '--' inside a comment.
+state(name = 'cdata_comment_dash_dash',
+      external = 'text',
+      transitions = [
+        ['-', 'cdata_comment_dash_dash'],
+        ['>', 'cdata_text'],
+        ['default', 'cdata_comment_body']
+      ])
+
+# CDATA processing
+#
+# To simplify the code, we treat RCDATA and CDATA sections the same since the
+# differences between them don't affect the context we are in.
+state(name = 'cdata_text',
+      external = 'text',
+      transitions = [
+        ['<', 'cdata_lt'],
+        ['default', 'cdata_text']
+      ])
+
+# Possible beginning of the closing tag.
+state(name = 'cdata_lt',
+      external = 'text',
+      transitions = [
+        ['/', 'cdata_may_close'],
+        ['!', 'cdata_comment_start'],
+        ['default', 'cdata_text']
+      ])
+
+# If we encounter </tag where tag matches the last opened tag, we exit the
+# CDATA section. Part of this logic is handled in the code.
+state(name = 'cdata_may_close',
+      external = 'text',
+      transitions = [
+        ['idtag', 'cdata_may_close'],
+        ['>', 'text'],
+        ['space', 'tag_space'],
+        ['default', 'cdata_text']
+      ])
+
+# The next states are used for specialized parser modes.
+state(name = 'js_file',
+      external = 'js_file',
+      transitions = [
+        ['default', 'js_file']
+      ])
+
+# TODO(falmeida): Having css_file and js_file as the external name doesn't make
+#                 sense. This should instead be text and the js/css state be
+#                 returned by in_js() and in_css().
+state(name = 'css_file',
+      external = 'css_file',
+      transitions = [
+        ['default', 'css_file']
+      ])
+
+state(name = 'null',
+      external = 'text',
+      transitions = [
+        ['default', 'null']
+      ])
+
author	Andreas Baumann <abaumann@yahoo.com>	2012-07-14 17:16:21 +0200
committer	Andreas Baumann <abaumann@yahoo.com>	2012-07-14 17:16:21 +0200
commit	54cce110784d33d658b5f78286a98bee244a9eeb (patch)
tree	9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /streamhtmlparser/htmlparser_fsm.config
parent	fcb682cb1955d362390665330fdf476cab7dc10b (diff)
download	crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2