summaryrefslogtreecommitdiff
path: root/streamhtmlparser/htmlparser_fsm.config
diff options
context:
space:
mode:
authorAndreas Baumann <abaumann@yahoo.com>2012-07-14 17:16:21 +0200
committerAndreas Baumann <abaumann@yahoo.com>2012-07-14 17:16:21 +0200
commit54cce110784d33d658b5f78286a98bee244a9eeb (patch)
tree9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /streamhtmlparser/htmlparser_fsm.config
parentfcb682cb1955d362390665330fdf476cab7dc10b (diff)
downloadcrawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz
crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2
added streamhtmlparser
Diffstat (limited to 'streamhtmlparser/htmlparser_fsm.config')
-rw-r--r--streamhtmlparser/htmlparser_fsm.config336
1 files changed, 336 insertions, 0 deletions
diff --git a/streamhtmlparser/htmlparser_fsm.config b/streamhtmlparser/htmlparser_fsm.config
new file mode 100644
index 0000000..e80d055
--- /dev/null
+++ b/streamhtmlparser/htmlparser_fsm.config
@@ -0,0 +1,336 @@
+# Copyright 2008 Google Inc. All Rights Reserved.
+# Author: falmeida@google.com (Filipe Almeida)
+
+# TODO(falmeida): Add more descriptive names to the states and drop the
+# abbreviations.
+# TODO(falmeida): Reorder the states so that it's easier to read.
+# TODO(falmeida): Support CDATA blocks in the form: <![CDATA[.
+
+name = 'htmlparser'
+
+comment = 'Definition of a finite state machine for a subset of HTTP 4.1'
+
+condition('<', '<')
+condition('>', '>')
+condition('=', '=')
+
+# TODO(falmeida): This is not the correct expression. tag and attribute names
+# can only consist of alpha character.
+condition('id', 'A-Za-z0-9_:-')
+condition('idtag', 'A-Za-z0-9/_:-')
+
+# Whitespace according to: http://www.w3.org/TR/html401/struct/text.html#h-9.1
+condition('space', ' \t\n\r')
+condition('!', '!')
+condition('q', '\'')
+condition('dq', '\"')
+condition('/', '/')
+condition('*', '*')
+condition('-', '-')
+condition('?', '?')
+condition('lf', '\n')
+condition('quote', '\\')
+
+# TODO(falmeida): This default rule is a hack and shouldn't be here.
+condition('default', '[:default:]')
+
+state(name = 'text',
+ external = 'text',
+ transitions = [
+ ['<', 'tag_start'],
+ ['default', 'text']
+ ])
+
+# When we found the < character in text.
+# Tag opening is defined in the HTML5 draft here:
+# http://www.whatwg.org/specs/web-apps/current-work/#tag-open-state
+# We don't exactly follow this and are much more loose in order to mimic the way
+# the major browsers behave.
+state(name = 'tag_start',
+ external = 'tag',
+ transitions = [
+ ['idtag', 'tag_name'],
+ ['?', 'pi'],
+ ['!', 'declaration_start'],
+ ['<', 'tag_start'],
+ ['default', 'text']
+ ])
+
+# Name of the tag. Includes the closing tag character '/'.
+state(name = 'tag_name',
+ external = 'tag',
+ transitions = [
+ ['idtag', 'tag_name'],
+ ['space', 'tag_space'],
+ ['>', 'tag_close']
+ ])
+
+# HTML declaration and comment parsing
+#
+# We don't expose declaration state because at this point we only want to
+# ensure that we are parsing them correctly so we don't get out of sync.
+# This is specifically made for DOCTYPE declarations and won't work if DTD's
+# are defined inside the declaration.
+# The HTML5 spec says we should specificly look for the string '<!DOCTYPE HTML'
+# but that will add a lot of unecessary states, and unless we build a simple
+# declarative way to unfold a string match into multiple states, I don't
+# think it's worth worrying about for now.
+
+# Got '<!'. The next character will decide if we open a declaration or a
+# comment.
+state(name = 'declaration_start',
+ external = 'text',
+ transitions = [
+ ['-', 'comment_open'],
+ ['>', 'text'],
+ ['default', 'declaration_body']
+ ])
+
+# Inside a declaration. Ie: <!DOCTYPE. We close when we see a '>'
+state(name = 'declaration_body',
+ external = 'text',
+ transitions = [
+ ['>', 'text'],
+ ['default', 'declaration_body']
+ ])
+
+# Got '<!-'.
+state(name = 'comment_open',
+ external = 'text',
+ transitions = [
+ ['-', 'comment_body'],
+ ['default', 'text']
+ ])
+
+# Inside a comment. We only close when we see '-->'
+state(name = 'comment_body',
+ external = 'comment',
+ transitions = [
+ ['-', 'comment_dash'],
+ ['default', 'comment_body']
+ ])
+
+# Got '-' inside a comment.
+state(name = 'comment_dash',
+ external = 'comment',
+ transitions = [
+ ['-', 'comment_dash_dash'],
+ ['default', 'comment_body']
+ ])
+
+# Got '--' inside a comment.
+state(name = 'comment_dash_dash',
+ external = 'comment',
+ transitions = [
+ ['-', 'comment_dash_dash'],
+ ['>', 'text'],
+ ['default', 'comment_body']
+ ])
+
+# XML Processing instruction parsing according to:
+# http://www.w3.org/TR/REC-xml/#sec-pi
+#
+# Everything between the characters <? and ?> is considered to be part of the
+# processing instruction.
+state(name = 'pi',
+ external = 'text',
+ transitions = [
+ ['?', 'pi_may_end'],
+ ['default', 'pi']
+ ])
+
+state(name = 'pi_may_end',
+ external = 'text',
+ transitions = [
+ ['>', 'text'],
+ ['default', 'pi']
+ ])
+
+# Whitespace between tag name, attributes.
+state(name = 'tag_space',
+ external = 'tag',
+ transitions = [
+ ['>', 'tag_close'],
+ ['space', 'tag_space'],
+ ['id', 'attr'],
+ ['/', 'tag_space']
+ ])
+
+state(name = 'tag_close',
+ external = 'text',
+ transitions = [
+ ['<', 'tag_start'],
+ ['default', 'text']
+ ])
+
+# Name of the attribute.
+state(name = 'attr',
+ external = 'attr',
+ transitions = [
+ ['id', 'attr'],
+ ['>', 'tag_close'],
+ ['/', 'tag_space'],
+ ['=', 'value'],
+ ['space', 'attr_space']
+ ])
+
+# After the attribute name.
+state(name = 'attr_space',
+ external = 'attr',
+ transitions = [
+ ['>', 'tag_close'],
+ ['space', 'attr_space'],
+ ['id', 'attr'],
+ ['/', 'tag_space'],
+ ['=', 'value']
+ ])
+
+# Expecting a value, after attribute=
+state(name = 'value',
+ external = 'value',
+ transitions = [
+ ['q', 'value_q_start'],
+ ['dq', 'value_dq_start'],
+ ['space', 'value'],
+ ['>', 'tag_close'],
+ ['default', 'value_text']
+ ])
+
+# Unquoted attribute value.
+state(name = 'value_text',
+ external = 'value',
+ transitions = [
+ ['>', 'tag_close'],
+ ['space', 'tag_space'],
+ ['default', 'value_text']
+ ])
+
+# First character of a single quoted attribute value.
+state(name = 'value_q_start',
+ external = 'value',
+ transitions = [
+ ['q', 'tag_space'],
+ ['default', 'value_q']
+ ])
+
+# In the middle of a single quoted attribute value.
+state(name = 'value_q',
+ external = 'value',
+ transitions = [
+ ['q', 'tag_space'],
+ ['default', 'value_q']
+ ])
+
+# First character of a double quoted attribute value.
+state(name = 'value_dq_start',
+ external = 'value',
+ transitions = [
+ ['dq', 'tag_space'],
+ ['default', 'value_dq']
+ ])
+
+# In the middle of a double quoted attribute value.
+state(name = 'value_dq',
+ external = 'value',
+ transitions = [
+ ['dq', 'tag_space'],
+ ['default', 'value_dq']
+ ])
+
+# CDATA escaping text spans.
+# TODO(falmeida): These states should go after cdata_text.
+
+# Got '<!'
+state(name = 'cdata_comment_start',
+ external = 'text',
+ transitions = [
+ ['-', 'cdata_comment_start_dash'],
+ ['default', 'cdata_text'],
+ ])
+
+# Got '<!-'.
+state(name = 'cdata_comment_start_dash',
+ external = 'text',
+ transitions = [
+ ['-', 'cdata_comment_body'],
+ ['default', 'cdata_text']
+ ])
+
+# Inside a comment
+state(name = 'cdata_comment_body',
+ external = 'text',
+ transitions = [
+ ['-', 'cdata_comment_dash'],
+ ['default', 'cdata_comment_body']
+ ])
+
+# Got '-' inside a comment.
+state(name = 'cdata_comment_dash',
+ external = 'text',
+ transitions = [
+ ['-', 'cdata_comment_dash_dash'],
+ ['default', 'cdata_comment_body']
+ ])
+
+# Got '--' inside a comment.
+state(name = 'cdata_comment_dash_dash',
+ external = 'text',
+ transitions = [
+ ['-', 'cdata_comment_dash_dash'],
+ ['>', 'cdata_text'],
+ ['default', 'cdata_comment_body']
+ ])
+
+# CDATA processing
+#
+# To simplify the code, we treat RCDATA and CDATA sections the same since the
+# differences between them don't affect the context we are in.
+state(name = 'cdata_text',
+ external = 'text',
+ transitions = [
+ ['<', 'cdata_lt'],
+ ['default', 'cdata_text']
+ ])
+
+# Possible beginning of the closing tag.
+state(name = 'cdata_lt',
+ external = 'text',
+ transitions = [
+ ['/', 'cdata_may_close'],
+ ['!', 'cdata_comment_start'],
+ ['default', 'cdata_text']
+ ])
+
+# If we encounter </tag where tag matches the last opened tag, we exit the
+# CDATA section. Part of this logic is handled in the code.
+state(name = 'cdata_may_close',
+ external = 'text',
+ transitions = [
+ ['idtag', 'cdata_may_close'],
+ ['>', 'text'],
+ ['space', 'tag_space'],
+ ['default', 'cdata_text']
+ ])
+
+# The next states are used for specialized parser modes.
+state(name = 'js_file',
+ external = 'js_file',
+ transitions = [
+ ['default', 'js_file']
+ ])
+
+# TODO(falmeida): Having css_file and js_file as the external name doesn't make
+# sense. This should instead be text and the js/css state be
+# returned by # in_js() and in_css().
+state(name = 'css_file',
+ external = 'css_file',
+ transitions = [
+ ['default', 'css_file']
+ ])
+
+state(name = 'null',
+ external = 'text',
+ transitions = [
+ ['default', 'null']
+ ])
+