diff options
author | Andreas Baumann <abaumann@yahoo.com> | 2012-07-14 17:16:21 +0200 |
---|---|---|
committer | Andreas Baumann <abaumann@yahoo.com> | 2012-07-14 17:16:21 +0200 |
commit | 54cce110784d33d658b5f78286a98bee244a9eeb (patch) | |
tree | 9c4d998343e7dc88323ae8ef6d5a04c6b958df9c /streamhtmlparser/htmlparser_fsm.config | |
parent | fcb682cb1955d362390665330fdf476cab7dc10b (diff) | |
download | crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.gz crawler-54cce110784d33d658b5f78286a98bee244a9eeb.tar.bz2 |
added streamhtmlparser
Diffstat (limited to 'streamhtmlparser/htmlparser_fsm.config')
-rw-r--r-- | streamhtmlparser/htmlparser_fsm.config | 336 |
1 files changed, 336 insertions, 0 deletions
diff --git a/streamhtmlparser/htmlparser_fsm.config b/streamhtmlparser/htmlparser_fsm.config new file mode 100644 index 0000000..e80d055 --- /dev/null +++ b/streamhtmlparser/htmlparser_fsm.config @@ -0,0 +1,336 @@ +# Copyright 2008 Google Inc. All Rights Reserved. +# Author: falmeida@google.com (Filipe Almeida) + +# TODO(falmeida): Add more descriptive names to the states and drop the +# abbreviations. +# TODO(falmeida): Reorder the states so that it's easier to read. +# TODO(falmeida): Support CDATA blocks in the form: <![CDATA[. + +name = 'htmlparser' + +comment = 'Definition of a finite state machine for a subset of HTTP 4.1' + +condition('<', '<') +condition('>', '>') +condition('=', '=') + +# TODO(falmeida): This is not the correct expression. Tag and attribute names +# can only consist of alpha characters. +condition('id', 'A-Za-z0-9_:-') +condition('idtag', 'A-Za-z0-9/_:-') + +# Whitespace according to: http://www.w3.org/TR/html401/struct/text.html#h-9.1 +condition('space', ' \t\n\r') +condition('!', '!') +condition('q', '\'') +condition('dq', '\"') +condition('/', '/') +condition('*', '*') +condition('-', '-') +condition('?', '?') +condition('lf', '\n') +condition('quote', '\\') + +# TODO(falmeida): This default rule is a hack and shouldn't be here. +condition('default', '[:default:]') + +state(name = 'text', + external = 'text', + transitions = [ + ['<', 'tag_start'], + ['default', 'text'] + ]) + +# When we found the < character in text. +# Tag opening is defined in the HTML5 draft here: +# http://www.whatwg.org/specs/web-apps/current-work/#tag-open-state +# We don't exactly follow this and are much looser in order to mimic the way +# the major browsers behave. +state(name = 'tag_start', + external = 'tag', + transitions = [ + ['idtag', 'tag_name'], + ['?', 'pi'], + ['!', 'declaration_start'], + ['<', 'tag_start'], + ['default', 'text'] + ]) + +# Name of the tag. Includes the closing tag character '/'. 
+state(name = 'tag_name', + external = 'tag', + transitions = [ + ['idtag', 'tag_name'], + ['space', 'tag_space'], + ['>', 'tag_close'] + ]) + +# HTML declaration and comment parsing +# +# We don't expose declaration state because at this point we only want to +# ensure that we are parsing them correctly so we don't get out of sync. +# This is specifically made for DOCTYPE declarations and won't work if DTDs +# are defined inside the declaration. +# The HTML5 spec says we should specifically look for the string '<!DOCTYPE HTML' +# but that will add a lot of unnecessary states, and unless we build a simple +# declarative way to unfold a string match into multiple states, I don't +# think it's worth worrying about for now. + +# Got '<!'. The next character will decide if we open a declaration or a +# comment. +state(name = 'declaration_start', + external = 'text', + transitions = [ + ['-', 'comment_open'], + ['>', 'text'], + ['default', 'declaration_body'] + ]) + +# Inside a declaration. E.g. <!DOCTYPE. We close when we see a '>'. +state(name = 'declaration_body', + external = 'text', + transitions = [ + ['>', 'text'], + ['default', 'declaration_body'] + ]) + +# Got '<!-'. +state(name = 'comment_open', + external = 'text', + transitions = [ + ['-', 'comment_body'], + ['default', 'text'] + ]) + +# Inside a comment. We only close when we see '-->' +state(name = 'comment_body', + external = 'comment', + transitions = [ + ['-', 'comment_dash'], + ['default', 'comment_body'] + ]) + +# Got '-' inside a comment. +state(name = 'comment_dash', + external = 'comment', + transitions = [ + ['-', 'comment_dash_dash'], + ['default', 'comment_body'] + ]) + +# Got '--' inside a comment. +state(name = 'comment_dash_dash', + external = 'comment', + transitions = [ + ['-', 'comment_dash_dash'], + ['>', 'text'], + ['default', 'comment_body'] + ]) + +# XML Processing instruction parsing according to: +# http://www.w3.org/TR/REC-xml/#sec-pi +# +# Everything between the characters <? 
and ?> is considered to be part of the +# processing instruction. +state(name = 'pi', + external = 'text', + transitions = [ + ['?', 'pi_may_end'], + ['default', 'pi'] + ]) + +state(name = 'pi_may_end', + external = 'text', + transitions = [ + ['>', 'text'], + ['default', 'pi'] + ]) + +# Whitespace between tag name, attributes. +state(name = 'tag_space', + external = 'tag', + transitions = [ + ['>', 'tag_close'], + ['space', 'tag_space'], + ['id', 'attr'], + ['/', 'tag_space'] + ]) + +state(name = 'tag_close', + external = 'text', + transitions = [ + ['<', 'tag_start'], + ['default', 'text'] + ]) + +# Name of the attribute. +state(name = 'attr', + external = 'attr', + transitions = [ + ['id', 'attr'], + ['>', 'tag_close'], + ['/', 'tag_space'], + ['=', 'value'], + ['space', 'attr_space'] + ]) + +# After the attribute name. +state(name = 'attr_space', + external = 'attr', + transitions = [ + ['>', 'tag_close'], + ['space', 'attr_space'], + ['id', 'attr'], + ['/', 'tag_space'], + ['=', 'value'] + ]) + +# Expecting a value, after attribute= +state(name = 'value', + external = 'value', + transitions = [ + ['q', 'value_q_start'], + ['dq', 'value_dq_start'], + ['space', 'value'], + ['>', 'tag_close'], + ['default', 'value_text'] + ]) + +# Unquoted attribute value. +state(name = 'value_text', + external = 'value', + transitions = [ + ['>', 'tag_close'], + ['space', 'tag_space'], + ['default', 'value_text'] + ]) + +# First character of a single quoted attribute value. +state(name = 'value_q_start', + external = 'value', + transitions = [ + ['q', 'tag_space'], + ['default', 'value_q'] + ]) + +# In the middle of a single quoted attribute value. +state(name = 'value_q', + external = 'value', + transitions = [ + ['q', 'tag_space'], + ['default', 'value_q'] + ]) + +# First character of a double quoted attribute value. 
+state(name = 'value_dq_start', + external = 'value', + transitions = [ + ['dq', 'tag_space'], + ['default', 'value_dq'] + ]) + +# In the middle of a double quoted attribute value. +state(name = 'value_dq', + external = 'value', + transitions = [ + ['dq', 'tag_space'], + ['default', 'value_dq'] + ]) + +# CDATA escaping text spans. +# TODO(falmeida): These states should go after cdata_text. + +# Got '<!' +state(name = 'cdata_comment_start', + external = 'text', + transitions = [ + ['-', 'cdata_comment_start_dash'], + ['default', 'cdata_text'], + ]) + +# Got '<!-'. +state(name = 'cdata_comment_start_dash', + external = 'text', + transitions = [ + ['-', 'cdata_comment_body'], + ['default', 'cdata_text'] + ]) + +# Inside a comment +state(name = 'cdata_comment_body', + external = 'text', + transitions = [ + ['-', 'cdata_comment_dash'], + ['default', 'cdata_comment_body'] + ]) + +# Got '-' inside a comment. +state(name = 'cdata_comment_dash', + external = 'text', + transitions = [ + ['-', 'cdata_comment_dash_dash'], + ['default', 'cdata_comment_body'] + ]) + +# Got '--' inside a comment. +state(name = 'cdata_comment_dash_dash', + external = 'text', + transitions = [ + ['-', 'cdata_comment_dash_dash'], + ['>', 'cdata_text'], + ['default', 'cdata_comment_body'] + ]) + +# CDATA processing +# +# To simplify the code, we treat RCDATA and CDATA sections the same since the +# differences between them don't affect the context we are in. +state(name = 'cdata_text', + external = 'text', + transitions = [ + ['<', 'cdata_lt'], + ['default', 'cdata_text'] + ]) + +# Possible beginning of the closing tag. +state(name = 'cdata_lt', + external = 'text', + transitions = [ + ['/', 'cdata_may_close'], + ['!', 'cdata_comment_start'], + ['default', 'cdata_text'] + ]) + +# If we encounter </tag where tag matches the last opened tag, we exit the +# CDATA section. Part of this logic is handled in the code. 
+state(name = 'cdata_may_close', + external = 'text', + transitions = [ + ['idtag', 'cdata_may_close'], + ['>', 'text'], + ['space', 'tag_space'], + ['default', 'cdata_text'] + ]) + +# The next states are used for specialized parser modes. +state(name = 'js_file', + external = 'js_file', + transitions = [ + ['default', 'js_file'] + ]) + +# TODO(falmeida): Having css_file and js_file as the external name doesn't make +# sense. This should instead be text and the js/css state be +# returned by in_js() and in_css(). +state(name = 'css_file', + external = 'css_file', + transitions = [ + ['default', 'css_file'] + ]) + +state(name = 'null', + external = 'text', + transitions = [ + ['default', 'null'] + ]) + |