# Copyright 2008 Google Inc. All Rights Reserved.
# Author: falmeida@google.com (Filipe Almeida)

# TODO(falmeida): Add more descriptive names to the states and drop the
# abbreviations.
# TODO(falmeida): Reorder the states so that it's easier to read.
# TODO(falmeida): Support CDATA blocks in the form:
#
# NOTE(review): the end of the TODO above and whatever followed it appear to
# have been lost from this file; only this tail survives:
#     ', '>')
# Transitions below use conditions named '<' and '>' whose definitions are not
# visible here -- presumably they lived in the lost span. TODO confirm and
# restore from a clean copy.

# Each condition() pairs a name (first argument) with the characters it
# matches (second argument); transitions refer to conditions by name.
condition('=', '=')

# TODO(falmeida): This is not the correct expression. tag and attribute names
# can only consist of alpha character.
condition('id', 'A-Za-z0-9_:-')
condition('idtag', 'A-Za-z0-9/_:-')

# Whitespace according to: http://www.w3.org/TR/html401/struct/text.html#h-9.1
condition('space', ' \t\n\r')
condition('!', '!')
condition('q', '\'')
condition('dq', '\"')
condition('/', '/')
condition('*', '*')
condition('-', '-')
condition('?', '?')
condition('lf', '\n')
condition('quote', '\\')

# TODO(falmeida): This default rule is a hack and shouldn't be here.
condition('default', '[:default:]')

# Each state() lists its transitions as [condition name, next state name]
# pairs.

# Plain text outside of any tag.
state(name = 'text',
      external = 'text',
      transitions = [
        ['<', 'tag_start'],
        ['default', 'text']
      ])

# When we found the < character in text.
# Tag opening is defined in the HTML5 draft here:
# http://www.whatwg.org/specs/web-apps/current-work/#tag-open-state
# We don't exactly follow this and are much more loose in order to mimic the way
# the major browsers behave.
state(name = 'tag_start',
      external = 'tag',
      transitions = [
        ['idtag', 'tag_name'],
        ['?', 'pi'],
        ['!', 'declaration_start'],
        ['<', 'tag_start'],
        ['default', 'text']
      ])

# Name of the tag. Includes the closing tag character '/'.
state(name = 'tag_name',
      external = 'tag',
      transitions = [
        ['idtag', 'tag_name'],
        ['space', 'tag_space'],
        ['>', 'tag_close']
      ])

# HTML declaration and comment parsing
#
# We don't expose declaration state because at this point we only want to
# ensure that we are parsing them correctly so we don't get out of sync.
# This is specifically made for DOCTYPE declarations and won't work if DTD's
# are defined inside the declaration.
# The HTML5 spec says we should specifically look for the string
#
# NOTE(review): the string quoted above and the start of the state definition
# that followed appear to have been lost from this file. The tag_start state
# sends '!' to a 'declaration_start' state that is not defined anywhere in the
# visible text, so this is presumably its surviving tail -- TODO restore from
# a clean copy:
#     '', 'text'], ['default', 'declaration_body'] ])

# Inside a declaration. We stay here until a '>' takes us back to text.
state(name = 'declaration_body',
      external = 'text',
      transitions = [
        ['>', 'text'],
        ['default', 'declaration_body']
      ])

# Got
#
# NOTE(review): the rest of the "Got '...'" comment, and any definitions that
# sat between it and comment_body, appear to have been lost. No visible
# transition enters the comment states from outside -- TODO restore.

# Inside a comment. A '-' may be the start of the closing sequence.
state(name = 'comment_body',
      external = 'comment',
      transitions = [
        ['-', 'comment_dash'],
        ['default', 'comment_body']
      ])

# Got '-' inside a comment.
state(name = 'comment_dash',
      external = 'comment',
      transitions = [
        ['-', 'comment_dash_dash'],
        ['default', 'comment_body']
      ])

# Got '--' inside a comment. Further dashes keep us here; '>' ends the comment.
state(name = 'comment_dash_dash',
      external = 'comment',
      transitions = [
        ['-', 'comment_dash_dash'],
        ['>', 'text'],
        ['default', 'comment_body']
      ])

# XML Processing instruction parsing according to:
# http://www.w3.org/TR/REC-xml/#sec-pi
#
# Everything between the characters '<?' and '?>' is considered to be part of
# the processing instruction.
state(name = 'pi',
      external = 'text',
      transitions = [
        ['?', 'pi_may_end'],
        ['default', 'pi']
      ])

# Got '?' inside a processing instruction; only a following '>' ends it.
state(name = 'pi_may_end',
      external = 'text',
      transitions = [
        ['>', 'text'],
        ['default', 'pi']
      ])

# Whitespace between tag name, attributes.
state(name = 'tag_space',
      external = 'tag',
      transitions = [
        ['>', 'tag_close'],
        ['space', 'tag_space'],
        ['id', 'attr'],
        ['/', 'tag_space']
      ])

# Just saw the '>' that ends a tag.
state(name = 'tag_close',
      external = 'text',
      transitions = [
        ['<', 'tag_start'],
        ['default', 'text']
      ])

# Name of the attribute.
state(name = 'attr',
      external = 'attr',
      transitions = [
        ['id', 'attr'],
        ['>', 'tag_close'],
        ['/', 'tag_space'],
        ['=', 'value'],
        ['space', 'attr_space']
      ])

# After the attribute name.
state(name = 'attr_space',
      external = 'attr',
      transitions = [
        ['>', 'tag_close'],
        ['space', 'attr_space'],
        ['id', 'attr'],
        ['/', 'tag_space'],
        ['=', 'value']
      ])

# Expecting a value, after attribute=
state(name = 'value',
      external = 'value',
      transitions = [
        ['q', 'value_q_start'],
        ['dq', 'value_dq_start'],
        ['space', 'value'],
        ['>', 'tag_close'],
        ['default', 'value_text']
      ])

# Unquoted attribute value.
state(name = 'value_text',
      external = 'value',
      transitions = [
        ['>', 'tag_close'],
        ['space', 'tag_space'],
        ['default', 'value_text']
      ])

# First character of a single quoted attribute value.
state(name = 'value_q_start',
      external = 'value',
      transitions = [
        ['q', 'tag_space'],
        ['default', 'value_q']
      ])

# In the middle of a single quoted attribute value.
state(name = 'value_q',
      external = 'value',
      transitions = [
        ['q', 'tag_space'],
        ['default', 'value_q']
      ])

# First character of a double quoted attribute value.
state(name = 'value_dq_start',
      external = 'value',
      transitions = [
        ['dq', 'tag_space'],
        ['default', 'value_dq']
      ])

# In the middle of a double quoted attribute value.
state(name = 'value_dq',
      external = 'value',
      transitions = [
        ['dq', 'tag_space'],
        ['default', 'value_dq']
      ])

# CDATA escaping text spans.
# TODO(falmeida): These states should go after cdata_text.

# Got
#
# NOTE(review): the rest of the "Got '...'" comment and the definitions that
# followed it appear to have been lost from this file. cdata_lt sends '!' to
# 'cdata_comment_start', and the surviving tail below mentions
# 'cdata_comment_body'; neither state is defined in the visible text -- TODO
# restore from a clean copy:
#     '', 'cdata_text'], ['default', 'cdata_comment_body'] ])

# CDATA processing
#
# To simplify the code, we treat RCDATA and CDATA sections the same since the
# differences between them don't affect the context we are in.
state(name = 'cdata_text',
      external = 'text',
      transitions = [
        ['<', 'cdata_lt'],
        ['default', 'cdata_text']
      ])

# Possible beginning of the closing tag.
state(name = 'cdata_lt',
      external = 'text',
      transitions = [
        ['/', 'cdata_may_close'],
        ['!', 'cdata_comment_start'],
        ['default', 'cdata_text']
      ])

# If we encounter
#
# NOTE(review): the rest of this comment and the head of the state definition
# it introduced appear to have been lost. cdata_lt sends '/' to a
# 'cdata_may_close' state that is not defined in the visible text, so this is
# presumably its surviving tail -- TODO restore from a clean copy:
#     ', 'text'], ['space', 'tag_space'], ['default', 'cdata_text'] ])

# The next states are used for specialized parser modes.
# Each of them loops on itself for every input.
state(name = 'js_file',
      external = 'js_file',
      transitions = [
        ['default', 'js_file']
      ])

# TODO(falmeida): Having css_file and js_file as the external name doesn't make
#                 sense. This should instead be text and the js/css state be
#                 returned by in_js() and in_css().
state(name = 'css_file',
      external = 'css_file',
      transitions = [
        ['default', 'css_file']
      ])

state(name = 'null',
      external = 'text',
      transitions = [
        ['default', 'null']
      ])