# Copyright (c) 2013-2014 Yoran Heling
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Format of this file (informal):
#
#   Line  = State Desc (';' Desc)*
#   Desc  = Cond Act* Next
#   Cond  = FunctionName          # yxml_isFunctionName(char)
#         | '$' Varname           # match character in Varname
#         | C-char ('|' C-char)*
#   Act   = FunctionName          # yxml_FunctionName(x, char)
#         | '$' Varname           # Store current char into Varname
#         | '@' State             # Remember given state as future next state
#         | "string"              # consume string before moving to next state
#   Next  = State                 # Go to the given state
#         | '@'                   # Go to a previously remembered state
#
# Basically, it's just a short notation for manually writing a DFA. The script
# that compiles this to C is pretty simple and stupid, which explains the
# somewhat crude syntax of this file. It'd probably be more convenient to
# modify ragel[1] to generate state machine code that can be used in the
# yxml_parse() API, but I haven't really looked into that yet.
# I'm also not sure how much control I'd lose over the size of the resulting
# state machine.
#
# 1. http://www.complang.org/ragel/
#
# Note that the '@' state remembering functionality and "string" consuming
# action use the same variable to store the next state. This means that string
# consuming should not be used when the last @ state still needs to be
# remembered.

# NOTE(review): this copy of the file is damaged.  In several places the text
# from a '<' up to a later '>' appears to have been stripped, which removed
# parts of comments and whole runs of state definitions.  Each damaged line is
# kept verbatim below, as a comment, and is flagged individually.  The rules
# that survive refer to states for which no definition survives in the text
# under review (misc0, misc1, le0, xmldecl4, xmldecl6, xmldecl8, pi0,
# comment1); restore them from a pristine copy rather than guessing.

# Entry state.  The byte sequence "\xef\xbb\xbf" (the UTF-8 byte order mark)
# is consumed as a unit before moving on to misc0.
init   '\xef' "\xbb\xbf" misc0; SP misc0; '<' le0

# State numbers for the misc/le/lee/leq states:
#   0 = before XMLDecl,       (prolog)
#   1 = before first element, (prolog/misc)
#   2 = inside element        (content)
#   3 = after root element    (misc)
# And naming:
#   misc = Nothing special seen yet
#   le   = Seen '<'
# NOTE(review): the next line is damaged.  The rest of the naming table (the
# header above also mentions "leq") and every rule up to the one whose tail
# is "misc1" are missing.
#   lee  = Seen '' misc1

# VersionInfo, after 'version', returns to xmldecl4
# The opening quote character is stored in $quote and must be matched again
# by the closing quote; "1." is consumed as a fixed string.
ver0   SP ver0; '=' ver1
ver1   SP ver1; '\''|'"' $quote "1." ver2
# NOTE(review): the trailing ';' on ver2 is kept exactly as found; confirm
# that the compiler script accepts an empty final Desc.
ver2   Num ver3;
ver3   Num ver3; $quote xmldecl4

# EncodingDecl, after 'e', returns to xmldecl6
# TODO: Pass the encoding value to the application?
enc0   SP enc0; '=' enc1
enc1   SP enc1; '\''|'"' $quote enc2
enc2   Alpha enc3
enc3   EncName enc3; $quote xmldecl6

# SDDecl, after 'standalone', returns to xmldecl8
# TODO: Pass the standalone flag to the application?
std0   SP std0; '=' std1
std1   SP std1; '\''|'"' $quote std2
std2   'y' "es" std3; 'n' "o" std3
std3   $quote xmldecl8

# NOTE(review): the next line is damaged.  The rest of the comment and the
# comment-handling rules are missing; only the tail of the last rule remains.
# Comment, after '' @

# NOTE(review): the next line is damaged.  The rest of the comment and the
# rules before pi4 are missing; what follows the quotes is the tail of a rule
# whose state name and first condition are gone.
# PI, starting from '' pivalend @; Char datapi2 pi2
pi4    '>' pivalend @

# NOTE(review): the next line is damaged.  The rest of the comment and the
# CDATA rules are missing; what follows the quotes is the tail of the last
# rule.
# CDSect, starting from '' misc2; Char datacd2 cd0

# NOTE(review): the next line is damaged.  The description and example that
# followed "starting from" are missing.
# Doctype, starting from ' >
#
# Only the last '>' is correctly recognized as the end of the declaration.
# Any other '>' found to end a tag/PI/comment, or found within quotes,
# comments or a PI, is ignored.
# TODO: This still fails on conditional sections, which may nest.
dt0    '>' misc1; '\''|'"' $quote @dt0 dt1; '<' dt2; Char dt0
dt1    $quote @; Char dt1
dt2    '?' @dt0 pi0; '!' dt3
dt3    '-' @dt0 comment1; Char dt4
dt4    '\''|'"' $quote @dt4 dt1; '>' dt0; Char dt4

# NOTE(review): the next line is damaged.  Text between "after '" and "tag."
# is missing.
# End tag, after ' tag.
# NOTE(review): the sentence below continues from the preceding line of the
# file.  An example appears to be missing between "below," and "will": text
# from a '<' up to a later '>' seems to have been stripped from this copy.
# The only difference is that
# with the code below, will result in ESYN, whereas a non-validating
# version would give ECLOSE.
etag0  NameStart elemclose etag1
etag1  Name elemclose etag1; SP elemcloseend etag2; '>' elemcloseend misc2
etag2  SP etag2; '>' misc2

# NOTE(review): the next line is damaged in the same way.  The rest of the
# comment and the start of the first element rule are missing; what follows
# the quotes is the tail of that rule.  misc2 is referenced here, but no
# definition of it survives in the text under review.  Restore from a
# pristine copy rather than guessing.
# Element, after '' elemnameend misc2
elem1  SP elem1; '/' elem3; '>' misc2; NameStart attrstart attr0
elem2  SP elem1; '/' elem3; '>' misc2
elem3  '>' selfclose misc2

# Attribute, after NameStart, returns to elem2
# The opening quote character is stored in $quote at attr2 and must be
# matched again at attr3 to end the value.  '\x3b' is the ';' character,
# written as an escape because a literal ';' separates Descs in this file.
attr0  Name attrname attr0; SP attrnameend attr1; '=' attrnameend attr2
attr1  SP attr1; '=' attr2
attr2  SP attr2; '\''|'"' $quote attr3
attr3  AttValue dataattr attr3; '&' refstart attr4; $quote attrvalend elem2
attr4  Ref ref attr4; '\x3b' refattrval attr3