summaryrefslogtreecommitdiff
path: root/yxml-states
blob: eeede4a30ab12b8092c52f21c28b42642d93b8ca (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#  Copyright (c) 2013 Yoran Heling
#
#  Permission is hereby granted, free of charge, to any person obtaining
#  a copy of this software and associated documentation files (the
#  "Software"), to deal in the Software without restriction, including
#  without limitation the rights to use, copy, modify, merge, publish,
#  distribute, sublicense, and/or sell copies of the Software, and to
#  permit persons to whom the Software is furnished to do so, subject to
#  the following conditions:
#
#  The above copyright notice and this permission notice shall be included
#  in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
#  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
#  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


# Format of this file (informal):
#
#  Line = State Desc (';' Desc)*
#  Desc = Cond Act* Next
#  Cond = FunctionName              # yxml_isFunctionName(char)
#       | '$' Varname               # match character in Varname
#       | C-char ('|' C-char)*
#  Act  = FunctionName              # yxml_FunctionName(x, char)
#       | '$' Varname               # Store current char into Varname
#       | "string"                  # consume string before moving to next state
#  Next = State
#
# Basically, it's just a short notation for manually writing a DFA. The script
# that compiles this to C is pretty simple and stupid, which explains the
# somewhat crude syntax of this file. It'd probably be more convenient to
# modify ragel[1] to generate state machine code that can be used in the
# yxml_parse() API, but I haven't really looked into that yet. I'm also not
# sure how much control I'd lose over the size of the resulting state machine.
#
# 1. http://www.complang.org/ragel/

# Entry state of the DFA. A leading '\xef' must be followed by "\xbb\xbf"
# (together the three bytes of a UTF-8 byte order mark) before continuing in
# misc0; otherwise the input may start with SP characters or directly with '<'.
init        '\xef' "\xbb\xbf" misc0; SP misc0; '<' le0

# State numbers for the misc/le/lee/leq states:
#   0 = before XMLDecl, (prolog)
#   1 = before first element, (prolog/misc)
#   2 = inside element (content)
# And naming:
#   misc = Nothing special seen yet
#   le   = Seen '<'
#   lee  = Seen '<!'
#   leq  = Seen '<?'
#
# misc0 and misc1 only skip SP characters until a '<' is seen. misc2
# additionally hands every Char to setdata and starts a reference on '&'.
# misc2a feeds Ref characters to ref until the terminating ';' is seen, at
# which point refend is called. The ';' is written as '\x3b', presumably
# because a literal ';' separates Descs in this file.
misc0      SP misc0; '<' le0
misc1      SP misc1; '<' le1
misc2      '<' le2; '&' refstart misc2a; Char setdata misc2
misc2a     Ref ref misc2a; '\x3b' refend misc2

# Dispatch on the character that follows '<'.
#   - Only le0 can still lead to an XMLDecl (via leq0); le1 and le2 send '<?'
#     straight to the PI states.
#   - le0 shares lee1 with le1 for '<!'; there is no separate lee0 state.
#   - End tags ('/') and CDATA sections (via lee2) are only reachable from le2.
#   - A NameStart character always begins an element (elemstart).
le0        '!' lee1; '?' leq0; NameStart elemstart elem0
le1        '!' lee1; '?' pi0; NameStart elemstart elem0
le2        '!' lee2; '?' pi0; '/' etag0; NameStart elemstart elem0

# After '<!': '-' begins a comment; in lee1 a 'D' must be followed by "OCTYPE",
# in lee2 a '[' must be followed by "CDATA[".
# After '<?' in state 0: 'x' must be followed by "ml" and enters the XMLDecl
# states; any other NameStart character is treated as the first character of
# a PI name.
# NOTE(review): since "ml" is consumed right after 'x', a PI in this position
#   whose name starts with 'x' but is not "xml" looks like it cannot be
#   parsed -- confirm against the generated code.
lee1       '-' comment0; 'D' "OCTYPE" dt0
lee2       '-' comment0; '[' "CDATA[" cd0
leq0       'x' "ml" xmldecl0; NameStart pi1


# XMLDecl, starting from '<?xml', returns to misc1
#   xmldecl0/1 = before 'version' (at least one SP is required first)
#   xmldecl2/3 = after VersionInfo; 'encoding' or 'standalone' may follow
#   xmldecl4/5 = after EncodingDecl; only 'standalone' may follow
#   xmldecl6   = after SDDecl; only SP and '?' may follow
#   xmldecl7   = seen '?', expecting '>'
# States 2 and 4 require an SP before the next keyword; states 3 and 5 are
# reached once that SP has been seen. Each keyword is matched by its first
# character followed by the remaining characters as a "string" action.
xmldecl0    SP xmldecl1
xmldecl1    SP xmldecl1; 'v' "ersion" ver0
xmldecl2    SP xmldecl3; '?' xmldecl7
xmldecl3    SP xmldecl3; '?' xmldecl7; 'e' "ncoding" enc0; 's' "tandalone" std0
xmldecl4    SP xmldecl5; '?' xmldecl7
xmldecl5    SP xmldecl5; '?' xmldecl7; 's' "tandalone" std0
xmldecl6    SP xmldecl6; '?' xmldecl7
xmldecl7    '>' misc1

# VersionInfo, after 'version', returns to xmldecl2
# The opening quote character is stored in $quote and must be followed by the
# literal "1.". ver2 then requires one Num character; ver3 accepts further Num
# characters until the character stored in $quote is seen again.
ver0       SP ver0; '=' ver1
ver1       SP ver1; '\''|'"' $quote "1." ver2
ver2       Num ver3
ver3       Num ver3; $quote xmldecl2

# EncodingDecl, after 'encoding', returns to xmldecl4
# TODO: Pass the encoding value to the application?
# The opening quote character is stored in $quote. The value must start with
# an Alpha character (enc2), continues with EncName characters (enc3) and ends
# at the character stored in $quote. No action is invoked on the value.
enc0       SP enc0; '=' enc1
enc1       SP enc1; '\''|'"' $quote enc2
enc2       Alpha enc3
enc3       EncName enc3; $quote xmldecl4

# SDDecl, after 'standalone', returns to xmldecl6
# TODO: Pass the standalone flag to the application?
# The opening quote character is stored in $quote. The value must be exactly
# "yes" or "no" (std2) and must be closed by the character stored in $quote
# (std3). No action is invoked on the value.
std0       SP std0; '=' std1
std1       SP std1; '\''|'"' $quote std2
std2       'y' "es" std3; 'n' "o" std3
std3       $quote xmldecl6


# Comment, after '<!-', returns to misc1 or misc2
#   comment0 = expecting the second '-' of '<!--'
#   comment1 = first character of the comment body (must satisfy CommentStart)
#   comment2 = inside the comment body
#   comment3 = seen one '-' inside the body
#   comment4 = seen '--'; only '>' is accepted here
# No action is invoked on the comment contents.
# NOTE(review): the Next state written after retmisc is comment4 itself;
#   presumably yxml_retmisc() selects the actual misc state to return to --
#   confirm against its implementation.
comment0   '-' comment1
comment1   CommentStart comment2
comment2   '-' comment3; Char comment2
comment3   '-' comment4; Char comment2
comment4   '>' retmisc comment4


# PI, starting from '<?', returns to misc1 or misc2
# TODO: Verify that the PI name isn't /xml/i
# TODO: Pass the name and contents to the application
#   pi0 = expecting the first character of the PI name
#   pi1 = inside the PI name; an SP ends the name
#   pi2 = inside the PI contents
#   pi3 = seen '?'; '>' ends the PI, any other Char goes back to pi2
# NOTE(review): pi1 has no rule for '?', so a PI without an SP after its name
#   (e.g. '<?name?>') looks like it is not accepted -- confirm.
# NOTE(review): the Next state written after retmisc is pi3 itself; presumably
#   yxml_retmisc() selects the actual misc state to return to -- confirm.
pi0        NameStart pi1
pi1        Name pi1; SP pi2
pi2        '?' pi3; Char pi2
pi3        '>' retmisc pi3; Char pi2


# CDSect, starting from '<![CDATA[', returns to misc2
#   cd0 = inside the section; every Char other than ']' is passed to setdata
#   cd1 = seen ']'
#   cd2 = seen ']]'; only '>' is accepted here
# NOTE(review): when the ']' seen in cd1 is followed by a character other than
#   ']', only that following character is passed to setdata; the ']' itself
#   does not appear to be reported. Likewise no rule in cd2 matches anything
#   but '>', so "]]" not followed by '>' (including "]]]>") is not handled.
#   Confirm whether this is intended.
cd0        ']' cd1; Char setdata cd0
cd1        ']' cd2; Char setdata cd0
cd2        '>' misc2


# Doctype, starting from '<!DOCTYPE', returns to misc1
# TODO: This is a hack, all we do is read until we find a '>', not
#   validating its content. This hack fails if the DTD contains a '>'
#   character, which is very possible. Unfortunately, just figuring out where a
#   DTD ends already requires a rather elaborate parser. :-(
# No action is invoked in this state, so the doctype contents are skipped.
dt0        '>' misc1; Char dt0


# End tag, after '</', returns to misc2
# XXX: It's not actually necessary to validate the characters, since the
#   'elemclose' function already verifies (incrementally) that the name is
#   equivalent to the corresponding <Open ..> tag. The only difference is that
#   with the code below, </x/z> will result in ESYN, whereas a non-validating
#   version would give ECLOSE.
#   etag0 = expecting the first character of the name
#   etag1 = inside the name; each name character is passed to elemclose
#   etag2 = after the name and an SP; only further SP or '>' is accepted
# elemcloseend is invoked on the character that ends the name (SP or '>').
etag0      NameStart elemclose etag1
etag1      Name elemclose etag1; SP elemcloseend etag2; '>' elemcloseend misc2
etag2      SP etag2; '>' misc2


# Element, after '<X', returns to misc2
#   elem0 = inside the element name; elemnameend is invoked on the character
#           that ends the name (SP, '/' or '>')
#   elem1 = after an SP; an attribute may start here (NameStart)
#   elem2 = directly after an attribute value; an SP is required before
#           another attribute can start
#   elem3 = seen '/', expecting '>' (selfclose)
# attrsend is invoked whenever the tag ends, via '/' or '>'.
elem0      Name elemname elem0; SP elemnameend elem1; '/' elemnameend attrsend elem3; '>' elemnameend attrsend misc2
elem1      SP elem1; '/' attrsend elem3; '>' attrsend misc2; NameStart attrstart attr0
elem2      SP elem1; '/' attrsend elem3; '>' attrsend misc2
elem3      '>' selfclose misc2

# Attribute, after NameStart, returns to elem2
#   attr0 = inside the attribute name; attrnameend is invoked on the character
#           that ends the name (SP or '=')
#   attr1 = after the name and an SP, expecting '='
#   attr2 = after '=', expecting the opening quote (stored in $quote)
#   attr3 = inside the value; AttValue characters are passed to setdata, '&'
#           starts a reference, the character stored in $quote ends the value
#   attr4 = inside a reference; Ref characters are passed to ref until the
#           terminating ';' (written as '\x3b') invokes refend
attr0      Name attrname attr0; SP attrnameend attr1; '=' attrnameend attr2
attr1      SP attr1; '=' attr2
attr2      SP attr2; '\''|'"' $quote attr3
attr3      AttValue setdata attr3; '&' refstart attr4; $quote elem2
attr4      Ref ref attr4; '\x3b' refend attr3