summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Yorhel <git@yorhel.nl> 2013-09-24 11:20:31 +0200
committer Yorhel <git@yorhel.nl> 2013-09-24 11:24:07 +0200
commit 80e73e201e68a09b399bf192d97c332df59ad980 (patch)
tree db6f9d545fcdb77338e5219e2b8eda403dd80b58
parent cade07b53bfa48b2347d579ce8bd655cae432534 (diff)
Fix returning of ']' chars within CDATA + de-generalized ?-in-PI
I thought I'd handle the ?-in-PI and ]-in-CDATA problems with a more general solution, but realized that it wasn't any simpler or smaller than these specific solutions.
-rw-r--r-- test/content01.out 2
-rw-r--r-- test/content01.xml 3
-rwxr-xr-x yxml-gen.pl 2
-rw-r--r-- yxml-states 13
-rw-r--r-- yxml.c 40
-rw-r--r-- yxml.c.in 30
-rw-r--r-- yxml.h 7
7 files changed, 55 insertions, 42 deletions
diff --git a/test/content01.out b/test/content01.out
index 333e1e8..572a916 100644
--- a/test/content01.out
+++ b/test/content01.out
@@ -11,6 +11,6 @@ elemstart refs
content
data ! !
elemend
-data \x0aCDATA!\x0a[[CD<a/> <!-- no comment -->&amp;<?NotaPI?>&notaref;\x0aBug: The following character *should* be parsed as a DATA token, but currently isn't: \x0aThis does work, however: ]]\x0a
+data \x0aCDATA!\x0a[[CD<a/> <!-- no comment -->&amp;<?NotaPI?>&notaref;\x0a]x]]y]]]z]]\x0a
elemend
ok
diff --git a/test/content01.xml b/test/content01.xml
index f9c74a8..4689c13 100644
--- a/test/content01.xml
+++ b/test/content01.xml
@@ -4,6 +4,5 @@
<refs>&#x20;&#33;&#x0020;&#0033;</refs>
<![CDATA[CDATA!]]>
<![CDATA[[[CD<a/> <!-- no comment -->&amp;<?NotaPI?>&notaref;
-Bug: The following character *should* be parsed as a DATA token, but currently isn't: ]
-This does work, however: ]]]]>
+]x]]y]]]z]]]]>
</a>
diff --git a/yxml-gen.pl b/yxml-gen.pl
index affaa84..dde243a 100755
--- a/yxml-gen.pl
+++ b/yxml-gen.pl
@@ -49,7 +49,7 @@ sub acttoc {
# return an error code. Functions that may return an error should NOT be
# called in the same state as other functions.
for(@_) {
- push @r, "yxml_$1(x, ch)" if /^([a-z_]+)$/;
+ push @r, "yxml_$1(x, ch)" if /^([a-z0-9_]+)$/;
push @c, "x->$1 = ch" if /^\$(.+)$/;
if(/^"/) {
push @c, (
diff --git a/yxml-states b/yxml-states
index c4fb8fe..acf4ec5 100644
--- a/yxml-states
+++ b/yxml-states
@@ -120,18 +120,15 @@ comment4 '>' @
# PI, starting from '<?', returns to @
pi0 NameStart pistart pi1
pi1 Name piname pi1; '?' pinameend pi4; SP pinameend pi2
-pi2 '?' datahold pi3; Char dataset pi2
-pi3 '>' pivalend @; Char datarelease pi2
+pi2 '?' pi3; Char dataset pi2
+pi3 '>' pivalend @; Char datapi pi2
pi4 '>' pivalend @
# CDSect, starting from '<![DATA[', returns to misc2
-# TODO: "<![CDATA[ ] ]]>" is parsed correctly, but the ']' is not sent to
-# the application as a DATA token. No idea how to easily fix that. Currently,
-# "<![CDATA[ ]]]]>" does work correctly.
cd0 ']' cd1; Char dataset cd0
-cd1 ']' cd2; Char dataset cd0
-cd2 ']' dataset cd2; '>' misc2; Char dataset cd0
+cd1 ']' cd2; Char datacd1 cd0
+cd2 ']' dataset cd2; '>' misc2; Char datacd2 cd0
# Doctype, starting from '<!DOCTYPE', returns to misc1
@@ -173,6 +170,6 @@ elem3 '>' selfclose misc2
attr0 Name attrname attr0; SP attrnameend attr1; '=' attrnameend attr2
attr1 SP attr1; '=' attr2
attr2 SP attr2; '\''|'"' $quote attr3
-attr3 AttValue attrvalset attr3; '&' refstart attr4; $quote attrvalend elem2
+attr3 AttValue dataattr attr3; '&' refstart attr4; $quote attrvalend elem2
attr4 Ref ref attr4; '\x3b' refend attr3
diff --git a/yxml.c b/yxml.c
index 5bafe38..c451816 100644
--- a/yxml.c
+++ b/yxml.c
@@ -128,24 +128,32 @@ static inline int yxml_dataset(yxml_t *x, unsigned ch) {
}
-static inline int yxml_datahold(yxml_t *x, unsigned ch) {
- yxml_setchar(x->data, ch);
- x->data[1] = 0;
- return YXML_OK;
+static inline int yxml_datapi(yxml_t *x, unsigned ch) {
+ x->data[0] = '?';
+ yxml_setchar(x->data+1, ch);
+ x->data[2] = 0;
+ return YXML_DATA;
}
-static inline int yxml_datarelease(yxml_t *x, unsigned ch) {
- char *r = x->data;
- while(*r)
- r++;
- yxml_setchar(r, ch);
- r[1] = 0;
+static inline int yxml_datacd1(yxml_t *x, unsigned ch) {
+ x->data[0] = ']';
+ yxml_setchar(x->data+1, ch);
+ x->data[2] = 0;
return YXML_DATA;
}
-static inline int yxml_attrvalset(yxml_t *x, unsigned ch) {
+static inline int yxml_datacd2(yxml_t *x, unsigned ch) {
+ x->data[0] = ']';
+ x->data[1] = ']';
+ yxml_setchar(x->data+2, ch);
+ x->data[3] = 0;
+ return YXML_DATA;
+}
+
+
+static inline int yxml_dataattr(yxml_t *x, unsigned ch) {
/* Normalize attribute values according to the XML spec section 3.3.3. */
return yxml_dataset(x, ch == 0x9 || ch == 0xa ? 0x20 : ch);
}
@@ -357,7 +365,7 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) {
break;
case YXMLS_attr3:
if(yxml_isAttValue(ch))
- return yxml_attrvalset(x, ch);
+ return yxml_dataattr(x, ch);
if(ch == (unsigned char)'&') {
x->state = YXMLS_attr4;
return yxml_refstart(x, ch);
@@ -390,7 +398,7 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) {
}
if(yxml_isChar(ch)) {
x->state = YXMLS_cd0;
- return yxml_dataset(x, ch);
+ return yxml_datacd1(x, ch);
}
break;
case YXMLS_cd2:
@@ -402,7 +410,7 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) {
}
if(yxml_isChar(ch)) {
x->state = YXMLS_cd0;
- return yxml_dataset(x, ch);
+ return yxml_datacd2(x, ch);
}
break;
case YXMLS_comment0:
@@ -792,7 +800,7 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) {
case YXMLS_pi2:
if(ch == (unsigned char)'?') {
x->state = YXMLS_pi3;
- return yxml_datahold(x, ch);
+ return YXML_OK;
}
if(yxml_isChar(ch))
return yxml_dataset(x, ch);
@@ -804,7 +812,7 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) {
}
if(yxml_isChar(ch)) {
x->state = YXMLS_pi2;
- return yxml_datarelease(x, ch);
+ return yxml_datapi(x, ch);
}
break;
case YXMLS_pi4:
diff --git a/yxml.c.in b/yxml.c.in
index cc68f50..fe30728 100644
--- a/yxml.c.in
+++ b/yxml.c.in
@@ -64,24 +64,32 @@ static inline int yxml_dataset(yxml_t *x, unsigned ch) {
}
-static inline int yxml_datahold(yxml_t *x, unsigned ch) {
- yxml_setchar(x->data, ch);
- x->data[1] = 0;
- return YXML_OK;
+static inline int yxml_datapi(yxml_t *x, unsigned ch) {
+ x->data[0] = '?';
+ yxml_setchar(x->data+1, ch);
+ x->data[2] = 0;
+ return YXML_DATA;
+}
+
+
+static inline int yxml_datacd1(yxml_t *x, unsigned ch) {
+ x->data[0] = ']';
+ yxml_setchar(x->data+1, ch);
+ x->data[2] = 0;
+ return YXML_DATA;
}
-static inline int yxml_datarelease(yxml_t *x, unsigned ch) {
- char *r = x->data;
- while(*r)
- r++;
- yxml_setchar(r, ch);
- r[1] = 0;
+static inline int yxml_datacd2(yxml_t *x, unsigned ch) {
+ x->data[0] = ']';
+ x->data[1] = ']';
+ yxml_setchar(x->data+2, ch);
+ x->data[3] = 0;
return YXML_DATA;
}
-static inline int yxml_attrvalset(yxml_t *x, unsigned ch) {
+static inline int yxml_dataattr(yxml_t *x, unsigned ch) {
/* Normalize attribute values according to the XML spec section 3.3.3. */
return yxml_dataset(x, ch == 0x9 || ch == 0xa ? 0x20 : ch);
}
diff --git a/yxml.h b/yxml.h
index 8b87d0d..ec22db5 100644
--- a/yxml.h
+++ b/yxml.h
@@ -77,9 +77,10 @@ typedef struct {
/* The last read character(s) of an attribute value, element data, or
* processing instruction. Changed after YXML_DATA and only valid until the
* next yxml_parse() call. Usually, this string only consists of a single
- * character, but multiple characters may be returned in the following case:
- * - "<?SomePI ?x ?>": The two characters "?x" are returned in a single
- * data token.
+ * character, but multiple characters are returned in the following cases:
+ * - "<?SomePI ?x ?>": The two characters "?x"
+ * - "<![CDATA[ ]x ]]>": The two characters "]x"
+ * - "<![CDATA[ ]]x ]]>": The three characters "]]x"
*/
char data[8];