diff options
author | Yorhel <git@yorhel.nl> | 2013-09-24 11:20:31 +0200 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2013-09-24 11:24:07 +0200 |
commit | 80e73e201e68a09b399bf192d97c332df59ad980 (patch) | |
tree | db6f9d545fcdb77338e5219e2b8eda403dd80b58 | |
parent | cade07b53bfa48b2347d579ce8bd655cae432534 (diff) |
Fix returning of ']' chars within CDATA + de-generalized ?-in-PI
I thought I'd handle the ?-in-PI and ]-in-CDATA problems with a more
general solution, but realized that it wasn't any simpler or smaller than
these specific solutions.
-rw-r--r-- | test/content01.out | 2 | ||||
-rw-r--r-- | test/content01.xml | 3 | ||||
-rwxr-xr-x | yxml-gen.pl | 2 | ||||
-rw-r--r-- | yxml-states | 13 | ||||
-rw-r--r-- | yxml.c | 40 | ||||
-rw-r--r-- | yxml.c.in | 30 | ||||
-rw-r--r-- | yxml.h | 7 |
7 files changed, 55 insertions, 42 deletions
diff --git a/test/content01.out b/test/content01.out index 333e1e8..572a916 100644 --- a/test/content01.out +++ b/test/content01.out @@ -11,6 +11,6 @@ elemstart refs content data ! ! elemend -data \x0aCDATA!\x0a[[CD<a/> <!-- no comment -->&<?NotaPI?>&notaref;\x0aBug: The following character *should* be parsed as a DATA token, but currently isn't: \x0aThis does work, however: ]]\x0a +data \x0aCDATA!\x0a[[CD<a/> <!-- no comment -->&<?NotaPI?>&notaref;\x0a]x]]y]]]z]]\x0a elemend ok diff --git a/test/content01.xml b/test/content01.xml index f9c74a8..4689c13 100644 --- a/test/content01.xml +++ b/test/content01.xml @@ -4,6 +4,5 @@ <refs> ! !</refs> <![CDATA[CDATA!]]> <![CDATA[[[CD<a/> <!-- no comment -->&<?NotaPI?>&notaref; -Bug: The following character *should* be parsed as a DATA token, but currently isn't: ] -This does work, however: ]]]]> +]x]]y]]]z]]]]> </a> diff --git a/yxml-gen.pl b/yxml-gen.pl index affaa84..dde243a 100755 --- a/yxml-gen.pl +++ b/yxml-gen.pl @@ -49,7 +49,7 @@ sub acttoc { # return an error code. Functions that may return an error should NOT be # called in the same state as other functions. for(@_) { - push @r, "yxml_$1(x, ch)" if /^([a-z_]+)$/; + push @r, "yxml_$1(x, ch)" if /^([a-z0-9_]+)$/; push @c, "x->$1 = ch" if /^\$(.+)$/; if(/^"/) { push @c, ( diff --git a/yxml-states b/yxml-states index c4fb8fe..acf4ec5 100644 --- a/yxml-states +++ b/yxml-states @@ -120,18 +120,15 @@ comment4 '>' @ # PI, starting from '<?', returns to @ pi0 NameStart pistart pi1 pi1 Name piname pi1; '?' pinameend pi4; SP pinameend pi2 -pi2 '?' datahold pi3; Char dataset pi2 -pi3 '>' pivalend @; Char datarelease pi2 +pi2 '?' pi3; Char dataset pi2 +pi3 '>' pivalend @; Char datapi pi2 pi4 '>' pivalend @ # CDSect, starting from '<![DATA[', returns to misc2 -# TODO: "<![CDATA[ ] ]]>" is parsed correctly, but the ']' is not sent to -# the application as a DATA token. No idea how to easily fix that. Currently, -# "<![CDATA[ ]]]]>" does work correctly. 
cd0 ']' cd1; Char dataset cd0 -cd1 ']' cd2; Char dataset cd0 -cd2 ']' dataset cd2; '>' misc2; Char dataset cd0 +cd1 ']' cd2; Char datacd1 cd0 +cd2 ']' dataset cd2; '>' misc2; Char datacd2 cd0 # Doctype, starting from '<!DOCTYPE', returns to misc1 @@ -173,6 +170,6 @@ elem3 '>' selfclose misc2 attr0 Name attrname attr0; SP attrnameend attr1; '=' attrnameend attr2 attr1 SP attr1; '=' attr2 attr2 SP attr2; '\''|'"' $quote attr3 -attr3 AttValue attrvalset attr3; '&' refstart attr4; $quote attrvalend elem2 +attr3 AttValue dataattr attr3; '&' refstart attr4; $quote attrvalend elem2 attr4 Ref ref attr4; '\x3b' refend attr3 @@ -128,24 +128,32 @@ static inline int yxml_dataset(yxml_t *x, unsigned ch) { } -static inline int yxml_datahold(yxml_t *x, unsigned ch) { - yxml_setchar(x->data, ch); - x->data[1] = 0; - return YXML_OK; +static inline int yxml_datapi(yxml_t *x, unsigned ch) { + x->data[0] = '?'; + yxml_setchar(x->data+1, ch); + x->data[2] = 0; + return YXML_DATA; } -static inline int yxml_datarelease(yxml_t *x, unsigned ch) { - char *r = x->data; - while(*r) - r++; - yxml_setchar(r, ch); - r[1] = 0; +static inline int yxml_datacd1(yxml_t *x, unsigned ch) { + x->data[0] = ']'; + yxml_setchar(x->data+1, ch); + x->data[2] = 0; return YXML_DATA; } -static inline int yxml_attrvalset(yxml_t *x, unsigned ch) { +static inline int yxml_datacd2(yxml_t *x, unsigned ch) { + x->data[0] = ']'; + x->data[1] = ']'; + yxml_setchar(x->data+2, ch); + x->data[3] = 0; + return YXML_DATA; +} + + +static inline int yxml_dataattr(yxml_t *x, unsigned ch) { /* Normalize attribute values according to the XML spec section 3.3.3. */ return yxml_dataset(x, ch == 0x9 || ch == 0xa ? 
0x20 : ch); } @@ -357,7 +365,7 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) { break; case YXMLS_attr3: if(yxml_isAttValue(ch)) - return yxml_attrvalset(x, ch); + return yxml_dataattr(x, ch); if(ch == (unsigned char)'&') { x->state = YXMLS_attr4; return yxml_refstart(x, ch); @@ -390,7 +398,7 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) { } if(yxml_isChar(ch)) { x->state = YXMLS_cd0; - return yxml_dataset(x, ch); + return yxml_datacd1(x, ch); } break; case YXMLS_cd2: @@ -402,7 +410,7 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) { } if(yxml_isChar(ch)) { x->state = YXMLS_cd0; - return yxml_dataset(x, ch); + return yxml_datacd2(x, ch); } break; case YXMLS_comment0: @@ -792,7 +800,7 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) { case YXMLS_pi2: if(ch == (unsigned char)'?') { x->state = YXMLS_pi3; - return yxml_datahold(x, ch); + return YXML_OK; } if(yxml_isChar(ch)) return yxml_dataset(x, ch); @@ -804,7 +812,7 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) { } if(yxml_isChar(ch)) { x->state = YXMLS_pi2; - return yxml_datarelease(x, ch); + return yxml_datapi(x, ch); } break; case YXMLS_pi4: @@ -64,24 +64,32 @@ static inline int yxml_dataset(yxml_t *x, unsigned ch) { } -static inline int yxml_datahold(yxml_t *x, unsigned ch) { - yxml_setchar(x->data, ch); - x->data[1] = 0; - return YXML_OK; +static inline int yxml_datapi(yxml_t *x, unsigned ch) { + x->data[0] = '?'; + yxml_setchar(x->data+1, ch); + x->data[2] = 0; + return YXML_DATA; +} + + +static inline int yxml_datacd1(yxml_t *x, unsigned ch) { + x->data[0] = ']'; + yxml_setchar(x->data+1, ch); + x->data[2] = 0; + return YXML_DATA; } -static inline int yxml_datarelease(yxml_t *x, unsigned ch) { - char *r = x->data; - while(*r) - r++; - yxml_setchar(r, ch); - r[1] = 0; +static inline int yxml_datacd2(yxml_t *x, unsigned ch) { + x->data[0] = ']'; + x->data[1] = ']'; + yxml_setchar(x->data+2, ch); + x->data[3] = 0; return YXML_DATA; } -static inline int yxml_attrvalset(yxml_t *x, unsigned ch) { +static inline int 
yxml_dataattr(yxml_t *x, unsigned ch) { /* Normalize attribute values according to the XML spec section 3.3.3. */ return yxml_dataset(x, ch == 0x9 || ch == 0xa ? 0x20 : ch); } @@ -77,9 +77,10 @@ typedef struct { /* The last read character(s) of an attribute value, element data, or * processing instruction. Changed after YXML_DATA and only valid until the * next yxml_parse() call. Usually, this string only consists of a single - * character, but multiple characters may be returned in the following case: - * - "<?SomePI ?x ?>": The two characters "?x" are returned in a single - * data token. + * character, but multiple characters are returned in the following cases: + * - "<?SomePI ?x ?>": The two characters "?x" + * - "<![CDATA[ ]x ]]>": The two characters "]x" + * - "<![CDATA[ ]]x ]]>": The three characters "]]x" */ char data[8]; |