summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Yorhel <git@yorhel.nl> 2013-09-04 16:38:48 +0200
committer: Yorhel <git@yorhel.nl> 2013-09-04 16:44:10 +0200
commit: 32eea7065c9a4e8af8d318620b338569b96bcb5f (patch)
tree: b82e321aecbb039cd0d401b51e1ff037c92a413e
parent: 3a714480dd363d96f85cb468098e108775a043eb (diff)
Normalize end-of-line sequences to a single '\n'
Decreases performance a bit and increases the size a bit. But, well, correctness is important. :-(
-rw-r--r-- yxml.c 19
-rw-r--r-- yxml.c.in 19
-rw-r--r-- yxml.h 1
3 files changed, 29 insertions, 10 deletions
diff --git a/yxml.c b/yxml.c
index 0425c8d..6d3f825 100644
--- a/yxml.c
+++ b/yxml.c
@@ -87,7 +87,8 @@ typedef enum {
#define yxml_isChar(c) 1
-#define yxml_isSP(c) (c == 0x20 || c == 0x09 || c == 0x0d || c == 0x0a)
+/* 0xd should be part of SP, too, but yxml_parse() already normalizes that into 0xa */
+#define yxml_isSP(c) (c == 0x20 || c == 0x09 || c == 0x0a)
#define yxml_isAlpha(c) ((c|32)-'a' < 26)
#define yxml_isNum(c) (c-'0' < 10)
#define yxml_isHex(c) (yxml_isNum(c) || (c|32)-'a' < 6)
@@ -280,15 +281,23 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) {
unsigned ch = (unsigned)(_ch+256) & 0xff;
if(!ch)
return YXML_ESYN;
+ x->total++;
- /* TODO: Validate UTF-8 correctness? */
-
- if(ch == '\n') {
+ /* End-of-Line normalization, "\rX", "\r\n" and "\n" are recognized and
+ * normalized to a single '\n' as per XML 1.0 section 2.11. XML 1.1 adds
+ * some non-ASCII character sequences to this list, but we can only handle
+ * ASCII here without making assumptions about the input encoding. */
+ if(x->ignore == ch) {
+ x->ignore = 0;
+ return YXML_OK;
+ }
+ x->ignore = (ch == 0xd) * 0xa;
+ if(ch == 0xa || ch == 0xd) {
+ ch = 0xa;
x->line++;
x->byte = 0;
}
x->byte++;
- x->total++;
switch((yxml_state_t)x->state) {
case YXMLS_string:
diff --git a/yxml.c.in b/yxml.c.in
index e51ab08..f08b913 100644
--- a/yxml.c.in
+++ b/yxml.c.in
@@ -30,7 +30,8 @@ typedef enum {
#define yxml_isChar(c) 1
-#define yxml_isSP(c) (c == 0x20 || c == 0x09 || c == 0x0d || c == 0x0a)
+/* 0xd should be part of SP, too, but yxml_parse() already normalizes that into 0xa */
+#define yxml_isSP(c) (c == 0x20 || c == 0x09 || c == 0x0a)
#define yxml_isAlpha(c) ((c|32)-'a' < 26)
#define yxml_isNum(c) (c-'0' < 10)
#define yxml_isHex(c) (yxml_isNum(c) || (c|32)-'a' < 6)
@@ -223,15 +224,23 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) {
unsigned ch = (unsigned)(_ch+256) & 0xff;
if(!ch)
return YXML_ESYN;
+ x->total++;
- /* TODO: Validate UTF-8 correctness? */
-
- if(ch == '\n') {
+ /* End-of-Line normalization, "\rX", "\r\n" and "\n" are recognized and
+ * normalized to a single '\n' as per XML 1.0 section 2.11. XML 1.1 adds
+ * some non-ASCII character sequences to this list, but we can only handle
+ * ASCII here without making assumptions about the input encoding. */
+ if(x->ignore == ch) {
+ x->ignore = 0;
+ return YXML_OK;
+ }
+ x->ignore = (ch == 0xd) * 0xa;
+ if(ch == 0xa || ch == 0xd) {
+ ch = 0xa;
x->line++;
x->byte = 0;
}
x->byte++;
- x->total++;
switch((yxml_state_t)x->state) {
case YXMLS_string:
diff --git a/yxml.h b/yxml.h
index 70308ef..f8a4648 100644
--- a/yxml.h
+++ b/yxml.h
@@ -94,6 +94,7 @@ typedef struct {
unsigned quote;
int reflen;
int stringstate;
+ unsigned ignore;
unsigned char ref[YXML_MAX_REF+1];
unsigned char *string;
} yxml_t;