Normalize end-of-line sequences to a single '\n'

This decreases performance a bit and increases the size a bit, but, well, correctness is important. :-(
author: Yorhel <git@yorhel.nl> 2013-09-04 16:38:48 +0200
committer: Yorhel <git@yorhel.nl> 2013-09-04 16:44:10 +0200
commit: 32eea7065c9a4e8af8d318620b338569b96bcb5f (patch)
tree: b82e321aecbb039cd0d401b51e1ff037c92a413e
parent: 3a714480dd363d96f85cb468098e108775a043eb (diff)
3 files changed, 29 insertions, 10 deletions
diff --git a/yxml.c b/yxml.c
index 0425c8d..6d3f825 100644
--- a/yxml.c
+++ b/yxml.c
@@ -87,7 +87,8 @@ typedef enum {
 
 
 #define yxml_isChar(c) 1
-#define yxml_isSP(c) (c == 0x20 || c == 0x09 || c == 0x0d || c == 0x0a)
+/* 0xd should be part of SP, too, but yxml_parse() already normalizes that into 0xa */
+#define yxml_isSP(c) (c == 0x20 || c == 0x09 || c == 0x0a)
 #define yxml_isAlpha(c) ((c|32)-'a' < 26)
 #define yxml_isNum(c) (c-'0' < 10)
 #define yxml_isHex(c) (yxml_isNum(c) || (c|32)-'a' < 6)
@@ -280,15 +281,23 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) {
 	unsigned ch = (unsigned)(_ch+256) & 0xff;
 	if(!ch)
 		return YXML_ESYN;
+	x->total++;
 
-	/* TODO: Validate UTF-8 correctness? */
-
-	if(ch == '\n') {
+	/* End-of-Line normalization, "\rX", "\r\n" and "\n" are recognized and
+	 * normalized to a single '\n' as per XML 1.0 section 2.11. XML 1.1 adds
+	 * some non-ASCII character sequences to this list, but we can only handle
+	 * ASCII here without making assumptions about the input encoding. */
+	if(x->ignore == ch) {
+		x->ignore = 0;
+		return YXML_OK;
+	}
+	x->ignore = (ch == 0xd) * 0xa;
+	if(ch == 0xa || ch == 0xd) {
+		ch = 0xa;
 		x->line++;
 		x->byte = 0;
 	}
 	x->byte++;
-	x->total++;
 
 	switch((yxml_state_t)x->state) {
 	case YXMLS_string:
diff --git a/yxml.c.in b/yxml.c.in
index e51ab08..f08b913 100644
--- a/yxml.c.in
+++ b/yxml.c.in
@@ -30,7 +30,8 @@ typedef enum {
 
 
 #define yxml_isChar(c) 1
-#define yxml_isSP(c) (c == 0x20 || c == 0x09 || c == 0x0d || c == 0x0a)
+/* 0xd should be part of SP, too, but yxml_parse() already normalizes that into 0xa */
+#define yxml_isSP(c) (c == 0x20 || c == 0x09 || c == 0x0a)
 #define yxml_isAlpha(c) ((c|32)-'a' < 26)
 #define yxml_isNum(c) (c-'0' < 10)
 #define yxml_isHex(c) (yxml_isNum(c) || (c|32)-'a' < 6)
@@ -223,15 +224,23 @@ yxml_ret_t yxml_parse(yxml_t *x, int _ch) {
 	unsigned ch = (unsigned)(_ch+256) & 0xff;
 	if(!ch)
 		return YXML_ESYN;
+	x->total++;
 
-	/* TODO: Validate UTF-8 correctness? */
-
-	if(ch == '\n') {
+	/* End-of-Line normalization, "\rX", "\r\n" and "\n" are recognized and
+	 * normalized to a single '\n' as per XML 1.0 section 2.11. XML 1.1 adds
+	 * some non-ASCII character sequences to this list, but we can only handle
+	 * ASCII here without making assumptions about the input encoding. */
+	if(x->ignore == ch) {
+		x->ignore = 0;
+		return YXML_OK;
+	}
+	x->ignore = (ch == 0xd) * 0xa;
+	if(ch == 0xa || ch == 0xd) {
+		ch = 0xa;
 		x->line++;
 		x->byte = 0;
 	}
 	x->byte++;
-	x->total++;
 
 	switch((yxml_state_t)x->state) {
 	case YXMLS_string:
diff --git a/yxml.h b/yxml.h
index 70308ef..f8a4648 100644
--- a/yxml.h
+++ b/yxml.h
@@ -94,6 +94,7 @@ typedef struct {
 	unsigned quote;
 	int reflen;
 	int stringstate;
+	unsigned ignore;
 	unsigned char ref[YXML_MAX_REF+1];
 	unsigned char *string;
 } yxml_t;
author	Yorhel <git@yorhel.nl>	2013-09-04 16:38:48 +0200
committer	Yorhel <git@yorhel.nl>	2013-09-04 16:44:10 +0200
commit	32eea7065c9a4e8af8d318620b338569b96bcb5f (patch)
tree	b82e321aecbb039cd0d401b51e1ff037c92a413e
parent	3a714480dd363d96f85cb468098e108775a043eb (diff)