summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Yorhel <git@yorhel.nl> 2014-08-16 16:20:54 +0200
committer: Yorhel <git@yorhel.nl> 2014-08-16 16:20:54 +0200
commit: aa9f253bf60e92f8f0ad53e216a81c538091968e (patch)
tree: 1b0ddf1afc6a688acfbacfb2b0bf891a0fda4720
parent: 31e505ca87059dd62a3ab20e8f3f635ee66f0837 (diff)
Add UTF-8 validation to ypc_val_parse()
-rw-r--r-- Makefile.am 4
-rw-r--r-- lib/util/utf8.c 23
-rw-r--r-- lib/util/utf8.h 28
-rw-r--r-- lib/val/ypc_val_parse.c 12
-rw-r--r-- lib/ypc.h 1
-rw-r--r-- test/val_parse.c 7
6 files changed, 68 insertions, 7 deletions
diff --git a/Makefile.am b/Makefile.am
index 2d14a29..9d2e6f3 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -27,6 +27,7 @@ libypc_la_SOURCES=\
lib/util/str2sockaddr_ip.c\
lib/util/str2sockaddr_port.c\
lib/util/tweetnacl.c\
+ lib/util/utf8.c\
lib/val/ypc_val_parse.c\
lib/ypc_init.c\
lib/ypc_msg_free.c\
@@ -35,7 +36,8 @@ libypc_la_SOURCES=\
EXTRA_DIST=\
lib/internal.h\
lib/util/str2sockaddr.h\
- lib/util/tweetnacl.h
+ lib/util/tweetnacl.h\
+ lib/util/utf8.h
include_HEADERS=lib/ypc.h
diff --git a/lib/util/utf8.c b/lib/util/utf8.c
new file mode 100644
index 0000000..983e9d5
--- /dev/null
+++ b/lib/util/utf8.c
@@ -0,0 +1,23 @@
+/* Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ *
+ * Also see utf8.h */
+#include <stdint.h>
+
+const uint8_t ypc__utf8d[] = { /* entries [0..255]: input byte -> type; entries [256..363]: next state, indexed by 256 + state + type */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* types of bytes 0x00..0x1f */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* types of bytes 0x20..0x3f */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* types of bytes 0x40..0x5f */
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* types of bytes 0x60..0x7f */
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* types of bytes 0x80..0x9f */
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* types of bytes 0xa0..0xbf */
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* types of bytes 0xc0..0xdf */
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /* types of bytes 0xe0..0xff */
+
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, /* transition rows for states 0 and 12 (12 entries per row, one per type 0..11); the state-12 row only ever yields 12 */
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, /* transition rows for states 24 and 36 */
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, /* transition rows for states 48 and 60 */
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, /* transition rows for states 72 and 84 */
+ 12,36,12,12,12,12,12,12,12,12,12,12, /* transition row for state 96 */
+};
+
diff --git a/lib/util/utf8.h b/lib/util/utf8.h
new file mode 100644
index 0000000..b284c14
--- /dev/null
+++ b/lib/util/utf8.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ *
+ * Minor modifications for ypc:
+ * - Split into .c/.h files
+ * - Renamed global symbols
+ * - Reduced state to a uint8_t (possibly making things slower, but reducing
+ * the parsing state of serialized values)
+ */
+
+#include <stdint.h>
+
+#define UTF8_ACCEPT 0 /* decoder state between complete sequences; also the state a fresh decoder must start in */
+#define UTF8_REJECT 12 /* decoder state after invalid input; the transition table never leaves it */
+
+extern const uint8_t ypc__utf8d[];
+
+static inline uint32_t ypc__utf8_decode(uint8_t* state, uint32_t* codep, uint32_t byte) { /* Feeds one input byte to the decoder: updates *state and *codep in place and returns the new *state (UTF8_ACCEPT, UTF8_REJECT, or an intermediate state). */
+ uint32_t type = ypc__utf8d[byte]; /* the first 256 table entries map a byte to its type; no range check, so byte must be <= 255 */
+
+ *codep = (*state != UTF8_ACCEPT) ? /* already inside a multi-byte sequence? */
+ (byte & 0x3fu) | (*codep << 6) : /* yes: shift in the low 6 bits; this READS *codep, so the caller must preserve it between calls */
+ (0xff >> type) & (byte); /* no: start a new value, masking off the high bits according to the byte's type */
+
+ *state = ypc__utf8d[256 + *state + type]; /* the transition table follows the 256 type entries; states are multiples of 12 (row offsets) */
+ return *state;
+}
+
diff --git a/lib/val/ypc_val_parse.c b/lib/val/ypc_val_parse.c
index 22124b0..223f776 100644
--- a/lib/val/ypc_val_parse.c
+++ b/lib/val/ypc_val_parse.c
@@ -1,4 +1,5 @@
#include "../internal.h"
+#include "../util/utf8.h"
typedef enum {
YPCP_TYPE, /* Expecting a type byte */
@@ -140,9 +141,13 @@ YPC_EXPORT int ypc_val_parse(ypc_val_parser *p, const uint8_t *buf, size_t *lenp
break;
case YPCP_TEXT:
- /* TODO: Validate UTF-8 */
- if(*buf == 0)
- r = ypc__parse_endval(p);
+ {
+ uint32_t codepoint;
+ if(ypc__utf8_decode(&p->utf8state, &codepoint, *buf) == UTF8_REJECT)
+ r = -1;
+ else if(*buf == 0)
+ r = p->utf8state == UTF8_ACCEPT ? ypc__parse_endval(p) : -1;
+ }
break;
}
@@ -161,5 +166,6 @@ YPC_EXPORT void ypc_val_parse_init(ypc_val_parser *p) {
p->depth = 0;
p->wantval = 0;
p->state = YPCP_TYPE;
+ p->utf8state = UTF8_ACCEPT;
}
diff --git a/lib/ypc.h b/lib/ypc.h
index 310e99e..de2b7be 100644
--- a/lib/ypc.h
+++ b/lib/ypc.h
@@ -46,6 +46,7 @@ typedef struct {
uint32_t depth : 6;
uint32_t state : 3;
bool wantval : 1;
+ uint8_t utf8state;
uint32_t pad;
} ypc_val_parser;
diff --git a/test/val_parse.c b/test/val_parse.c
index 9b72e00..a577063 100644
--- a/test/val_parse.c
+++ b/test/val_parse.c
@@ -34,6 +34,7 @@
} while(0)
int main(int argc, char **argv) {
+ assert(sizeof(ypc_val_parser) == 16);
/* null */
T("\0", 1);
/* false */
@@ -58,9 +59,9 @@ int main(int argc, char **argv) {
T("\11Hey\0", 1);
T("\11オリジナルサウンドトラック\0", 1);
T("\11\xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\0", 1); /* UTF-8 examples from Wikipedia */
- T("\11\xC2\0", 1); /* XXX: WRONG! */
- T("\11\xC0\x80\0", 1); /* XXX: WRONG! */
- T("\11\xF0\x82\xAC\0", 1); /* XXX: WRONG! Overlong (Example from wikipedia) */
+ T("\11\xC2\0", -1);
+ T("\11\xC0\x80\0", -1);
+ T("\11\xF0\x82\xAC\0", -1); /* Overlong (Example from wikipedia) */
/* Arrays */
T("\12\0\14", 1);
T("\12\0\1\11abc\0\14", 1);