diff options
author | Yorhel <git@yorhel.nl> | 2014-08-16 16:20:54 +0200 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2014-08-16 16:20:54 +0200 |
commit | aa9f253bf60e92f8f0ad53e216a81c538091968e (patch) | |
tree | 1b0ddf1afc6a688acfbacfb2b0bf891a0fda4720 | |
parent | 31e505ca87059dd62a3ab20e8f3f635ee66f0837 (diff) |
Add UTF-8 validation to ypc_val_parse()
-rw-r--r-- | Makefile.am | 4 | ||||
-rw-r--r-- | lib/util/utf8.c | 23 | ||||
-rw-r--r-- | lib/util/utf8.h | 28 | ||||
-rw-r--r-- | lib/val/ypc_val_parse.c | 12 | ||||
-rw-r--r-- | lib/ypc.h | 1 | ||||
-rw-r--r-- | test/val_parse.c | 7 |
6 files changed, 68 insertions, 7 deletions
diff --git a/Makefile.am b/Makefile.am index 2d14a29..9d2e6f3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -27,6 +27,7 @@ libypc_la_SOURCES=\ lib/util/str2sockaddr_ip.c\ lib/util/str2sockaddr_port.c\ lib/util/tweetnacl.c\ + lib/util/utf8.c\ lib/val/ypc_val_parse.c\ lib/ypc_init.c\ lib/ypc_msg_free.c\ @@ -35,7 +36,8 @@ libypc_la_SOURCES=\ EXTRA_DIST=\ lib/internal.h\ lib/util/str2sockaddr.h\ - lib/util/tweetnacl.h + lib/util/tweetnacl.h\ + lib/util/utf8.h include_HEADERS=lib/ypc.h diff --git a/lib/util/utf8.c b/lib/util/utf8.c new file mode 100644 index 0000000..983e9d5 --- /dev/null +++ b/lib/util/utf8.c @@ -0,0 +1,23 @@ +/* Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> + * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + * + * Also see utf8.h */ +#include <stdint.h> + +const uint8_t ypc__utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; + diff --git a/lib/util/utf8.h b/lib/util/utf8.h new file mode 100644 index 0000000..b284c14 --- /dev/null +++ b/lib/util/utf8.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> + * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + * + * Minor modifications for ypc: + * - Split into .c/.h files + * - Renamed global symbols + * - Reduced state to a uint8_t (possibly making things slower, but reducing + * the parsing state of serialized values) + */ + +#include <stdint.h> + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 12 + +extern const uint8_t ypc__utf8d[]; + +static inline uint32_t ypc__utf8_decode(uint8_t* state, uint32_t* codep, uint32_t byte) { + uint32_t type = ypc__utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = ypc__utf8d[256 + *state + type]; + return *state; +} + diff --git a/lib/val/ypc_val_parse.c b/lib/val/ypc_val_parse.c index 22124b0..223f776 100644 --- a/lib/val/ypc_val_parse.c +++ b/lib/val/ypc_val_parse.c @@ -1,4 +1,5 @@ #include "../internal.h" +#include "../util/utf8.h" typedef enum { YPCP_TYPE, /* Expecting a type byte */ @@ -140,9 +141,13 @@ YPC_EXPORT int ypc_val_parse(ypc_val_parser *p, const uint8_t *buf, size_t *lenp break; case YPCP_TEXT: - /* TODO: Validate UTF-8 */ - if(*buf == 0) - r = ypc__parse_endval(p); + { + uint32_t codepoint; + if(ypc__utf8_decode(&p->utf8state, &codepoint, *buf) == UTF8_REJECT) + r = -1; + else if(*buf == 0) + r = p->utf8state == UTF8_ACCEPT ? ypc__parse_endval(p) : -1; + } break; } @@ -161,5 +166,6 @@ YPC_EXPORT void ypc_val_parse_init(ypc_val_parser *p) { p->depth = 0; p->wantval = 0; p->state = YPCP_TYPE; + p->utf8state = UTF8_ACCEPT; } @@ -46,6 +46,7 @@ typedef struct { uint32_t depth : 6; uint32_t state : 3; bool wantval : 1; + uint8_t utf8state; uint32_t pad; } ypc_val_parser; diff --git a/test/val_parse.c b/test/val_parse.c index 9b72e00..a577063 100644 --- a/test/val_parse.c +++ b/test/val_parse.c @@ -34,6 +34,7 @@ } while(0) int main(int argc, char **argv) { + assert(sizeof(ypc_val_parser) == 16); /* null */ T("\0", 1); /* false */ @@ -58,9 +59,9 @@ int main(int argc, char **argv) { T("\11Hey\0", 1); T("\11オリジナルサウンドトラック\0", 1); T("\11\xC2\xA2 \xE2\x82\xAC \xF0\xA4\xAD\xA2\0", 1); /* UTF-8 examples from Wikipedia */ - T("\11\xC2\0", 1); /* XXX: WRONG! */ - T("\11\xC0\x80\0", 1); /* XXX: WRONG! */ - T("\11\xF0\x82\xAC\0", 1); /* XXX: WRONG! Overlong (Example from wikipedia) */ + T("\11\xC2\0", -1); + T("\11\xC0\x80\0", -1); + T("\11\xF0\x82\xAC\0", -1); /* Overlong (Example from wikipedia) */ /* Arrays */ T("\12\0\14", 1); T("\12\0\1\11abc\0\14", 1); |