diff options
author | Yorhel <git@yorhel.nl> | 2013-05-26 11:23:02 +0200 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2013-05-26 11:29:23 +0200 |
commit | 8aa2e407c7b357248c9a7cbaf124f26f5d327170 (patch) | |
tree | 1a46d1fb705ecba06924606da0de8386845023fa | |
parent | 33b664657033beb33d050c2ef5a6cb1d6b77adff (diff) |
yuri: Rewrite query string parser to be smaller and more flexible
This is a much better API. You can only iterate once over a single
string, but multiple iterations with the previous API weren't going to be
fast either way, so might as well enforce a string copy if you want that.
This new API allows more not-so-very-well-formed query strings, too,
such as multiple ";&;&" sequences and empty/absent keys/values. Usually
the goal is merely to extract information out of a URI, in which case
such oddities can be safely ignored. In case the application wishes to
disallow or validate such URIs, the new API can still be used for that
but requires a bit more work.
-rw-r--r-- | test/yuri.c | 53 | ||||
-rw-r--r-- | yuri.c | 76 | ||||
-rw-r--r-- | yuri.h | 73 |
3 files changed, 76 insertions, 126 deletions
diff --git a/test/yuri.c b/test/yuri.c index 5f6246e..955e0a3 100644 --- a/test/yuri.c +++ b/test/yuri.c @@ -237,62 +237,49 @@ static void t_parse() { -#define F(s) do {\ - char *buf = strdup(s);\ - yuri_query_t q;\ - assert(yuri_query_parse(buf, &q) == -1);\ - free(buf);\ - } while(0) - #define T(s, ...) do {\ char *buf = strdup(s);\ char *args[] = {__VA_ARGS__};\ - char *key, *value;\ + char *key, *value, *str = buf;\ size_t i;\ - yuri_query_t q;\ - assert(yuri_query_parse(buf, &q) == 0);\ - assert(q.n == sizeof(args)/sizeof(*args)/2);\ - assert(q.next == buf);\ for(i=0; i<sizeof(args)/sizeof(*args); i+=2) {\ - assert(yuri_query_next(&q, &key, &value) == 1);\ + assert(yuri_query_parse(&str, &key, &value) == 1);\ assert(strcmp(key, args[i]) == 0);\ assert(strcmp(value, args[i+1]) == 0);\ }\ - assert(yuri_query_next(&q, &key, &value) == 0);\ + assert(yuri_query_parse(&str, &key, &value) == 0);\ free(buf);\ } while(0) static void t_query() { { /* Should handle NULL */ - yuri_query_t q; - char *key, *value; - assert(yuri_query_parse(NULL, &q) == 0); - assert(q.n == 0 && q.next == NULL); - assert(yuri_query_next(&q, &key, &value) == 0); + char *buf = NULL, *key, *value; + assert(yuri_query_parse(&buf, &key, &value) == 0); } - F("a"); - F("abc="); - F("=abc"); - F("a=b;a"); - F("a=b;a="); - F("a=b;=a"); - F("&"); - F(";"); - F("&abc=val"); - F("abc&k=v"); - F("ab=&k=v"); - F("a=b&&k=v"); - F("a=b;;k=v"); T("",); + T("a", "a", ""); T("k=v", "k", "v"); T("key=value", "key", "value"); T("%20=%6a", "\x20", "\x6a"); T("k=v;k=v&k=v", "k", "v", "k", "v", "k", "v"); T("a+b=b+a", "a b", "b a"); + T("key=value1=value2", "key", "value1=value2"); + T("====", "", "==="); /* Query strings can be odd... 
*/ + T("abc=", "abc", ""); + T("=abc", "", "abc"); + T("a=b;a", "a", "b", "a", ""); + T("a=b;a=", "a", "b", "a", ""); + T("a=b;=a", "a", "b", "", "a"); + T("&", "", ""); + T(";", "", ""); + T("&abc=val", "", "", "abc", "val"); + T("abc&k=v", "abc", "", "k", "v"); + T("ab=&k=v", "ab", "", "k", "v"); + T("a=b&&k=v", "a", "b", "", "", "k", "v"); + T("a=b;;k=v", "a", "b", "", "", "k", "v"); } -#undef F #undef T @@ -255,8 +255,7 @@ char *yuri_unescape(char *str) { /* Special unescape function for the query string. Differs from yuri_unescape() - * in that it converts '+' to a space and that it zeros out any bytes that - * remain if the string has shrunk (necessary for yuri_query_next()). */ + * in that it also converts '+' to a space. */ static char *yuri__query_unescape(char *str) { unsigned char *src = (unsigned char *)str, *dest = (unsigned char *)str; while(*src) { @@ -272,61 +271,38 @@ static char *yuri__query_unescape(char *str) { *(dest++) = (y_hexval(src[1])<<4) | y_hexval(src[2]); src += 3; } - while(dest <= src) - *(dest++) = 0; + *dest = 0; return str; } -int yuri_query_parse(char *str, yuri_query_t *q) { - q->n = 0; - q->next = str; - if(!str) +int yuri_query_parse(char **str, char **key, char **value) { + if(!str || !*str || !**str) return 0; - char *sep; - while(*str) { - /* Key */ - sep = str; - while(*sep && *sep != '=' && *sep != ';' && *sep != '&') - sep++; - if(!*sep || *sep == ';' || *sep == '&' || sep == str) - return -1; - *(sep++) = 0; - yuri__query_unescape(str); - str = sep; - - /* Value */ - while(*sep && *sep != ';' && *sep != '&') - sep++; - if(sep == str) - return -1; - if(*sep) - *(sep++) = 0; - yuri__query_unescape(str); - q->n++; - str = sep; + /* Key */ + char *sep = *str; + while(*sep && *sep != '=' && *sep != ';' && *sep != '&') + sep++; + if(!*sep || *sep == ';' || *sep == '&') { /* No value */ + *key = *str; + *value = sep; + *str = *sep ? 
sep+1 : sep; + *sep = 0; + yuri__query_unescape(*key); + return 1; } - - return 0; -} - - -static void yuri__query_advance(yuri_query_t *q) { - q->next += strlen(q->next); - while(!*q->next) - q->next++; -} - - -int yuri_query_next(yuri_query_t *q, char **key, char **value) { - if(!q->n) - return 0; - *key = q->next; - yuri__query_advance(q); - *value = q->next; - if(--q->n) - yuri__query_advance(q); + *(sep++) = 0; + *key = *str; + yuri__query_unescape(*key); + + /* Value */ + *value = sep; + while(*sep && *sep != ';' && *sep != '&') + sep++; + *str = *sep ? sep+1 : sep; + *sep = 0; + yuri__query_unescape(*value); return 1; } @@ -87,7 +87,12 @@ typedef struct { * yuri_parse(), or a newly created buffer in the case of * yuri_parse_copy(). */ char *buf; - /* All the pointers below point into the *buf memory. */ + /* All the pointers below point into the *buf memory. + * + * TODO: Instead of setting these pointers to NULL if the part is absent, + * doesn't it make sense to just set them to an empty string? Is the + * differentiation between "absent" and "present but empty" really useful? + */ /* NULL if there was no scheme in the URI. Uppercase characters (A-Z) are * automatically converted to lowercase (a-z). */ @@ -118,7 +123,7 @@ typedef struct { * encoding rules. If you just want their unescaped string representation, * you can always use yuri_unescape() on these fields. If you know that the * query string is in key=value format (most common), use the - * yuri_query_parse() to parse it. */ + * yuri_query_parse() function to parse it. */ char *path; char *query; char *fragment; @@ -178,59 +183,41 @@ char *yuri_unescape(char *str); /* Simple query string parser. Parses both "a=b&c=d", "a=b;c=d" and a mixture - * of the two styles. This API is used as follows: + * of the two styles. 
This function is used as follows: * * yuri_t uri; - * yuri_query_t q; - * if(yuri_parse(str, &uri) || yuri_query_parse(uri->query, &q)) + * if(yuri_parse(str, &uri)) * // handle error * * char *key, *value; - * while(yuri_query_next(&q, &key, &value)) { - * // Do something + * while(yuri_query_parse(&uri.query, &key, &value)) { + * // Do something with key and value * } - */ -typedef struct { - char *next; /* Pointer to the next key returned by _next. */ - size_t n; /* Number of key/value pairs left */ -} yuri_query_t; - - -/* Parses and validates a key=value-style query string. The given string is - * modified in-place. On success, it writes an iterator into the given - * yuri_query_t object and returns 0. Returns -1 on failure, after which both - * str and q may have been modified and may contain garbage. str may be NULL, - * in which case is it considered equivalent to an empty string, which is an - * empty query string with 0 key/value pairs. * - * This function only returns an error on things like empty keys ("=abc"), - * empty pairs ("&&"), empty values ("key=") or absent values ("key"). The '+' - * character is converted to a space in both keys and values. The same key may - * appear multiple times. TODO: Add support absent or empty values. + * This function takes a pointer to a query string buffer as argument, parses + * one key/value pair, stores pointers into this buffer in *key and *value, and + * advances the *str pointer to the next pair, or to the end of the string if + * there is no next pair. Returns 1 if a key/value pair has been extracted, 0 + * if !*str || !**str. + * + * The given *str is modified in-place. If you wish to re-use the query string + * later on or want to iterate multiple times over the same query string, you + * need to make a copy of the string (e.g. with strdup()) and iterate over + * that. + * + * The strings returned in *key and *value are unescaped, as in + * yuri_unescape(). 
Additionally, the '+' character is converted into a space + * as well. Both the key and value can be set to an empty string. This happens + * for empty pairs ("&&" or "&=&"), empty keys ("=abc") or empty/absent values + * ("abc" or "abc="). * * IMPORTANT: The given string is assumed to contain valid URI escapes, as in * yuri_validate_escape(), so run that function first if the string comes from * an untrusted source. * - * IMPORTANT#2: You should only call this function on the same string once, and - * you should not have called yuri_unescape() on that string (unless, of - * course, you want to parse a query string encoded inside a query string, or - * whatever such scheme you may use). - * - * You can re-use the same iterator multiple times by making a copy of the - * struct before calling yuri_query_next() for the first time. */ -int yuri_query_parse(char *str, yuri_query_t *q); - - -/* Get the next key/value pair from the yuri_query_t iterator. Returns 0 if - * there are no more pairs, 1 otherwise. - * - * The keys and values are returned in their unescaped form, so no further - * calls to yuri_unescape() are necessary. The key and value strings point into - * the string buffer given to yuri_query_parse(). - * - * The IMPORTANT#3 note of yuri_unescape() applies here, too. */ -int yuri_query_next(yuri_query_t *q, char **key, char **value); + * The IMPORTANT#3 note of yuri_unescape() applies here, too. + */ +int yuri_query_parse(char **str, char **key, char **value); #endif |