summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYorhel <git@yorhel.nl>2013-05-26 11:23:02 +0200
committerYorhel <git@yorhel.nl>2013-05-26 11:29:23 +0200
commit8aa2e407c7b357248c9a7cbaf124f26f5d327170 (patch)
tree1a46d1fb705ecba06924606da0de8386845023fa
parent33b664657033beb33d050c2ef5a6cb1d6b77adff (diff)
yuri: Rewrite query string parser to be smaller and more flexible
This is a much better API. You can only iterate once over a single string, but multiple iterations with the previous API weren't going to be fast either way, so we might as well enforce a string copy if you want that. This new API allows more not-so-very-well-formed query strings, too, such as multiple ";&;&" sequences and empty/absent keys/values. Usually the goal is merely to extract information out of a URI, in which case such oddities can be safely ignored. In case the application wishes to disallow or validate such URIs, the new API can still be used for that but requires a bit more work.
-rw-r--r--test/yuri.c53
-rw-r--r--yuri.c76
-rw-r--r--yuri.h73
3 files changed, 76 insertions, 126 deletions
diff --git a/test/yuri.c b/test/yuri.c
index 5f6246e..955e0a3 100644
--- a/test/yuri.c
+++ b/test/yuri.c
@@ -237,62 +237,49 @@ static void t_parse() {
-#define F(s) do {\
- char *buf = strdup(s);\
- yuri_query_t q;\
- assert(yuri_query_parse(buf, &q) == -1);\
- free(buf);\
- } while(0)
-
#define T(s, ...) do {\
char *buf = strdup(s);\
char *args[] = {__VA_ARGS__};\
- char *key, *value;\
+ char *key, *value, *str = buf;\
size_t i;\
- yuri_query_t q;\
- assert(yuri_query_parse(buf, &q) == 0);\
- assert(q.n == sizeof(args)/sizeof(*args)/2);\
- assert(q.next == buf);\
for(i=0; i<sizeof(args)/sizeof(*args); i+=2) {\
- assert(yuri_query_next(&q, &key, &value) == 1);\
+ assert(yuri_query_parse(&str, &key, &value) == 1);\
assert(strcmp(key, args[i]) == 0);\
assert(strcmp(value, args[i+1]) == 0);\
}\
- assert(yuri_query_next(&q, &key, &value) == 0);\
+ assert(yuri_query_parse(&str, &key, &value) == 0);\
free(buf);\
} while(0)
static void t_query() {
{ /* Should handle NULL */
- yuri_query_t q;
- char *key, *value;
- assert(yuri_query_parse(NULL, &q) == 0);
- assert(q.n == 0 && q.next == NULL);
- assert(yuri_query_next(&q, &key, &value) == 0);
+ char *buf = NULL, *key, *value;
+ assert(yuri_query_parse(&buf, &key, &value) == 0);
}
- F("a");
- F("abc=");
- F("=abc");
- F("a=b;a");
- F("a=b;a=");
- F("a=b;=a");
- F("&");
- F(";");
- F("&abc=val");
- F("abc&k=v");
- F("ab=&k=v");
- F("a=b&&k=v");
- F("a=b;;k=v");
T("",);
+ T("a", "a", "");
T("k=v", "k", "v");
T("key=value", "key", "value");
T("%20=%6a", "\x20", "\x6a");
T("k=v;k=v&k=v", "k", "v", "k", "v", "k", "v");
T("a+b=b+a", "a b", "b a");
+ T("key=value1=value2", "key", "value1=value2");
+ T("====", "", "==="); /* Query strings can be odd... */
+ T("abc=", "abc", "");
+ T("=abc", "", "abc");
+ T("a=b;a", "a", "b", "a", "");
+ T("a=b;a=", "a", "b", "a", "");
+ T("a=b;=a", "a", "b", "", "a");
+ T("&", "", "");
+ T(";", "", "");
+ T("&abc=val", "", "", "abc", "val");
+ T("abc&k=v", "abc", "", "k", "v");
+ T("ab=&k=v", "ab", "", "k", "v");
+ T("a=b&&k=v", "a", "b", "", "", "k", "v");
+ T("a=b;;k=v", "a", "b", "", "", "k", "v");
}
-#undef F
#undef T
diff --git a/yuri.c b/yuri.c
index 9c2f7f5..27626f6 100644
--- a/yuri.c
+++ b/yuri.c
@@ -255,8 +255,7 @@ char *yuri_unescape(char *str) {
/* Special unescape function for the query string. Differs from yuri_unescape()
- * in that it converts '+' to a space and that it zeros out any bytes that
- * remain if the string has shrunk (necessary for yuri_query_next()). */
+ * in that it also converts '+' to a space. */
static char *yuri__query_unescape(char *str) {
unsigned char *src = (unsigned char *)str, *dest = (unsigned char *)str;
while(*src) {
@@ -272,61 +271,38 @@ static char *yuri__query_unescape(char *str) {
*(dest++) = (y_hexval(src[1])<<4) | y_hexval(src[2]);
src += 3;
}
- while(dest <= src)
- *(dest++) = 0;
+ *dest = 0;
return str;
}
-int yuri_query_parse(char *str, yuri_query_t *q) {
- q->n = 0;
- q->next = str;
- if(!str)
+int yuri_query_parse(char **str, char **key, char **value) {
+ if(!str || !*str || !**str)
return 0;
- char *sep;
- while(*str) {
- /* Key */
- sep = str;
- while(*sep && *sep != '=' && *sep != ';' && *sep != '&')
- sep++;
- if(!*sep || *sep == ';' || *sep == '&' || sep == str)
- return -1;
- *(sep++) = 0;
- yuri__query_unescape(str);
- str = sep;
-
- /* Value */
- while(*sep && *sep != ';' && *sep != '&')
- sep++;
- if(sep == str)
- return -1;
- if(*sep)
- *(sep++) = 0;
- yuri__query_unescape(str);
- q->n++;
- str = sep;
+ /* Key */
+ char *sep = *str;
+ while(*sep && *sep != '=' && *sep != ';' && *sep != '&')
+ sep++;
+ if(!*sep || *sep == ';' || *sep == '&') { /* No value */
+ *key = *str;
+ *value = sep;
+ *str = *sep ? sep+1 : sep;
+ *sep = 0;
+ yuri__query_unescape(*key);
+ return 1;
}
-
- return 0;
-}
-
-
-static void yuri__query_advance(yuri_query_t *q) {
- q->next += strlen(q->next);
- while(!*q->next)
- q->next++;
-}
-
-
-int yuri_query_next(yuri_query_t *q, char **key, char **value) {
- if(!q->n)
- return 0;
- *key = q->next;
- yuri__query_advance(q);
- *value = q->next;
- if(--q->n)
- yuri__query_advance(q);
+ *(sep++) = 0;
+ *key = *str;
+ yuri__query_unescape(*key);
+
+ /* Value */
+ *value = sep;
+ while(*sep && *sep != ';' && *sep != '&')
+ sep++;
+ *str = *sep ? sep+1 : sep;
+ *sep = 0;
+ yuri__query_unescape(*value);
return 1;
}
diff --git a/yuri.h b/yuri.h
index fe0cc2d..77b8ccb 100644
--- a/yuri.h
+++ b/yuri.h
@@ -87,7 +87,12 @@ typedef struct {
* yuri_parse(), or a newly created buffer in the case of
* yuri_parse_copy(). */
char *buf;
- /* All the pointers below point into the *buf memory. */
+ /* All the pointers below point into the *buf memory.
+ *
+ * TODO: Instead of setting these pointers to NULL if the part is absent,
+ * doesn't it make sense to just set them to an empty string? Is the
+ * differentiation between "absent" and "present but empty" really useful?
+ */
/* NULL if there was no scheme in the URI. Uppercase characters (A-Z) are
* automatically converted to lowercase (a-z). */
@@ -118,7 +123,7 @@ typedef struct {
* encoding rules. If you just want their unescaped string representation,
* you can always use yuri_unescape() on these fields. If you know that the
* query string is in key=value format (most common), use the
- * yuri_query_parse() to parse it. */
+ * yuri_query_parse() function to parse it. */
char *path;
char *query;
char *fragment;
@@ -178,59 +183,41 @@ char *yuri_unescape(char *str);
/* Simple query string parser. Parses both "a=b&c=d", "a=b;c=d" and a mixture
- * of the two styles. This API is used as follows:
+ * of the two styles. This function is used as follows:
*
* yuri_t uri;
- * yuri_query_t q;
- * if(yuri_parse(str, &uri) || yuri_query_parse(uri->query, &q))
+ * if(yuri_parse(str, &uri))
* // handle error
*
* char *key, *value;
- * while(yuri_query_next(&q, &key, &value)) {
- * // Do something
+ * while(yuri_query_parse(&uri.query, &key, &value)) {
+ * // Do something with key and value
* }
- */
-typedef struct {
- char *next; /* Pointer to the next key returned by _next. */
- size_t n; /* Number of key/value pairs left */
-} yuri_query_t;
-
-
-/* Parses and validates a key=value-style query string. The given string is
- * modified in-place. On success, it writes an iterator into the given
- * yuri_query_t object and returns 0. Returns -1 on failure, after which both
- * str and q may have been modified and may contain garbage. str may be NULL,
- * in which case is it considered equivalent to an empty string, which is an
- * empty query string with 0 key/value pairs.
*
- * This function only returns an error on things like empty keys ("=abc"),
- * empty pairs ("&&"), empty values ("key=") or absent values ("key"). The '+'
- * character is converted to a space in both keys and values. The same key may
- * appear multiple times. TODO: Add support absent or empty values.
+ * This function takes a pointer to a query string buffer as argument, parses
+ * one key/value pair, stores pointers into this buffer in *key and *value, and
+ * advances the *str pointer to the next pair, or to the end of the string if
+ * there is no next pair. Returns 1 if a key/value pair has been extracted, 0
+ * if !*str || !**str.
+ *
+ * The given *str is modified in-place. If you wish to re-use the query string
+ * later on or want to iterate multiple times over the same query string, you
+ * need to make a copy of the string (e.g. with strdup()) and iterate over
+ * that.
+ *
+ * The strings returned in *key and *value are unescaped, as in
+ * yuri_unescape(). Additionally, the '+' character is converted into a space
+ * as well. Both the key and value can be set to an empty string. This happens
+ * for empty pairs ("&&" or "&=&"), empty keys ("=abc") or empty/absent values
+ * ("abc" or "abc=").
*
* IMPORTANT: The given string is assumed to contain valid URI escapes, as in
* yuri_validate_escape(), so run that function first if the string comes from
* an untrusted source.
*
- * IMPORTANT#2: You should only call this function on the same string once, and
- * you should not have called yuri_unescape() on that string (unless, of
- * course, you want to parse a query string encoded inside a query string, or
- * whatever such scheme you may use).
- *
- * You can re-use the same iterator multiple times by making a copy of the
- * struct before calling yuri_query_next() for the first time. */
-int yuri_query_parse(char *str, yuri_query_t *q);
-
-
-/* Get the next key/value pair from the yuri_query_t iterator. Returns 0 if
- * there are no more pairs, 1 otherwise.
- *
- * The keys and values are returned in their unescaped form, so no further
- * calls to yuri_unescape() are necessary. The key and value strings point into
- * the string buffer given to yuri_query_parse().
- *
- * The IMPORTANT#3 note of yuri_unescape() applies here, too. */
-int yuri_query_next(yuri_query_t *q, char **key, char **value);
+ * The IMPORTANT#3 note of yuri_unescape() applies here, too.
+ */
+int yuri_query_parse(char **str, char **key, char **value);
#endif