yuri: Rewrite query string parser to be smaller and more flexible

This is a much better API. You can only iterate once over a single string, but multiple iterations with the previous API weren't going to be fast either way, so we might as well enforce a string copy if you want that. This new API allows more not-so-very-well-formed query strings, too, such as multiple ";&;&" sequences and empty/absent keys/values. Usually the goal is merely to extract information out of a URI, in which case such oddities can be safely ignored. In case the application wishes to disallow or validate such URIs, the new API can still be used for that but requires a bit more work.
author: Yorhel <git@yorhel.nl> 2013-05-26 11:23:02 +0200
committer: Yorhel <git@yorhel.nl> 2013-05-26 11:29:23 +0200
commit: 8aa2e407c7b357248c9a7cbaf124f26f5d327170 (patch)
tree: 1a46d1fb705ecba06924606da0de8386845023fa
parent: 33b664657033beb33d050c2ef5a6cb1d6b77adff (diff)
3 files changed, 76 insertions, 126 deletions
diff --git a/test/yuri.c b/test/yuri.c
index 5f6246e..955e0a3 100644
--- a/test/yuri.c
+++ b/test/yuri.c
@@ -237,62 +237,49 @@ static void t_parse() {
 
 
 
-#define F(s) do {\
-		char *buf = strdup(s);\
-		yuri_query_t q;\
-		assert(yuri_query_parse(buf, &q) == -1);\
-		free(buf);\
-	} while(0)
-
 #define T(s, ...) do {\
 		char *buf = strdup(s);\
 		char *args[] = {__VA_ARGS__};\
-		char *key, *value;\
+		char *key, *value, *str = buf;\
 		size_t i;\
-		yuri_query_t q;\
-		assert(yuri_query_parse(buf, &q) == 0);\
-		assert(q.n == sizeof(args)/sizeof(*args)/2);\
-		assert(q.next == buf);\
 		for(i=0; i<sizeof(args)/sizeof(*args); i+=2) {\
-			assert(yuri_query_next(&q, &key, &value) == 1);\
+			assert(yuri_query_parse(&str, &key, &value) == 1);\
 			assert(strcmp(key, args[i]) == 0);\
 			assert(strcmp(value, args[i+1]) == 0);\
 		}\
-		assert(yuri_query_next(&q, &key, &value) == 0);\
+		assert(yuri_query_parse(&str, &key, &value) == 0);\
 		free(buf);\
 	} while(0)
 
 static void t_query() {
 	{ /* Should handle NULL */
-		yuri_query_t q;
-		char *key, *value;
-		assert(yuri_query_parse(NULL, &q) == 0);
-		assert(q.n == 0 && q.next == NULL);
-		assert(yuri_query_next(&q, &key, &value) == 0);
+		char *buf = NULL, *key, *value;
+		assert(yuri_query_parse(&buf, &key, &value) == 0);
 	}
 
-	F("a");
-	F("abc=");
-	F("=abc");
-	F("a=b;a");
-	F("a=b;a=");
-	F("a=b;=a");
-	F("&");
-	F(";");
-	F("&abc=val");
-	F("abc&k=v");
-	F("ab=&k=v");
-	F("a=b&&k=v");
-	F("a=b;;k=v");
 	T("",);
+	T("a", "a", "");
 	T("k=v", "k", "v");
 	T("key=value", "key", "value");
 	T("%20=%6a", "\x20", "\x6a");
 	T("k=v;k=v&k=v", "k", "v", "k", "v", "k", "v");
 	T("a+b=b+a", "a b", "b a");
+	T("key=value1=value2", "key", "value1=value2");
+	T("====", "", "==="); /* Query strings can be odd... */
+	T("abc=", "abc", "");
+	T("=abc", "", "abc");
+	T("a=b;a",  "a", "b", "a", "");
+	T("a=b;a=", "a", "b", "a", "");
+	T("a=b;=a", "a", "b", "", "a");
+	T("&", "", "");
+	T(";", "", "");
+	T("&abc=val", "", "", "abc", "val");
+	T("abc&k=v", "abc", "", "k", "v");
+	T("ab=&k=v", "ab", "", "k", "v");
+	T("a=b&&k=v", "a", "b", "", "", "k", "v");
+	T("a=b;;k=v", "a", "b", "", "", "k", "v");
 }
 
-#undef F
 #undef T
 
 
diff --git a/yuri.c b/yuri.c
index 9c2f7f5..27626f6 100644
--- a/yuri.c
+++ b/yuri.c
@@ -255,8 +255,7 @@ char *yuri_unescape(char *str) {
 
 
 /* Special unescape function for the query string. Differs from yuri_unescape()
- * in that it converts '+' to a space and that it zeros out any bytes that
- * remain if the string has shrunk (necessary for yuri_query_next()). */
+ * in that it also converts '+' to a space. */
 static char *yuri__query_unescape(char *str) {
 	unsigned char *src = (unsigned char *)str, *dest = (unsigned char *)str;
 	while(*src) {
@@ -272,61 +271,38 @@ static char *yuri__query_unescape(char *str) {
 		*(dest++) = (y_hexval(src[1])<<4) | y_hexval(src[2]);
 		src += 3;
 	}
-	while(dest <= src)
-		*(dest++) = 0;
+	*dest = 0;
 	return str;
 }
 
 
-int yuri_query_parse(char *str, yuri_query_t *q) {
-	q->n = 0;
-	q->next = str;
-	if(!str)
+int yuri_query_parse(char **str, char **key, char **value) {
+	if(!str || !*str || !**str)
 		return 0;
 
-	char *sep;
-	while(*str) {
-		/* Key */
-		sep = str;
-		while(*sep && *sep != '=' && *sep != ';' && *sep != '&')
-			sep++;
-		if(!*sep || *sep == ';' || *sep == '&' || sep == str)
-			return -1;
-		*(sep++) = 0;
-		yuri__query_unescape(str);
-		str = sep;
-
-		/* Value */
-		while(*sep && *sep != ';' && *sep != '&')
-			sep++;
-		if(sep == str)
-			return -1;
-		if(*sep)
-			*(sep++) = 0;
-		yuri__query_unescape(str);
-		q->n++;
-		str = sep;
+	/* Key */
+	char *sep = *str;
+	while(*sep && *sep != '=' && *sep != ';' && *sep != '&')
+		sep++;
+	if(!*sep || *sep == ';' || *sep == '&') { /* No value */
+		*key = *str;
+		*value = sep;
+		*str = *sep ? sep+1 : sep;
+		*sep = 0;
+		yuri__query_unescape(*key);
+		return 1;
 	}
-
-	return 0;
-}
-
-
-static void yuri__query_advance(yuri_query_t *q) {
-	q->next += strlen(q->next);
-	while(!*q->next)
-		q->next++;
-}
-
-
-int yuri_query_next(yuri_query_t *q, char **key, char **value) {
-	if(!q->n)
-		return 0;
-	*key = q->next;
-	yuri__query_advance(q);
-	*value = q->next;
-	if(--q->n)
-		yuri__query_advance(q);
+	*(sep++) = 0;
+	*key = *str;
+	yuri__query_unescape(*key);
+
+	/* Value */
+	*value = sep;
+	while(*sep && *sep != ';' && *sep != '&')
+		sep++;
+	*str = *sep ? sep+1 : sep;
+	*sep = 0;
+	yuri__query_unescape(*value);
 	return 1;
 }
 
diff --git a/yuri.h b/yuri.h
index fe0cc2d..77b8ccb 100644
--- a/yuri.h
+++ b/yuri.h
@@ -87,7 +87,12 @@ typedef struct {
 	 * yuri_parse(), or a newly created buffer in the case of
 	 * yuri_parse_copy(). */
 	char *buf;
-	/* All the pointers below point into the *buf memory. */
+	/* All the pointers below point into the *buf memory.
+	 *
+	 * TODO: Instead of setting these pointers to NULL if the part is absent,
+	 * doesn't it make sense to just set them to an empty string? Is the
+	 * differentiation between "absent" and "present but empty" really useful?
+	 */
 
 	/* NULL if there was no scheme in the URI. Uppercase characters (A-Z) are
 	 * automatically converted to lowercase (a-z). */
@@ -118,7 +123,7 @@ typedef struct {
 	 * encoding rules. If you just want their unescaped string representation,
 	 * you can always use yuri_unescape() on these fields. If you know that the
 	 * query string is in key=value format (most common), use the
-	 * yuri_query_parse() to parse it. */
+	 * yuri_query_parse() function to parse it. */
 	char *path;
 	char *query;
 	char *fragment;
@@ -178,59 +183,41 @@ char *yuri_unescape(char *str);
 
 
 /* Simple query string parser. Parses both "a=b&c=d", "a=b;c=d" and a mixture
- * of the two styles. This API is used as follows:
+ * of the two styles. This function is used as follows:
  *
  *   yuri_t uri;
- *   yuri_query_t q;
- *   if(yuri_parse(str, &uri) || yuri_query_parse(uri->query, &q))
+ *   if(yuri_parse(str, &uri))
  *     // handle error
  *
  *   char *key, *value;
- *   while(yuri_query_next(&q, &key, &value)) {
- *     // Do something
+ *   while(yuri_query_parse(&uri.query, &key, &value)) {
+ *     // Do something with key and value
  *   }
- */
-typedef struct {
-	char *next; /* Pointer to the next key returned by _next. */
-	size_t n;   /* Number of key/value pairs left */
-} yuri_query_t;
-
-
-/* Parses and validates a key=value-style query string. The given string is
- * modified in-place. On success, it writes an iterator into the given
- * yuri_query_t object and returns 0. Returns -1 on failure, after which both
- * str and q may have been modified and may contain garbage. str may be NULL,
- * in which case is it considered equivalent to an empty string, which is an
- * empty query string with 0 key/value pairs.
  *
- * This function only returns an error on things like empty keys ("=abc"),
- * empty pairs ("&&"), empty values ("key=") or absent values ("key"). The '+'
- * character is converted to a space in both keys and values. The same key may
- * appear multiple times.  TODO: Add support absent or empty values.
+ * This function takes a pointer to a query string buffer as argument, parses
+ * one key/value pair, stores pointers into this buffer in *key and *value, and
+ * advances the *str pointer to the next pair, or to the end of the string if
+ * there is no next pair. Returns 1 if a key/value pair has been extracted, 0
+ * if !*str || !**str.
+ *
+ * The given *str is modified in-place. If you wish to re-use the query string
+ * later on or want to iterate multiple times over the same query string, you
+ * need to make a copy of the string (e.g. with strdup()) and iterate over
+ * that.
+ *
+ * The strings returned in *key and *value are unescaped, as in
+ * yuri_unescape(). Additionally, the '+' character is converted into a space
+ * as well. Both the key and value can be set to an empty string. This happens
+ * for empty pairs ("&&" or "&=&"), empty keys ("=abc") or empty/absent values
+ * ("abc" or "abc=").
  *
  * IMPORTANT: The given string is assumed to contain valid URI escapes, as in
  * yuri_validate_escape(), so run that function first if the string comes from
  * an untrusted source.
  *
- * IMPORTANT#2: You should only call this function on the same string once, and
- * you should not have called yuri_unescape() on that string (unless, of
- * course, you want to parse a query string encoded inside a query string, or
- * whatever such scheme you may use).
- *
- * You can re-use the same iterator multiple times by making a copy of the
- * struct before calling yuri_query_next() for the first time. */
-int yuri_query_parse(char *str, yuri_query_t *q);
-
-
-/* Get the next key/value pair from the yuri_query_t iterator. Returns 0 if
- * there are no more pairs, 1 otherwise.
- *
- * The keys and values are returned in their unescaped form, so no further
- * calls to yuri_unescape() are necessary. The key and value strings point into
- * the string buffer given to yuri_query_parse().
- *
- * The IMPORTANT#3 note of yuri_unescape() applies here, too. */
-int yuri_query_next(yuri_query_t *q, char **key, char **value);
+ * The IMPORTANT#3 note of yuri_unescape() applies here, too.
+ */
+int yuri_query_parse(char **str, char **key, char **value);
 
 #endif
author	Yorhel <git@yorhel.nl>	2013-05-26 11:23:02 +0200
committer	Yorhel <git@yorhel.nl>	2013-05-26 11:29:23 +0200
commit	8aa2e407c7b357248c9a7cbaf124f26f5d327170 (patch)
tree	1a46d1fb705ecba06924606da0de8386845023fa
parent	33b664657033beb33d050c2ef5a6cb1d6b77adff (diff)