yuri: Partial rewrite; -IP validation; +path/query/fragment parsing

It now uses inet_pton() to validate IP addresses. The main reason I previously used my own validation functions was that inet_pton() requires the string to be zero-terminated, which wasn't the case with the previous length-restricted validation code. (Though I realize now that I could have just copied the hostname into the embedded yuri_t buffer and then used inet_pton() on that. That idea must have slipped my mind when I wrote it...) The new implementation simplifies things a bit by using in-place string modification. In order to handle arbitrarily-sized path/query/fragment components, you have to either use in-place modification or use malloc(). I decided to go for the former because it's more flexible; it can easily be used with malloc(), too. I'm not too fond of the new query string parsing API, but it'll do for now. I'll see if I can think of a better API and then rewrite it again later on. Note that this is an API- and ABI-incompatible change. If the previous yuri code worked for you, stay with that. If you want to use the new features provided by this version, make sure that every call to yuri_parse() is given a pointer to memory that may be modified, and make sure your code handles yuri_t.scheme being NULL. Any other changes should be detected by the compiler.
author: Yorhel <git@yorhel.nl> 2013-05-24 17:23:07 +0200
committer: Yorhel <git@yorhel.nl> 2013-05-24 17:59:26 +0200
commit: 33b664657033beb33d050c2ef5a6cb1d6b77adff (patch)
tree: 6c30a14323a1b26d9d78f9d139ea666c7f92dba3
parent: 1506844fabef3a698f6c7454cac932149fb075bf (diff)
4 files changed, 687 insertions, 360 deletions
diff --git a/test/Makefile b/test/Makefile
index 0edb0db..609826e 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -6,7 +6,7 @@ CFLAGS:=-Wall -Wextra -Wno-unused-parameter -O3 -g
 all: test
 
 yuri: ../yuri.c ../yuri.h yuri.c
-	$(CC) $(CFLAGS) -I.. yuri.c -o yuri
+	$(CC) $(CFLAGS) -I.. ../yuri.c yuri.c -o yuri
 
 ecbuf: ../ecbuf.h ecbuf.c
 	$(CC) $(CFLAGS) -I.. ecbuf.c -o ecbuf
@@ -15,7 +15,7 @@ evtp: ../evtp.c ../evtp.h evtp.c
 	$(CC) $(CFLAGS) -I.. ../evtp.c evtp.c -lpthread -lev -o evtp
 
 sqlasync: ../sqlasync.c ../sqlasync.h sqlasync.c
-	$(CC) $(CFLAGS) -I.. ../sqlasync.c sqlasync.c -lpthread -lsqlite3 -o sqlasync
+	$(CC) $(CFLAGS) -I.. ../sqlasync.c sqlasync.c -lrt -lpthread -lsqlite3 -o sqlasync
 
 test: yuri ecbuf evtp sqlasync
 	./yuri
diff --git a/test/yuri.c b/test/yuri.c
index 91c004c..5f6246e 100644
--- a/test/yuri.c
+++ b/test/yuri.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012 Yoran Heling
+/* Copyright (c) 2012-2013 Yoran Heling
 
   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
@@ -24,213 +24,313 @@
 #error These tests should not be compiled with -DNDEBUG!
 #endif
 
-/* Include yuri.c directly, since we're testing some static functions */
-#include "../yuri.c"
-
+#include <yuri.h>
 #include <assert.h>
 #include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 
-int main(int argc, char **argv) {
-	yuri_t addr;
-	char *str;
-
-	/* yuri__scheme() */
-#define F(s)\
-		str = s;\
-		*addr.scheme = 0;\
-		assert(yuri__scheme(str, &addr) == str);\
-		assert(!*addr.scheme);
-	F("");
-	F("a");
-	F("http");
-	F("http:");
-	F("http:/");
-	F("://");
-	F("9abc://");
-	F(".://");
-	F("abcdefghijklmnop://");
-	F("abc_d://");
-	F("abc/d://");
-#undef F
-#define T(i,o)\
-		str = i;\
-		*addr.scheme = 0;\
-		assert(yuri__scheme(str, &addr) == str + 2 + sizeof o);\
-		assert(strcmp(addr.scheme, o) == 0);
-	T("http://", "http");
-	T("abcdefghijklmno://", "abcdefghijklmno");
-	T("ADC+adCs://", "adc+adcs");
-	T("x://", "x");
-	T("x.://", "x.");
-	T("a.b+C://", "a.b+c");
-	T("http://abc", "http");
-#undef T
+static int streq(const char *a, const char *b) {
+	return (!a && !b) || (a && b && strcmp(a, b) == 0);
+}
 
-	/* yuri__port() */
-#define F(s)\
-		str = s;\
-		addr.port = 0;\
-		assert(yuri__port(str, str-1+sizeof s, &addr) == str-1+sizeof s);\
-		assert(addr.port == 0);
-	F("");
-	F("1");
-	F("15");
-	F(":");
-	F(":0");
-	F(":012");
-	F(":65536");
-	F(":111111");
-	F(":-1");
-	F(":+1");
-	F(":9a7");
-#undef F
-#define T(i,l,o)\
-		str = i;\
-		addr.port = 0;\
-		assert(yuri__port(str, str-1+sizeof i, &addr) == str-1-l+sizeof i);\
-		assert(addr.port == o);
-	T(":1", 2, 1);
-	T(":65535", 6, 65535);
-	T("abcdefg:15", 3, 15);
-#undef T
 
-	/* yuri_validate_ipv4() */
-#define T(s) assert(yuri_validate_ipv4(s, -1+sizeof s) == 0)
-#define F(s) assert(yuri_validate_ipv4(s, -1+sizeof s) == -1)
-	F("");
-	F("0");
-	F("0.0.0.0.");
-	F(".0.0.0");
-	F(".0.0.0.0");
-	F("0.0..0.0");
-	F("256.255.255.255");
-	F("0.310.0.3");
-	F("-1.0.0.1");
-	T("0.0.0.0");
-	T("1.2.3.4");
-	T("0.9.10.50");
-	T("127.0.0.1");
-	T("255.255.255.255");
-	T("249.200.199.253");
-#undef T
-#undef F
+#define F(s) do {\
+		yuri_t uri;\
+		assert(yuri_parse_copy(s, &uri) == -1);\
+	} while(0)
 
-	/* yuri_validate_ipv6() */
-#define T(s) assert(yuri_validate_ipv6(s, -1+sizeof s) == 0)
-#define F(s) assert(yuri_validate_ipv6(s, -1+sizeof s) == -1)
+#define V(vscheme, vhost, vhosttype, vport, vpath, vquery, vfragment)\
+	assert(streq(uri.scheme, vscheme));\
+	assert(streq(uri.host, vhost));\
+	assert(uri.hosttype == vhosttype);\
+	assert(uri.port == vport);\
+	assert(streq(uri.path, vpath));\
+	assert(streq(uri.query, vquery));\
+	assert(streq(uri.fragment, vfragment));
+
+#define T(s, ...) do {\
+		yuri_t uri;\
+		assert(yuri_parse_copy(s, &uri) == 0);\
+		V(__VA_ARGS__)\
+		free(uri.buf);\
+	} while(0)
+
+
+static void t_parse() {
 	F("");
-	F("0");
-	F("0:0:0:0:0:0:0");
-	F("0:0:0:0:0:0:0:");
-	F(":0:0:0:0:0:0:0");
-	F("0:0:0:0:0:0:0:0:0");
-	F("0:0:0:0:0:0:0:0::");
-	F("::0:0:0:0:0:0:0:0");
-	F("0:0:0:0::0:0:0:0");
-	F("::0:0:0:0:0:0:0:0:0");
-	F("0:0:0:0:0:0:0::0");
-	F("::0::");
-	F("0::0::0");
-	F("::12345");
-	F("::FFFG");
-	F("[::]");
-	F("-::");
-	F("::-");
-	F("::0.0.0");
-	F("0:0:0:0:0:0.0.0.0");
-	F("0:0:0:0:0:0:0:0.0.0.0");
-	F("0:0:0:0:0:0.0.0.0:0");
-	T("::");
-	T("::0");
-	T("0::");
-	T("0::0");
-	T("::FFFF:1:12:123");
-	T("0:0:0::0:0:0:0");
-	T("0::0:0:0:0:0:0");
-	T("::0:0:0:0:0:0:0");
-	T("0:0:0:0:0:0:0::");
-	T("0:0:0:0:0:0::0");
-	T("0:0:0:0:0:0:0:0");
-	T("0000:0000:0000:0000:0000:0000:0000:0000");
-	T("000:000:000:000:000:000:000:000");
-	T("00:00:00:00:00:00:00:00");
-	T("::0.0.0.0");
-	T("0:0:0:0:0:0:0.0.0.0");
-	T("::0:0:0:0:0:0.0.0.0");
+
+	/* Scheme */
+#define FS(s) F(s"host")
+#define TS(s, a) T(s"host", a, "host", YURI_DOMAIN, 0, NULL, NULL, NULL)
+	FS(":");
+	FS("://");
+	FS("//");
+	FS(":/");
+	FS("a:");
+	FS("a:/");
+	FS(".://");
+	FS("abcdefghijklmnop://");
+	FS("9abc://");
+	FS("abc_d://");
+	TS("http://", "http");
+	TS("hTtp://", "http");
+	TS("abcdefghijklmno://", "abcdefghijklmno");
+	TS("ADC+adCs://", "adc+adcs");
+	TS("x://", "x");
+	TS("x.://", "x.");
+	TS("a.b+C://", "a.b+c");
+#undef TS
+#undef FS
+
+	/* Port */
+#define FP(s) F("host:"s)
+#define TP(s, v) T("host:"s, NULL, "host", YURI_DOMAIN, v, NULL, NULL, NULL)
+	FP("");
+	FP(":");
+	FP("0");
+	FP("012");
+	FP("65536");
+	FP("111111");
+	FP("-1");
+	FP("+1");
+	FP("9a7");
+	TP("1", 1);
+	TP("15", 15);
+	TP("65535", 65535);
+#undef FP
+#undef TP
+
+	/* IPv4 */
+#define F4(s) F("abc://"s"/")
+#define T4(s) T("abc://"s"/", "abc", s, YURI_IPV4, 0, "", NULL, NULL)
+	F4("");
+	F4("0");
+	F4("0.0.0.0.");
+	F4(".0.0.0");
+	F4(".0.0.0.0");
+	F4("0.0..0.0");
+	F4("256.255.255.255");
+	F4("0.310.0.3");
+	F4("-1.0.0.1");
+	F4("10.0.a0.0");
+	T4("0.0.0.0");
+	T4("1.2.3.4");
+	T4("0.9.10.50");
+	T4("127.0.0.1");
+	T4("255.255.255.255");
+	T4("249.200.199.253");
+#undef T4
+#undef F4
+
+	/* IPv6 */
+	F("::");
+	F("::1");
+	F("::0.0.0.0");
+	F("0:0:0:0:0:0:0:0");
+#define F6(s) F("abc://["s"]/")
+#define T6(s) T("abc://["s"]/", "abc", s, YURI_IPV6, 0, "", NULL, NULL)
+	F6("0");
+	F6("0:0:0:0:0:0:0");
+	F6("0:0:0:0:0:0:0:");
+	F6(":0:0:0:0:0:0:0");
+	F6("0:0:0:0:0:0:0:0:0");
+	F6("0:0:0:0:0:0:0:0::");
+	F6("::0:0:0:0:0:0:0:0");
+	F6("0:0:0:0::0:0:0:0");
+	F6("::0:0:0:0:0:0:0:0:0");
+	F6("0:0:0:0:0:0:0::0");
+	F6("::0::");
+	F6("0::0::0");
+	F6("::12345");
+	F6("::FFFG");
+	F6("[::]");
+	F6("-::");
+	F6("::-");
+	F6("::0.0.0");
+	F6("0:0:0:0:0:0.0.0.0");
+	F6("0:0:0:0:0:0:0:0.0.0.0");
+	F6("0:0:0:0:0:0.0.0.0:0");
+	T6("::");
+	T6("::0");
+	T6("0::");
+	T6("0::0");
+	T6("::FFFF:1:12:123");
+	T6("0:0:0::0:0:0:0");
+	T6("0::0:0:0:0:0:0");
+	T6("::0:0:0:0:0:0:0");
+	T6("0:0:0:0:0:0:0::");
+	T6("0:0:0:0:0:0::0");
+	T6("0:0:0:0:0:0:0:0");
+	T6("0000:0000:0000:0000:0000:0000:0000:0000");
+	T6("000:000:000:000:000:000:000:000");
+	T6("00:00:00:00:00:00:00:00");
+	T6("::0.0.0.0");
+	T6("0:0:0:0:0:0:0.0.0.0");
+	T6("::0:0:0:0:0:0.0.0.0");
 	/* Some examples from RFC3513 */
-	T("FEDC:BA98:7654:3210:FEDC:BA98:7654:3210");
-	T("1080:0:0:0:8:800:200C:417A");
-	T("FF01:0:0:0:0:0:0:101");
-	T("0:0:0:0:0:0:0:1");
-	T("1080::8:800:200C:417A");
-	T("FF01::101");
-	T("::1");
-	T("::13.1.68.3");
-	T("::FFFF:129.144.52.38");
-#undef T
-#undef F
+	T6("FEDC:BA98:7654:3210:FEDC:BA98:7654:3210");
+	T6("1080:0:0:0:8:800:200C:417A");
+	T6("FF01:0:0:0:0:0:0:101");
+	T6("0:0:0:0:0:0:0:1");
+	T6("1080::8:800:200C:417A");
+	T6("FF01::101");
+	T6("::1");
+	T6("::13.1.68.3");
+	T6("::FFFF:129.144.52.38");
+#undef T6
+#undef F6
 
-	/* yuri__validate_dns() */
-#define T(s) assert(yuri__validate_dns(s, -1+sizeof s) == 0)
-#define F(s) assert(yuri__validate_dns(s, -1+sizeof s) == -1)
-	F("");
-	F(".");
-	F(".com.");
-	F("a_c.com");
-	F("-ac.com");
-	F("ac-.com");
-	F("com.123");
-	F("255.255.255.255");
-	F("com.1-2.3.");
-	F("abc@com");
-	F("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789012.com");
-	F("abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklm");
-	T("com");
-	T("com.");
-	T("ac.com");
-	T("a-c.com");
-	T("a--c.com");
-	T("123.com");
-	T("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ12345678901.com");
-	T("abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijkl");
-#undef T
-#undef F
+	/* Domain */
+#define FD(s) F("abc://"s"/")
+#define TD(s) T("abc://"s"/", "abc", s, YURI_DOMAIN, 0, "", NULL, NULL)
+	FD(".");
+	FD(".com.");
+	FD("a_c.com");
+	FD("-ac.com");
+	FD("ac-.com");
+	FD("com.123");
+	FD("com.1-2.3.");
+	FD("abc@com");
+	FD("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789012.com");
+	FD("abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklm");
+	TD("com");
+	TD("com.");
+	TD("ac.com");
+	TD("a-c.com");
+	TD("a--c.com");
+	TD("123.com");
+	TD("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ12345678901.com");
+	TD("abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijklmnopqrstuvwxyz.abcdefghijkl");
+#undef TD
+#undef FD
 
-	/* yuri_parse() */
-#define F(s) assert(yuri_parse(s, &addr) == -1)
-	F("");
+	/* path, query, fragment */
+#define FC(s) F("abc://domain"s)
+#define TC(s, vp, vq, vf) T("abc://domain"s, "abc", "domain", YURI_DOMAIN, 0, vp, vq, vf)
+	FC("/%0g");
+	FC("?%0g");
+	FC("#%0g");
+	FC("##");
+	TC("", NULL, NULL, NULL);
+	TC("/?#", "", "", "");
+	TC("/abc", "abc", NULL, NULL);
+	TC("?abc", NULL, "abc", NULL);
+	TC("#abc", NULL, NULL, "abc");
+	TC("/%01?%02#%03", "%01", "%02", "%03");
+	TC("/abc/?abc/?#abc/?", "abc/", "abc/?", "abc/?");
+#undef TC
+#undef FC
+
+	/* Misc */
 	F("/");
-	F("::");
-	F("abc.com:");
 	F("blicky.net ");
 	F(" blicky.net");
 	F("//blicky.net");
 	F("abcdefghijklmnop://blicky.net/");
+}
+
 #undef F
-#define T(s, sch, ho, po, re)\
-		str = s;\
-		assert(yuri_parse(str, &addr) == 0);\
-		assert(strcmp(addr.scheme, sch) == 0);\
-		assert(strcmp(addr.host, ho) == 0);\
-		assert(addr.port == po);\
-		if(!*re)\
-			assert(addr.rest == NULL);\
-		else\
-			assert(strcmp(addr.rest, re) == 0);
-	T("blicky.net", "", "blicky.net", 0, "");
-	T("http://blicky.net", "http", "blicky.net", 0, "");
-	T("http://blicky.net/", "http", "blicky.net", 0, "");
-	T("http://blicky.net/ ", "http", "blicky.net", 0, " ");
-	T("http://blicky.net/abc", "http", "blicky.net", 0, "abc");
-	T("http://blicky.net:80/abc", "http", "blicky.net", 80, "abc");
-	T("irc://127.0.0.1/channel", "irc", "127.0.0.1", 0, "channel");
-	T("[::]", "", "::", 0, "");
-	T("[::]:65530", "", "::", 65530, "");
-	T("10.0.0.1?query", "", "10.0.0.1", 0, "?query");
-	T("10.0.0.1:12?query", "", "10.0.0.1", 12, "?query");
+#undef T
+#undef V
+
+
+
+#define F(s) do {\
+		char *buf = strdup(s);\
+		yuri_query_t q;\
+		assert(yuri_query_parse(buf, &q) == -1);\
+		free(buf);\
+	} while(0)
+
+#define T(s, ...) do {\
+		char *buf = strdup(s);\
+		char *args[] = {__VA_ARGS__};\
+		char *key, *value;\
+		size_t i;\
+		yuri_query_t q;\
+		assert(yuri_query_parse(buf, &q) == 0);\
+		assert(q.n == sizeof(args)/sizeof(*args)/2);\
+		assert(q.next == buf);\
+		for(i=0; i<sizeof(args)/sizeof(*args); i+=2) {\
+			assert(yuri_query_next(&q, &key, &value) == 1);\
+			assert(strcmp(key, args[i]) == 0);\
+			assert(strcmp(value, args[i+1]) == 0);\
+		}\
+		assert(yuri_query_next(&q, &key, &value) == 0);\
+		free(buf);\
+	} while(0)
+
+static void t_query() {
+	{ /* Should handle NULL */
+		yuri_query_t q;
+		char *key, *value;
+		assert(yuri_query_parse(NULL, &q) == 0);
+		assert(q.n == 0 && q.next == NULL);
+		assert(yuri_query_next(&q, &key, &value) == 0);
+	}
+
+	F("a");
+	F("abc=");
+	F("=abc");
+	F("a=b;a");
+	F("a=b;a=");
+	F("a=b;=a");
+	F("&");
+	F(";");
+	F("&abc=val");
+	F("abc&k=v");
+	F("ab=&k=v");
+	F("a=b&&k=v");
+	F("a=b;;k=v");
+	T("",);
+	T("k=v", "k", "v");
+	T("key=value", "key", "value");
+	T("%20=%6a", "\x20", "\x6a");
+	T("k=v;k=v&k=v", "k", "v", "k", "v", "k", "v");
+	T("a+b=b+a", "a b", "b a");
+}
+
+#undef F
+#undef T
+
+
+
+int main(int argc, char **argv) {
+	/* yuri_validate_escape() */
+#define T(s) assert(yuri_validate_escape(s) == 0)
+#define F(s) assert(yuri_validate_escape(s) == -1)
+	T("");
+	T("!@#$^&*()[]{}\\|=+-_,<>./?\"';:`~ \t\n");
+	T("%01%02%03  %abx%ABy%aBz%Ab %9f %f9 %9F %F9 ");
+	F("%00");
+	F("%");
+	F("%e");
+	F("%gg");
+	F("%1G");
+	F("%G1");
+	F("abc%f");
+	F("%fgabc");
+#undef T
+#undef F
+
+	/* yuri_unescape() */
+	assert(yuri_unescape(NULL) == NULL);
+#define T(s, a) do {\
+		char *buf = strdup(s);\
+		assert(yuri_unescape(buf) == buf);\
+		assert(strcmp(buf, a) == 0);\
+		free(buf);\
+	} while(0)
+	T("", "");
+	T("abc", "abc");
+	T("%20", "\x20");
+	T("abc%A1%ab%ff%01", "abc\xa1\xab\xff\x01");
 #undef T
 
+	t_query();
+	t_parse();
 	return 0;
 }
 
diff --git a/yuri.c b/yuri.c
index 0239e33..9c2f7f5 100644
--- a/yuri.c
+++ b/yuri.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012 Yoran Heling
+/* Copyright (c) 2012-2013 Yoran Heling
 
   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
@@ -21,7 +21,11 @@
 */
 
 #include "yuri.h"
+#include <string.h>
 #include <stdlib.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
 
 
 /* The ctype.h functions are locale-dependent. We don't want that. */
@@ -33,138 +37,57 @@
 #define y_ishex(x)    (((x) >= 'a' && (x) <= 'f') || ((x) >= 'A' && (x) <= 'F') || y_isnum(x))
 #define y_isscheme(x) ((x) == '+' || (x) == '-' || (x) == '.' || y_isalnum(x))
 #define y_isdomain(x) ((x) == '-' || y_isalnum(x))
+#define y_hexval(x)   ((x) >= '0' && (x) <= '9' ? (x)-'0' : (x) >= 'A' && (x) <= 'F' ? (x)-'A'+10 : (x)-'a'+10)
 
 
-/* Copy len bytes from *src to *dest. A nil character is appended to dest, so
- * it must be large enough to hold len+1 bytes.
- * XXX: This is *not* equivalent to BSD strlcpy()! */
-static void yuri__strlcpy(char *dest, const char *src, int len) {
-	while(len-- > 0)
-		*(dest++) = *(src++);
-	*dest = 0;
-}
-
-
-/* Similar to yuri__strlcpy(), except calls y_tolower() on each character */
-static void yuri__strllower(char *dest, const char *src, int len) {
-	while(len-- > 0) {
-		*dest = y_tolower(*src);
-		src++;
-		dest++;
-	}
-	*dest = 0;
-}
-
-
-/* Parses the "<scheme>://" part and returns the pointer after the scheme.
- * Simply returns 'in' if it couldn't find a (valid) scheme. */
-static const char *yuri__scheme(const char *in, yuri_t *out) {
-	const char *end = in;
-	if(!y_isalpha(*end))
-		return in;
+/* Parses the "<scheme>://" part, if it exists, and advances the buf pointer.
+ */
+static void yuri__scheme(char **buf, yuri_t *out) {
+	const char *end = *buf;
+	if(!y_isalpha(**buf))
+		return;
 	do
 		++end;
-	while(end <= in+15 && y_isscheme(*end));
-	if(end > in+15 || *end != ':' || end[1] != '/' || end[2] != '/')
-		return in;
-	yuri__strllower(out->scheme, in, end-in);
-	return end + 3;
+	while(end <= *buf+15 && y_isscheme(*end));
+	if(end > *buf+15 || *end != ':' || end[1] != '/' || end[2] != '/')
+		return;
+	/* Valid scheme, lowercase it and advance *buf. */
+	out->scheme = *buf;
+	while(*buf != end) {
+		**buf = y_tolower(**buf);
+		(*buf)++;
+	}
+	**buf = 0;
+	*buf += 3;
 }
 
 
-/* Parses the ":<port>" part in the string pointed to by [in..end]. Returns the
- * new end of the string, or the current end if it couldn't find a (valid)
- * port string. */
-static const char *yuri__port(const char *in, const char *end, yuri_t *out) {
-	const char *nend = end-1;
+/* Parses the ":<port>" part in buf and, if it exists, sets the ':' to zero to
+ * ensure that buf is a complete host string. */
+static void yuri__port(char *buf, size_t len, yuri_t *out) {
 	uint32_t res = 0, mul = 1;
+	if(!len)
+		return;
 	/* Read backwards */
-	while(nend >= in && y_isnum(*nend)) {
+	while(--len > 0 && y_isnum(buf[len])) {
 		if(mul >= 100000)
-			return end;
-		res += mul * (*nend-'0');
+			return;
+		res += mul * (buf[len]-'0');
 		if(res > 65535)
-			return end;
+			return;
 		mul *= 10;
-		nend--;
 	}
-	if(!res || nend < in || *nend != ':' || nend[1] == '0')
-		return end;
+	if(!res || !len || buf[len] != ':' || buf[len+1] == '0')
+		return;
 	out->port = res;
-	return nend;
-}
-
-
-/* RFC3986, p. 19, IPv4address. */
-int yuri_validate_ipv4(const char *str, int len) {
-	int i;
-	for(i=0; i<4; i++) {
-		if(i) {
-			if(len < 1 || *str != '.')
-				return -1;
-			str++; len--;
-		}
-		if(len >= 3 && ((str[0] == '2' && str[1] == '5' && str[2] >= '0' && str[2] <= '5')   /* 250-255 */
-		             || (str[0] == '2' && str[1] >= '0' && str[1] <= '4' && y_isnum(str[2])) /* 200-249 */
-		             || (str[0] == '1' && y_isnum(str[1]) && y_isnum(str[2])))               /* 100-199 */
-				) {
-			str += 3; len -= 3;
-		} else if(len >= 2 && str[0] >= '1' && str[0] <= '9' && y_isnum(str[1])) { /* 10-99 */
-			str += 2; len -= 2;
-		} else if(len >= 1 && y_isnum(str[0])) { /* 0-9 */
-			str++; len--;
-		} else
-			return -1;
-	}
-	return len ? -1 : 0;
-}
-
-
-int yuri_validate_ipv6(const char *str, int len) {
-	int i, hasskip = 0;
-	if(len >= 2 && *str == ':' && str[1] == ':') {
-		hasskip = 1;
-		str += 2; len -= 2;
-	}
-	for(i=0; i<8; i++) {
-		if(!len && hasskip)
-			break;
-		/* separator */
-		if(i) {
-			if(len < 1 || *str != ':')
-				return -1;
-			str++; len--;
-		}
-		if(len < 1)
-			return -1;
-		if(i && !hasskip && *str == ':') {
-			hasskip = 1;
-			str++; len--;
-			if(!len)
-				break;
-		}
-		/* last 32 bits may use IPv4 notation */
-		if(len >= 4 && (hasskip ? i < 6 : i == 6) && str[1] != ':' && str[2] != ':' && (str[1] == '.' || str[2] == '.' || str[3] == '.'))
-			return yuri_validate_ipv4(str, len);
-		/* 1-4 hex digits */
-		if(!y_ishex(*str))
-			return -1;
-		str++; len--;
-#define H if(len >= 1 && y_ishex(*str)) { str++; len--; }
-		H H H
-#undef H
-	}
-	return len || (hasskip && i==8) ? -1 : 0;
+	buf[len] = 0;
 }
 
 
 /* RFC1034 section 3.5 has an explanation of a (commonly used) domain syntax,
  * but I suspect it may be overly strict. This implementation will suffice, I
- * suppose. Unlike the IPv4 and IPv6 validators, this function is not public.
- * Mostly because DNS names aren't strictly specified, and because there are
- * alternative representations depending on where the name comes from (see also
- * the comment on the length check) */
-static int yuri__validate_dns(const char *str, int len) {
+ * suppose. */
+static int yuri__validate_domain(const char *str, int len) {
 	int haslabel = 0, /* whether we've seen a label */
 		lastishyp = 0, /* whether the last seen character in the label is a hyphen */
 		startdig = 0, /* whether the last seen label starts with a digit (Not allowed per RFC1738, a sensible restriction IMO) */
@@ -203,33 +126,208 @@ static int yuri__validate_dns(const char *str, int len) {
 }
 
 
-int yuri_parse(const char *in, yuri_t *out) {
-	const char *authend, *hostend;
+int yuri__host(char *buf, yuri_t *out) {
+	char addrbuf[16];
+
+	/* IPv6 */
+	if(*buf == '[') {
+		if(buf[strlen(buf)-1] != ']')
+			return -1;
+		buf++;
+		buf[strlen(buf)-1] = 0;
+		if(inet_pton(AF_INET6, buf, addrbuf) != 1)
+			return -1;
+		out->hosttype = YURI_IPV6;
+		out->host = buf;
+		return 0;
+	}
+
+	/* IPv4 */
+	if(inet_pton(AF_INET, buf, addrbuf) == 1) {
+		out->hosttype = YURI_IPV4;
+		out->host = buf;
+		return 0;
+	}
+
+	/* Domain */
+	out->hosttype = YURI_DOMAIN;
+	out->host = buf;
+	return yuri__validate_domain(buf, strlen(buf));
+}
+
+
+int yuri_parse(char *buf, yuri_t *out) {
+	char *end, endc;
 
-	*out->scheme = 0;
-	*out->host = 0;
-	out->port = 0;
-	out->rest = NULL;
+	memset(out, 0, sizeof(yuri_t));
+	out->buf = buf;
 
-	in = yuri__scheme(in, out);
+	yuri__scheme(&buf, out);
 
 	/* Find the end of the authority component (RFC3986, section 3.2) */
-	for(authend=in; *authend && *authend != '/' && *authend != '?' && *authend != '#'; authend++)
-		;
-
-	hostend = yuri__port(in, authend, out);
-	if(hostend-in > 2 && *in == '[' && *(hostend-1) == ']' && yuri_validate_ipv6(in+1, hostend-in-2) == 0)
-		yuri__strlcpy(out->host, in+1, hostend-in-2);
-	else if(yuri_validate_ipv4(in, hostend-in) == 0 || yuri__validate_dns(in, hostend-in) == 0)
-		yuri__strlcpy(out->host, in, hostend-in);
-	else
+	end = buf;
+	while(*end && *end != '/' && *end != '?' && *end != '#')
+		end++;
+	endc = *end;
+	*end = 0;
+
+	yuri__port(buf, end-buf, out);
+	if(yuri__host(buf, out))
+		return -1;
+
+	/* path */
+	if(endc == '/') {
+		out->path = ++end;
+		while(*end && *end != '?' && *end != '#')
+			end++;
+		endc = *end;
+		*end = 0;
+		if(yuri_validate_escape(out->path))
+			return -1;
+	}
+
+	/* query */
+	if(endc == '?') {
+		out->query = ++end;
+		while(*end && *end != '#')
+			end++;
+		endc = *end;
+		*end = 0;
+		if(yuri_validate_escape(out->query))
+			return -1;
+	}
+
+	/* fragment */
+	if(endc == '#') {
+		out->fragment = ++end;
+		while(*end)
+			if(*(end++) == '#')
+				return -1;
+		if(yuri_validate_escape(out->fragment))
+			return -1;
+	}
+
+	return 0;
+}
+
+
+int yuri_parse_copy(const char *str, yuri_t *out) {
+	char *buf = strdup(str);
+	if(!buf)
+		return -2;
+	if(yuri_parse(buf, out)) {
+		free(buf);
 		return -1;
+	}
+	return 0;
+}
 
-	if(*authend && *authend == '/')
-		authend++;
-	out->rest = *authend ? authend : NULL;
 
+int yuri_validate_escape(const char *str) {
+	while(*str) {
+		if(*str != '%') {
+			str++;
+			continue;
+		}
+		if(!y_ishex(str[1]) || !y_ishex(str[2]) || (str[1] == '0' && str[2] == '0'))
+			return -1;
+		str += 3;
+	}
 	return 0;
 }
 
+
+char *yuri_unescape(char *str) {
+	unsigned char *src = (unsigned char *)str, *dest = (unsigned char *)str;
+	if(!str)
+		return NULL;
+	while(*src) {
+		if(*src != '%') {
+			*(dest++) = *(src++);
+			continue;
+		}
+		*(dest++) = (y_hexval(src[1])<<4) | y_hexval(src[2]);
+		src += 3;
+	}
+	*dest = 0;
+	return str;
+}
+
+
+/* Special unescape function for the query string. Differs from yuri_unescape()
+ * in that it converts '+' to a space and that it zeros out any bytes that
+ * remain if the string has shrunk (necessary for yuri_query_next()). */
+static char *yuri__query_unescape(char *str) {
+	unsigned char *src = (unsigned char *)str, *dest = (unsigned char *)str;
+	while(*src) {
+		if(*src == '+') {
+			*(dest++) = ' ';
+			src++;
+			continue;
+		}
+		if(*src != '%') {
+			*(dest++) = *(src++);
+			continue;
+		}
+		*(dest++) = (y_hexval(src[1])<<4) | y_hexval(src[2]);
+		src += 3;
+	}
+	while(dest <= src)
+		*(dest++) = 0;
+	return str;
+}
+
+
+int yuri_query_parse(char *str, yuri_query_t *q) {
+	q->n = 0;
+	q->next = str;
+	if(!str)
+		return 0;
+
+	char *sep;
+	while(*str) {
+		/* Key */
+		sep = str;
+		while(*sep && *sep != '=' && *sep != ';' && *sep != '&')
+			sep++;
+		if(!*sep || *sep == ';' || *sep == '&' || sep == str)
+			return -1;
+		*(sep++) = 0;
+		yuri__query_unescape(str);
+		str = sep;
+
+		/* Value */
+		while(*sep && *sep != ';' && *sep != '&')
+			sep++;
+		if(sep == str)
+			return -1;
+		if(*sep)
+			*(sep++) = 0;
+		yuri__query_unescape(str);
+		q->n++;
+		str = sep;
+	}
+
+	return 0;
+}
+
+
+static void yuri__query_advance(yuri_query_t *q) {
+	q->next += strlen(q->next);
+	while(!*q->next)
+		q->next++;
+}
+
+
+int yuri_query_next(yuri_query_t *q, char **key, char **value) {
+	if(!q->n)
+		return 0;
+	*key = q->next;
+	yuri__query_advance(q);
+	*value = q->next;
+	if(--q->n)
+		yuri__query_advance(q);
+	return 1;
+}
+
 /* vim: set noet sw=4 ts=4: */
diff --git a/yuri.h b/yuri.h
index 0226a06..fe0cc2d 100644
--- a/yuri.h
+++ b/yuri.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012 Yoran Heling
+/* Copyright (c) 2012-2013 Yoran Heling
 
   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
@@ -25,7 +25,7 @@
  * - <host>:<port>
  * - <scheme>://<host>
  * - <scheme>://<host>:<port>
- * - <anything above>/<rest>
+ * - <anything above></path><?query><#fragment>
  *
  * <scheme> must match /^[a-zA-Z][a-zA-Z0-9\.+-]{0,14}$/
  * <host> is either:
@@ -35,18 +35,29 @@
  *     a maximum length of 255 characters. Actual parser is a bit more strict
  *     than the above regex.
  * <port> must be a decimal number between 1 and 65535 (both inclusive)
- * <rest> is, at this point, neither validated nor parsed
+ * </path> is an escaped string not containing '?' or '#'
+ * <?query> is an escaped string not containing '#'
+ * <#fragment> is an escaped string not containing '#'
+ * Any </path>, <?query> and/or <#fragment> components may be absent.
+ *
+ * The format of the <?query> part is highly dependent on the application and
+ * is therefore not automatically parsed. However, a simple parser is available
+ * for the common key=param style.
  *
  * Not supported (yet):
- * - Path and query string parsing
  * - Username / password parts
  * - Symbolic port names
  * - Internationalized domain names. Parsing only succeeds when the address
  *   is in the ASCII form.
- * - Protocol relative URLs (e.g. "//domain.com/")
- * - Percent encoding in anything before <rest> is not handled. Even though the
- *   RFC's seem to imply that this is allowed. (Percent-encoding in <rest>
- *   isn't handled, either, since this parser completely ignores <rest>)
+ * - Relative references (Protocol relative URLs), e.g. "//domain.com/"
+ * - Percent encoding in <host> and <port> is not handled, even though the
+ *   RFCs seem to imply that this is allowed.
+ *
+ * URI unescaping is supported, but the %00 escape is explicitly NOT allowed
+ * and will cause parsing to fail with a validation error.  This makes this
+ * library unsuitable for schemes that use URI escaping to send binary data,
+ * such as BitTorrent tracker announcements. Non-standard %uxxxx escapes are
+ * not supported, either.
  *
  * RFC1738 and RFC3986 have been used as reference, but strict adherence to
  * those specifications isn't a direct goal. In particular, this parser allows
@@ -54,54 +65,172 @@
  * IPv4/IPv6/DNS address. This makes the parser suitable for schemes like
  * irc://, http://, ftp:// and adc://, but unsuitable for stuff like mailto:
  * and magnet:.
- *
- * Incidentally, the implementation (yuri.c) is written in pure C and does not
- * use any libc functions.
  */
 
 #ifndef YURI_H
 #define YURI_H
 
 #include <stdint.h>
+#include <stdlib.h>
 
-/* See description above for the supported formats. */
-typedef struct {
 
-	/* Empty string if there was no scheme in the URI. Uppercase characters
-	 * (A-Z) are automatically converted to lowercase (a-z). */
-	char scheme[16];
+typedef enum {
+	YURI_IPV6,
+	YURI_IPV4,
+	YURI_DOMAIN
+} yuri_hosttype_t;
 
-	/* IPv4/IPv6 address or hostname. The square brackets around the IPv6
-	 * address in the URI are not copied. No normalization or case modification
-	 * is performed. */
-	char host[256];
+
+/* See description above for the supported URI formats. */
+typedef struct {
+	/* Pointer to the start of the buffer. This is the buffer given to
+	 * yuri_parse(), or a newly created buffer in the case of
+	 * yuri_parse_copy(). */
+	char *buf;
+	/* All the pointers below point into the *buf memory. */
+
+	/* NULL if there was no scheme in the URI. Uppercase characters (A-Z) are
+	 * automatically converted to lowercase (a-z). */
+	char *scheme;
+
+	/* Hostname part of the URI. hosttype indicates what kind of hostname this
+	 * is (IPv4, IPv6 or a domain name).
+	 * No normalization or case modification on the host is performed. Any
+	 * square brackets around the IPv6 address in the URI are not considered
+	 * part of the hostname.  E.g. the URI "http://[::]/" has YURI_IPV6 and
+	 * host = "::". */
+	char *host;
+	yuri_hosttype_t hosttype;
 
 	/* 0 if no port was included in the URI. */
 	uint16_t port;
 
-	/* Points directly into the string given to yuri_parse(), NULL if there is
-	 * no rest. Points to the character after the '/', so in the case of
-	 * "example.com/path", rest will point to "path". */
-	const char *rest;
+	/* Unmodified path, query and fragment parts of the URI, not including the
+	 * first '/', '?' or '#' character, respectively. If a part was missing
+	 * from the URI, its value here is set to NULL. Note that it is possible
+	 * for a part to be present but empty, for example "http://blicky.net" has
+	 * all fields NULL, but "http://blicky.net/?#" has all fields set to an
+	 * empty string.
+	 *
+	 * These parts are passed through in the same form as they are present in
+	 * the URI. Unescaping is not automatically performed by yuri_parse()
+	 * because these components may include scheme-specific delimiters and
+	 * encoding rules. If you just want their unescaped string representation,
+	 * you can always use yuri_unescape() on these fields. If you know that the
+	 * query string is in key=value format (most common), use
+	 * yuri_query_parse() to parse it. */
+	char *path;
+	char *query;
+	char *fragment;
 } yuri_t;
 
 
-/* Returns -1 if the URI isn't valid, 0 on success. On failure, the
- * contents of out may contain rubbish, otherwise all fields will have been set
- * to their parsed value.
- * Attempts to do as much (sane) validation as possible. */
-int yuri_parse(const char *in, yuri_t *out);
+/* Returns -1 if the URI isn't valid, 0 on success. The given string should be
+ * zero-terminated and will be modified in-place.
+ *
+ * If the URI is invalid, both the `str' and `out' arguments may have been
+ * partially written to and may therefore contain rubbish.
+ *
+ * This function attempts to do as much (sane) validation as possible. */
+int yuri_parse(char *str, yuri_t *out);
+
+
+/* Similar to yuri_parse(), but makes an internal copy of the string before
+ * processing. Returns -2 on OOM.
+ *
+ * When this function returns 0, you must call free(out->buf) after you're done
+ * with the parsed results. */
+int yuri_parse_copy(const char *str, yuri_t *out);
+
+
+/* Validates whether a string has been correctly escaped. This function should
+ * be used before calling yuri_unescape() on a string obtained from an
+ * untrusted source. Note that validation on the 'path', 'query' and 'fragment'
+ * fields in the yuri_t struct is not necessary, as yuri_parse() will do this
+ * already.
+ * A string is considered valid if any % characters are followed by two hex
+ * characters and there is no %00 escape. */
+int yuri_validate_escape(const char *str);
 
 
-/* Validates an IPv4 address according to RFC3986. Returns 0 if it's valid, -1
- * if it isn't. (Note that RFC3986 only allows a full IPv4 address with all
- * four octets present) */
-int yuri_validate_ipv4(const char *str, int len);
+/* Unescapes the given string in-place. That is, it converts %XX escapes into
+ * their byte representation. Returns the string given as first argument, so
+ * you can use it as yuri_unescape(strdup(str)) if you want to allocate a new
+ * string. This function simply passes through NULL if str is NULL.
+ *
+ * IMPORTANT: This function does not perform any validation. Behaviour is
+ * undefined when used on an invalid string. Use yuri_validate_escape() if you
+ * do not know whether the string is valid or not.
+ *
+ * IMPORTANT#2: You should only call this function on the same string once. For
+ * example, you can do a:
+ *   char *unescaped_path = yuri_unescape(uri->path);
+ * to get the path once. After that you can access the unescaped path directly
+ * through uri->path. The original path is then not available anymore, and
+ * calling yuri_unescape(uri->path) another time is an error.
+ *
+ * IMPORTANT#3: You can't expect the returned string to be valid UTF-8 or to
+ * not contain any weird (e.g. control) characters. If you want to do any
+ * further validation on the strings obtained from a URI, you must do so AFTER
+ * calling this function. */
+char *yuri_unescape(char *str);
 
 
-/* Validates an IPv6 address. Returns -1 if it's invalid, 0 if it is. The given
- * string should not include square brackets. */
-int yuri_validate_ipv6(const char *str, int len);
+
+/* Simple query string parser. Parses "a=b&c=d", "a=b;c=d" and any mixture
+ * of the two styles. This API is used as follows:
+ *
+ *   yuri_t uri;
+ *   yuri_query_t q;
+ *   if(yuri_parse(str, &uri) || yuri_query_parse(uri.query, &q))
+ *     // handle error
+ *
+ *   char *key, *value;
+ *   while(yuri_query_next(&q, &key, &value)) {
+ *     // Do something
+ *   }
+ */
+typedef struct {
+	char *next; /* Pointer to the next key returned by _next. */
+	size_t n;   /* Number of key/value pairs left */
+} yuri_query_t;
+
+
+/* Parses and validates a key=value-style query string. The given string is
+ * modified in-place. On success, it writes an iterator into the given
+ * yuri_query_t object and returns 0. Returns -1 on failure, after which both
+ * str and q may have been modified and may contain garbage. str may be NULL,
+ * in which case it is considered equivalent to an empty string, which is an
+ * empty query string with 0 key/value pairs.
+ *
+ * This function only returns an error on things like empty keys ("=abc"),
+ * empty pairs ("&&"), empty values ("key=") or absent values ("key"). The '+'
+ * character is converted to a space in both keys and values. The same key may
+ * appear multiple times.  TODO: Add support for absent or empty values.
+ *
+ * IMPORTANT: The given string is assumed to contain valid URI escapes, as in
+ * yuri_validate_escape(), so run that function first if the string comes from
+ * an untrusted source.
+ *
+ * IMPORTANT#2: You should only call this function on the same string once, and
+ * you should not have called yuri_unescape() on that string (unless, of
+ * course, you want to parse a query string encoded inside a query string, or
+ * whatever such scheme you may use).
+ *
+ * You can re-use the same iterator multiple times by making a copy of the
+ * struct before calling yuri_query_next() for the first time. */
+int yuri_query_parse(char *str, yuri_query_t *q);
+
+
+/* Get the next key/value pair from the yuri_query_t iterator. Returns 0 if
+ * there are no more pairs, 1 otherwise.
+ *
+ * The keys and values are returned in their unescaped form, so no further
+ * calls to yuri_unescape() are necessary. The key and value strings point into
+ * the string buffer given to yuri_query_parse().
+ *
+ * The IMPORTANT#3 note of yuri_unescape() applies here, too. */
+int yuri_query_next(yuri_query_t *q, char **key, char **value);
 
 #endif
author	Yorhel <git@yorhel.nl>	2013-05-24 17:23:07 +0200
committer	Yorhel <git@yorhel.nl>	2013-05-24 17:59:26 +0200
commit	33b664657033beb33d050c2ef5a6cb1d6b77adff (patch)
tree	6c30a14323a1b26d9d78f9d139ea666c7f92dba3
parent	1506844fabef3a698f6c7454cac932149fb075bf (diff)