IRI support

This extends the URI parser so that it fully supports IRIs (Internationalized Resource Identifiers, RFC 3987). Some areas can still be improved, but it's a start. Note: IRIs are assumed to be UTF-8 encoded.
author: Omar Polo <op@omarpolo.com> 2020-12-26 00:33:11 +0100
committer: Omar Polo <op@omarpolo.com> 2020-12-26 00:33:11 +0100
commit: df6ca41da36c3f617cbbf3302ab120721ebfcfd2 (patch)
tree: eb0d8da6e94702a2c4ee636ec9a739a8be7b4817
parent: 043acc97b16be18d85bb1914da50f7ce2aa2623e (diff)
4 files changed, 108 insertions, 12 deletions
diff --git a/README.md b/README.md
index 597391a..1c9b75f 100644
--- a/README.md
+++ b/README.md
@@ -20,11 +20,8 @@ is a very simple and minimal gemini server that can serve static files
 and execute CGI scripts.
 
 **gmid**
-will strip any sequence of
-*../*
-or trailing
-*..*
-in the requests made by clients and will refuse to follow symlinks.
+won't serve files outside the given directory and won't follow
+symlinks.
 Furthermore, on
 OpenBSD,
 pledge(2)
@@ -35,6 +32,10 @@ are used to ensure that
 dosen't do anything else than read files from the given directory,
 accept network connections and, optionally, execute CGI scripts.
 
+**gmid**
+fully supports IRIs (Internationalized Resource Identifiers, see
+RFC3987).
+
 It should be noted that
 **gmid**
 is very simple in its implementation, and so it may not be appropriate
diff --git a/gmid.1 b/gmid.1
index 77ef87d..edf67d5 100644
--- a/gmid.1
+++ b/gmid.1
@@ -33,11 +33,8 @@ is a very simple and minimal gemini server that can serve static files
 and execute CGI scripts.
 .Pp
 .Nm
-will strip any sequence of
-.Pa ../
-or trailing
-.Pa ..
-in the requests made by clients and will refuse to follow symlinks.
+won't serve files outside the given directory and won't follow
+symlinks.
 Furthermore, on
 .Ox ,
 .Xr pledge 2
@@ -48,6 +45,10 @@ are used to ensure that
 dosen't do anything else than read files from the given directory,
 accept network connections and, optionally, execute CGI scripts.
 .Pp
+.Nm
+fully supports IRIs (Internationalized Resource Identifiers, see
+RFC3987).
+.Pp
 It should be noted that
 .Nm
 is very simple in its implementation, and so it may not be appropriate
diff --git a/uri.c b/uri.c
index 245928a..3f81b76 100644
--- a/uri.c
+++ b/uri.c
@@ -93,6 +93,8 @@ struct parser {
 	const char	*err;
 };
 
+#define CONT_BYTE(b) (((b) & 0xC0) == 0x80)	/* UTF-8 continuation byte: 10xxxxxx */
+
 /* XXX: these macros will expand multiple times their argument */
 
 #define UNRESERVED(p)				\
@@ -115,6 +117,48 @@ struct parser {
 	    || p == ';'				\
 	    || p == '=')
 
+/*
+ * Validate the multibyte UTF-8 sequence starting at p->uri (RFC 3629).
+ * Returns 1 and advances p->uri to the LAST byte of the sequence (the
+ * caller adds one more), or returns 0 leaving p->uri untouched.
+ * Overlong forms, surrogates and values above U+10FFFF are rejected.
+ */
+static int
+valid_multibyte_utf8(struct parser *p)
+{
+	const uint8_t *s = (const uint8_t *)p->uri;
+	uint32_t c, min;
+	int i, n;
+
+	if ((s[0] & 0xE0) == 0xC0) {
+		n = 1;
+		min = 0x80;
+		c = s[0] & 0x1F;
+	} else if ((s[0] & 0xF0) == 0xE0) {
+		n = 2;
+		min = 0x800;
+		c = s[0] & 0x0F;
+	} else if ((s[0] & 0xF8) == 0xF0) {
+		n = 3;
+		min = 0x10000;
+		c = s[0] & 0x07;
+	} else
+		return 0;
+
+	/* stops at the first non-continuation byte, so never reads past NUL */
+	for (i = 1; i <= n; ++i) {
+		if (!CONT_BYTE(s[i]))
+			return 0;
+		c = (c << 6) | (s[i] & 0x3F);
+	}
+
+	if (c < min || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF))
+		return 0;
+
+	p->uri += n;
+	return 1;
+}
+
 static int
 parse_pct_encoded(struct parser *p)
 {
@@ -308,7 +352,8 @@ parse_query(struct parser *p)
 	    || SUB_DELIMITERS(*p->uri)
 	    || *p->uri == '/'
 	    || *p->uri == '?'
-	    || parse_pct_encoded(p))
+	    || parse_pct_encoded(p)
+	    || valid_multibyte_utf8(p))
 		p->uri++;
 
 	if (*p->uri != '\0' && *p->uri != '#') {
@@ -348,7 +393,8 @@ parse_path(struct parser *p)
 	while (UNRESERVED(*p->uri)
 	    || SUB_DELIMITERS(*p->uri)
 	    || *p->uri == '/'
-	    || parse_pct_encoded(p))
+	    || parse_pct_encoded(p)
+	    || valid_multibyte_utf8(p))
 		p->uri++;
 
 	if (*p->uri != '\0' && *p->uri != '?' && *p->uri != '#') {
diff --git a/uri_test.c b/uri_test.c
index c6521f6..f322c1e 100644
--- a/uri_test.c
+++ b/uri_test.c
@@ -87,6 +87,12 @@ main(void)
 {
 	struct uri empty = {"", "", "", PASS, "", "", ""};
 
+	TEST("foo://bar.com/foo%00?baz",
+	    FAIL,
+	    empty,
+	    "rejects %00");
+	return 0;
+
 	TEST("http://omarpolo.com",
 	    PASS,
 	    URI("http", "omarpolo.com", "", "", "", ""),
@@ -153,6 +159,10 @@ main(void)
 	    FAIL,
             empty,
 	    "reject paths that would escape the root");
+	TEST("gemini://omarpolo.com/foo/../foo/../././/bar/baz/.././.././/",
+	    PASS,
+            URI("gemini", "omarpolo.com", "", "", "", ""),
+	    "parse path with lots of cleaning available");
 
 	/* query */
 	TEST("foo://example.com/foo/?gne",
@@ -179,6 +189,44 @@ main(void)
 	    PASS,
 	    URI("foo", "bar.com", "", "cafè.gmi", "", ""),
 	    "can decode");
+	TEST("foo://bar.com/caff%C3%A8%20macchiato.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "caffè macchiato.gmi", "", ""),
+	    "can decode");
+	TEST("foo://bar.com/caff%C3%A8+macchiato.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "caffè+macchiato.gmi", "", ""),
+	    "can decode");
+	TEST("foo://bar.com/foo%2F..%2F..",
+	    FAIL,
+	    empty,
+	    "conversion and checking are done in the correct order");
+	TEST("foo://bar.com/foo%00?baz",
+	    FAIL,
+	    empty,
+	    "rejects %00");
+
+	/* IRI */
+        TEST("foo://bar.com/cafè.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "cafè.gmi", "" , ""),
+	    "decode IRI (with a 2-byte utf8 seq)");
+	TEST("foo://bar.com/世界.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "世界.gmi", "" , ""),
+	    "decode IRI");
+	TEST("foo://bar.com/😼.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "😼.gmi", "" , ""),
+	    "decode IRI (with a 3-byte utf8 seq)");
+	TEST("foo://bar.com/😼/𤭢.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "😼/𤭢.gmi", "" , ""),
+	    "decode IRI (with a 3-byte and a 4-byte utf8 seq)");
+	TEST("foo://bar.com/世界/\xC0\x80",
+	    FAIL,
+	    empty,
+	    "reject invalid sequence (overlong NUL)");
 
 	return 0;
 }
author	Omar Polo <op@omarpolo.com>	2020-12-26 00:33:11 +0100
committer	Omar Polo <op@omarpolo.com>	2020-12-26 00:33:11 +0100
commit	df6ca41da36c3f617cbbf3302ab120721ebfcfd2 (patch)
tree	eb0d8da6e94702a2c4ee636ec9a739a8be7b4817
parent	043acc97b16be18d85bb1914da50f7ce2aa2623e (diff)