aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOmar Polo <op@omarpolo.com>2020-12-26 00:33:11 +0100
committerOmar Polo <op@omarpolo.com>2020-12-26 00:33:11 +0100
commitdf6ca41da36c3f617cbbf3302ab120721ebfcfd2 (patch)
treeeb0d8da6e94702a2c4ee636ec9a739a8be7b4817
parent043acc97b16be18d85bb1914da50f7ce2aa2623e (diff)
IRI support
This extends the URI parser so it supports full IRI (Internationalized Resource Identifiers, RFC3987). Some areas of it can/may be improved, but here's a start. Note: we assume UTF-8 encoded IRI.
-rw-r--r--README.md11
-rw-r--r--gmid.111
-rw-r--r--uri.c50
-rw-r--r--uri_test.c48
4 files changed, 108 insertions, 12 deletions
diff --git a/README.md b/README.md
index 597391a..1c9b75f 100644
--- a/README.md
+++ b/README.md
@@ -20,11 +20,8 @@ is a very simple and minimal gemini server that can serve static files
and execute CGI scripts.
**gmid**
-will strip any sequence of
-*../*
-or trailing
-*..*
-in the requests made by clients and will refuse to follow symlinks.
+won't serve files outside the given directory and won't follow
+symlinks.
Furthermore, on
OpenBSD,
pledge(2)
@@ -35,6 +32,10 @@ are used to ensure that
doesn't do anything else than read files from the given directory,
accept network connections and, optionally, execute CGI scripts.
+**gmid**
+fully supports IRIs (Internationalized Resource Identifiers, see
+RFC3987).
+
It should be noted that
**gmid**
is very simple in its implementation, and so it may not be appropriate
diff --git a/gmid.1 b/gmid.1
index 77ef87d..edf67d5 100644
--- a/gmid.1
+++ b/gmid.1
@@ -33,11 +33,8 @@ is a very simple and minimal gemini server that can serve static files
and execute CGI scripts.
.Pp
.Nm
-will strip any sequence of
-.Pa ../
-or trailing
-.Pa ..
-in the requests made by clients and will refuse to follow symlinks.
+won't serve files outside the given directory and won't follow
+symlinks.
Furthermore, on
.Ox ,
.Xr pledge 2
@@ -48,6 +45,10 @@ are used to ensure that
doesn't do anything else than read files from the given directory,
accept network connections and, optionally, execute CGI scripts.
.Pp
+.Nm
+fully supports IRIs (Internationalized Resource Identifiers, see
+RFC3987).
+.Pp
It should be noted that
.Nm
is very simple in its implementation, and so it may not be appropriate
diff --git a/uri.c b/uri.c
index 245928a..3f81b76 100644
--- a/uri.c
+++ b/uri.c
@@ -93,6 +93,8 @@ struct parser {
const char *err;
};
+/* True when byte b is a UTF-8 continuation byte (bit pattern 10xxxxxx). */
+#define CONT_BYTE(b) (((b) & 0xC0) == 0x80)
+
/* XXX: these macros will expand multiple times their argument */
#define UNRESERVED(p) \
@@ -115,6 +117,48 @@ struct parser {
|| p == ';' \
|| p == '=')
+/*
+ * Validate the multibyte UTF-8 sequence that starts at p->uri.
+ *
+ * Returns 1 when p->uri points at a well-formed 2-, 3- or 4-byte
+ * sequence encoding a Unicode scalar value, 0 otherwise.  Plain ASCII
+ * bytes, stray continuation bytes, truncated sequences, overlong
+ * encodings, UTF-16 surrogates (U+D800..U+DFFF) and code points above
+ * U+10FFFF are all rejected.
+ *
+ * NOTE: on success p->uri is advanced by one byte less than the
+ * length of the sequence, because the caller will add one more byte
+ * after we return.  On failure p->uri is left untouched.
+ */
+static int
+valid_multibyte_utf8(struct parser *p)
+{
+	uint32_t c, min;
+	uint8_t s, b;
+	int i, extra;
+
+	s = *p->uri;
+
+	/*
+	 * The lead byte fixes the number of continuation bytes and the
+	 * smallest code point that legitimately needs that many bytes;
+	 * anything below `min' is an overlong encoding.
+	 */
+	if ((s & 0xE0) == 0xC0) {
+		extra = 1;
+		min = 0x80;
+		c = s & 0x1F;
+	} else if ((s & 0xF0) == 0xE0) {
+		extra = 2;
+		min = 0x800;
+		c = s & 0x0F;
+	} else if ((s & 0xF8) == 0xF0) {
+		extra = 3;
+		min = 0x10000;
+		c = s & 0x07;
+	} else
+		return 0;
+
+	/*
+	 * Bytes are inspected strictly in order and we bail out on the
+	 * first non-continuation byte, so (assuming the NUL-terminated
+	 * string the callers test for) a truncated sequence never makes
+	 * us read past the terminator.
+	 */
+	for (i = 1; i <= extra; ++i) {
+		b = p->uri[i];
+		if (!CONT_BYTE(b))
+			return 0;
+		c = (c << 6) | (b & 0x3F);
+	}
+
+	/* overlong encoding or beyond the Unicode range */
+	if (c < min || c > 0x10FFFF)
+		return 0;
+
+	/* surrogates are not scalar values and never valid in UTF-8 */
+	if (0xD800 <= c && c <= 0xDFFF)
+		return 0;
+
+	p->uri += extra;
+	return 1;
+}
+
+
static int
parse_pct_encoded(struct parser *p)
{
@@ -308,7 +352,8 @@ parse_query(struct parser *p)
|| SUB_DELIMITERS(*p->uri)
|| *p->uri == '/'
|| *p->uri == '?'
- || parse_pct_encoded(p))
+ || parse_pct_encoded(p)
+ || valid_multibyte_utf8(p))
p->uri++;
if (*p->uri != '\0' && *p->uri != '#') {
@@ -348,7 +393,8 @@ parse_path(struct parser *p)
while (UNRESERVED(*p->uri)
|| SUB_DELIMITERS(*p->uri)
|| *p->uri == '/'
- || parse_pct_encoded(p))
+ || parse_pct_encoded(p)
+ || valid_multibyte_utf8(p))
p->uri++;
if (*p->uri != '\0' && *p->uri != '?' && *p->uri != '#') {
diff --git a/uri_test.c b/uri_test.c
index c6521f6..f322c1e 100644
--- a/uri_test.c
+++ b/uri_test.c
@@ -87,6 +87,12 @@ main(void)
{
struct uri empty = {"", "", "", PASS, "", "", ""};
+ TEST("foo://bar.com/foo%00?baz",
+ FAIL,
+ empty,
+ "rejects %00");
+ return 0;
+
TEST("http://omarpolo.com",
PASS,
URI("http", "omarpolo.com", "", "", "", ""),
@@ -153,6 +159,10 @@ main(void)
FAIL,
empty,
"reject paths that would escape the root");
+ TEST("gemini://omarpolo.com/foo/../foo/../././/bar/baz/.././.././/",
+ PASS,
+ URI("gemini", "omarpolo.com", "", "", "", ""),
+ "parse path with lots of cleaning available");
/* query */
TEST("foo://example.com/foo/?gne",
@@ -179,6 +189,44 @@ main(void)
PASS,
URI("foo", "bar.com", "", "cafè.gmi", "", ""),
"can decode");
+ TEST("foo://bar.com/caff%C3%A8%20macchiato.gmi",
+ PASS,
+ URI("foo", "bar.com", "", "caffè macchiato.gmi", "", ""),
+ "can decode");
+ TEST("foo://bar.com/caff%C3%A8+macchiato.gmi",
+ PASS,
+ URI("foo", "bar.com", "", "caffè+macchiato.gmi", "", ""),
+ "can decode");
+ TEST("foo://bar.com/foo%2F..%2F..",
+ FAIL,
+ empty,
+ "conversion and checking are done in the correct order");
+ TEST("foo://bar.com/foo%00?baz",
+ FAIL,
+ empty,
+ "rejects %00");
+
+ /* IRI */
+ TEST("foo://bar.com/cafè.gmi",
+ PASS,
+ URI("foo", "bar.com", "", "cafè.gmi", "" , ""),
+ "decode IRI (with a 2-byte utf8 seq)");
+ TEST("foo://bar.com/世界.gmi",
+ PASS,
+ URI("foo", "bar.com", "", "世界.gmi", "" , ""),
+ "decode IRI");
+ TEST("foo://bar.com/😼.gmi",
+ PASS,
+ URI("foo", "bar.com", "", "😼.gmi", "" , ""),
+ "decode IRI (with a 3-byte utf8 seq)");
+ TEST("foo://bar.com/😼/𤭢.gmi",
+ PASS,
+ URI("foo", "bar.com", "", "😼/𤭢.gmi", "" , ""),
+ "decode IRI (with a 3-byte and a 4-byte utf8 seq)");
+ TEST("foo://bar.com/世界/\xC0\x80",
+ FAIL,
+ empty,
+ "reject invalid sequence (overlong NUL)");
return 0;
}