diff options
author | Omar Polo <op@omarpolo.com> | 2020-12-26 00:33:11 +0100 |
---|---|---|
committer | Omar Polo <op@omarpolo.com> | 2020-12-26 00:33:11 +0100 |
commit | df6ca41da36c3f617cbbf3302ab120721ebfcfd2 (patch) | |
tree | eb0d8da6e94702a2c4ee636ec9a739a8be7b4817 | |
parent | 043acc97b16be18d85bb1914da50f7ce2aa2623e (diff) |
IRI support
This extends the URI parser so that it supports full IRIs (Internationalized
Resource Identifiers, RFC 3987). Some areas of it can still be improved,
but it's a start.
Note: IRIs are assumed to be UTF-8 encoded.
-rw-r--r-- | README.md | 11 | ||||
-rw-r--r-- | gmid.1 | 11 | ||||
-rw-r--r-- | uri.c | 50 | ||||
-rw-r--r-- | uri_test.c | 48 |
4 files changed, 108 insertions, 12 deletions
@@ -20,11 +20,8 @@ is a very simple and minimal gemini server that can serve static files and execute CGI scripts. **gmid** -will strip any sequence of -*../* -or trailing -*..* -in the requests made by clients and will refuse to follow symlinks. +won't serve files outside the given directory and won't follow +symlinks. Furthermore, on OpenBSD, pledge(2) @@ -35,6 +32,10 @@ are used to ensure that dosen't do anything else than read files from the given directory, accept network connections and, optionally, execute CGI scripts. +**gmid** +fully supports IRIs (Internationalized Resource Identifiers, see +RFC3987). + It should be noted that **gmid** is very simple in its implementation, and so it may not be appropriate @@ -33,11 +33,8 @@ is a very simple and minimal gemini server that can serve static files and execute CGI scripts. .Pp .Nm -will strip any sequence of -.Pa ../ -or trailing -.Pa .. -in the requests made by clients and will refuse to follow symlinks. +won't serve files outside the given directory and won't follow +symlinks. Furthermore, on .Ox , .Xr pledge 2 @@ -48,6 +45,10 @@ are used to ensure that dosen't do anything else than read files from the given directory, accept network connections and, optionally, execute CGI scripts. .Pp +.Nm +fully supports IRIs (Internationalized Resource Identifiers, see +RFC3987). +.Pp It should be noted that .Nm is very simple in its implementation, and so it may not be appropriate @@ -93,6 +93,8 @@ struct parser { const char *err; }; +#define CONT_BYTE(b) ((b & 0xC0) == 0x80) + /* XXX: these macros will expand multiple times their argument */ #define UNRESERVED(p) \ @@ -115,6 +117,48 @@ struct parser { || p == ';' \ || p == '=') +/* NOTE: the increment are one less what it should be, because the + * caller will add one byte after we return. 
*/ +static int +valid_multibyte_utf8(struct parser *p) +{ + uint32_t c; + uint8_t s; + + c = 0; + s = *p->uri; + + if ((s & 0xE0) == 0xC0) { + if (!CONT_BYTE(*(p->uri+1))) + return 0; + c = ((s & 0x1F) << 6) | (*(p->uri+1) & 0x3F); + p->uri += 1; + } else if ((s & 0xF0) == 0xE0) { + if (!CONT_BYTE(*(p->uri+1)) || + !CONT_BYTE(*(p->uri+2))) + return 0; + c = (s & 0x0F) << 12 + | ((*(p->uri+1) & 0x3F) << 6) + | ((*(p->uri+2) & 0x3F)); + p->uri += 2; + } else if ((s & 0xF8) == 0xF0) { + if (!CONT_BYTE(*(p->uri+1)) || + !CONT_BYTE(*(p->uri+2)) || + !CONT_BYTE(*(p->uri+3))) + return 0; + c = (s & 0x07) << 18 + | ((*(p->uri+1) & 0x3F) << 12) + | ((*(p->uri+2) & 0x3F) << 6) + | ((*(p->uri+3) & 0x3F)); + p->uri += 3; + } else + return 0; + + return (((0x080 <= c) && (c <= 0x7FF)) + || (((0x800 <= c) && (c <= 0xFFFF))) + || (((0x10000 <= c) && (c <= 0x10FFFF)))); +} + static int parse_pct_encoded(struct parser *p) { @@ -308,7 +352,8 @@ parse_query(struct parser *p) || SUB_DELIMITERS(*p->uri) || *p->uri == '/' || *p->uri == '?' - || parse_pct_encoded(p)) + || parse_pct_encoded(p) + || valid_multibyte_utf8(p)) p->uri++; if (*p->uri != '\0' && *p->uri != '#') { @@ -348,7 +393,8 @@ parse_path(struct parser *p) while (UNRESERVED(*p->uri) || SUB_DELIMITERS(*p->uri) || *p->uri == '/' - || parse_pct_encoded(p)) + || parse_pct_encoded(p) + || valid_multibyte_utf8(p)) p->uri++; if (*p->uri != '\0' && *p->uri != '?' 
&& *p->uri != '#') { @@ -87,6 +87,12 @@ main(void) { struct uri empty = {"", "", "", PASS, "", "", ""}; + TEST("foo://bar.com/foo%00?baz", + FAIL, + empty, + "rejects %00"); + return 0; + TEST("http://omarpolo.com", PASS, URI("http", "omarpolo.com", "", "", "", ""), @@ -153,6 +159,10 @@ main(void) FAIL, empty, "reject paths that would escape the root"); + TEST("gemini://omarpolo.com/foo/../foo/../././/bar/baz/.././.././/", + PASS, + URI("gemini", "omarpolo.com", "", "", "", ""), + "parse path with lots of cleaning available"); /* query */ TEST("foo://example.com/foo/?gne", @@ -179,6 +189,44 @@ main(void) PASS, URI("foo", "bar.com", "", "cafè.gmi", "", ""), "can decode"); + TEST("foo://bar.com/caff%C3%A8%20macchiato.gmi", + PASS, + URI("foo", "bar.com", "", "caffè macchiato.gmi", "", ""), + "can decode"); + TEST("foo://bar.com/caff%C3%A8+macchiato.gmi", + PASS, + URI("foo", "bar.com", "", "caffè+macchiato.gmi", "", ""), + "can decode"); + TEST("foo://bar.com/foo%2F..%2F..", + FAIL, + empty, + "conversion and checking are done in the correct order"); + TEST("foo://bar.com/foo%00?baz", + FAIL, + empty, + "rejects %00"); + + /* IRI */ + TEST("foo://bar.com/cafè.gmi", + PASS, + URI("foo", "bar.com", "", "cafè.gmi", "" , ""), + "decode IRI (with a 2-byte utf8 seq)"); + TEST("foo://bar.com/世界.gmi", + PASS, + URI("foo", "bar.com", "", "世界.gmi", "" , ""), + "decode IRI"); + TEST("foo://bar.com/😼.gmi", + PASS, + URI("foo", "bar.com", "", "😼.gmi", "" , ""), + "decode IRI (with a 3-byte utf8 seq)"); + TEST("foo://bar.com/😼/𤭢.gmi", + PASS, + URI("foo", "bar.com", "", "😼/𤭢.gmi", "" , ""), + "decode IRI (with a 3-byte and a 4-byte utf8 seq)"); + TEST("foo://bar.com/世界/\xC0\x80", + FAIL, + empty, + "reject invalid sequence (overlong NUL)"); return 0; } |