implement a valid RFC3986 (URI) parser

Up until now I used a "poor man's" approach: the URI parser was barely a parser; it tried to extract the path from the request, with some minor checking, and that was all. This obviously is not RFC3986-compliant. The new RFC3986 (URI) parser should be fully compliant. It may accept some invalid URIs, but shouldn't reject or mis-parse valid URIs (in particular, the rule for the path is way more relaxed in this parser than it is in the RFC text). A difference with RFC3986 is that we don't even try to parse the (optional) userinfo part of a URI: following the Gemini spec we treat it as an error. A further caveat is that %2F in the path part of the URI is indistinguishable from a literal '/': this is NOT conforming, but due to the scope and use of gmid, I don't see how to treat a %2F sequence in the path (reject the URI?).
author: Omar Polo <op@omarpolo.com> 2020-12-25 13:13:12 +0100
committer: Omar Polo <op@omarpolo.com> 2020-12-25 13:13:12 +0100
commit: 33d32d1fd66a577f22f3f33f238e8dac44ec9995 (patch)
tree: f9010d36f92d9239d0b80c87d9b57ee10fd4776d
parent: d5aba4c791266e35cf79cec02dcd15267fb62f62 (diff)
6 files changed, 659 insertions, 154 deletions
diff --git a/.gitignore b/.gitignore
index 9b1c514..a7794ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,5 +2,6 @@ cert.pem
 key.pem
 TAGS
 gmid
+uri_test
 *.o
 docs
diff --git a/Makefile b/Makefile
index 6c49573..1d355e3 100644
--- a/Makefile
+++ b/Makefile
@@ -2,18 +2,24 @@ CC =		cc
 CFLAGS =	-Wall -Wextra -g
 LDFLAGS =	-ltls
 
-.PHONY: all clean
+.PHONY: all clean test
 
 all: gmid TAGS README.md
 
-gmid: gmid.o
-	${CC} gmid.o -o gmid ${LDFLAGS}
+gmid: gmid.o uri.o
+	${CC} gmid.o uri.o -o gmid ${LDFLAGS}
 
-TAGS: gmid.c
-	-etags gmid.c || true
+TAGS: gmid.c uri.c
+	-etags gmid.c uri.c || true
 
 README.md: gmid.1
 	mandoc -Tmarkdown gmid.1 | sed -e '1d' -e '$$d' > README.md
 
 clean:
-	rm -f gmid.o gmid
+	rm -f *.o gmid
+
+uri_test: uri_test.o uri.o
+	${CC} uri_test.o uri.o -o uri_test ${LDFLAGS}
+
+test: uri_test
+	./uri_test
diff --git a/gmid.c b/gmid.c
index 7376a34..fb98cfc 100644
--- a/gmid.c
+++ b/gmid.c
@@ -34,6 +34,28 @@ int port;
 int foreground;
 int connected_clients;
 
+struct etm {			/* file extension to mime */
+	const char	*mime;
+	const char	*ext;
+} filetypes[] = {
+	{"application/pdf",	"pdf"},
+
+	{"image/gif",		"gif"},
+	{"image/jpeg",		"jpg"},
+	{"image/jpeg",		"jpeg"},
+	{"image/png",		"png"},
+	{"image/svg+xml",	"svg"},
+
+	{"text/gemini",		"gemini"},
+	{"text/gemini",		"gmi"},
+	{"text/markdown",	"markdown"},
+	{"text/markdown",	"md"},
+	{"text/plain",		"txt"},
+	{"text/xml",		"xml"},
+
+	{NULL, NULL}
+};
+
 void
 siginfo_handler(int sig)
 {
@@ -51,102 +73,6 @@ starts_with(const char *str, const char *prefix)
 	return 1;
 }
 
-char *
-url_after_proto(char *url)
-{
-	char *s;
-	const char *proto = "gemini:";
-	const char *marker = "//";
-
-	/* a relative URL */
-	if ((s = strstr(url, marker)) == NULL)
-		return url;
-
-	/*
-	 * if a protocol is not specified, gemini should be implied:
-	 * this handles the case of //example.com
-	 */
-	if (s == url)
-		return s + strlen(marker);
-
-	if (s - strlen(proto) != url)
-		return NULL;
-
-	if (!starts_with(url, proto))
-		return NULL;
-
-	return s + strlen(marker);
-}
-
-char *
-url_start_of_request(char *url)
-{
-	char *s, *t;
-
-	if ((s = url_after_proto(url)) == NULL)
-		return NULL;
-
-	/* non-absolute URL */
-	if (s == url)
-		return s;
-
-	if ((t = strstr(s, "/")) == NULL)
-		return s + strlen(s);
-	return t;
-}
-
-int
-url_trim(struct client *c, char *url)
-{
-	const char *e = "\r\n";
-	char *s;
-
-	if ((s = strstr(url, e)) == NULL)
-		return 0;
-	s[0] = '\0';
-	s[1] = '\0';
-
-	if (s[2] != '\0') {
-		LOGE(c, "%s", "request longer than 1024 bytes");
-		return 0;
-	}
-
-	return 1;
-}
-
-char *
-adjust_path(char *path)
-{
-	char *s, *query;
-	size_t len;
-
-	if ((query = strchr(path, '?')) != NULL) {
-		*query = '\0';
-		query++;
-	}
-
-	/* /.. -> / */
-	len = strlen(path);
-	if (len >= 3) {
-		if (!strcmp(&path[len-3], "/..")) {
-			path[len-2] = '\0';
-		}
-	}
-
-	/* if the path is only `..` trim out and exit */
-	if (!strcmp(path, "..")) {
-		path[0] = '\0';
-		return query;
-	}
-
-	/* remove every ../ in the path */
-	while (1) {
-		if ((s = strstr(path, "../")) == NULL)
-			return query;
-		memmove(s, s+3, strlen(s)+1);	/* copy also the \0 */
-	}
-}
-
 int
 start_reply(struct pollfd *pfd, struct client *client, int code, const char *reason)
 {
@@ -224,7 +150,7 @@ check_path(struct client *c, const char *path, int *fd)
 	struct stat sb;
 
 	assert(path != NULL);
-	if ((*fd = openat(dirfd, path,
+	if ((*fd = openat(dirfd, *path ? path : ".",
 		    O_RDONLY | O_NOFOLLOW | O_CLOEXEC)) == -1) {
 		return FILE_MISSING;
 	}
@@ -288,16 +214,8 @@ err:
 
 
 int
-open_file(char *path, char *query, struct pollfd *fds, struct client *c)
+open_file(char *fpath, char *query, struct pollfd *fds, struct client *c)
 {
-	char fpath[PATHBUF];
-
-	bzero(fpath, sizeof(fpath));
-
-	if (*path != '.')
-		fpath[0] = '.';
-	strlcat(fpath, path, PATHBUF);
-
 	switch (check_path(c, fpath, &c->fd)) {
 	case FILE_EXECUTABLE:
 		/* +2 to skip the ./ */
@@ -578,8 +496,8 @@ void
 handle(struct pollfd *fds, struct client *client)
 {
 	char buf[GEMINI_URL_LEN];
-	char *path;
-	char *query;
+	const char *parse_err;
+	struct uri uri;
 
 	switch (client->state) {
 	case S_OPEN:
@@ -599,26 +517,19 @@ handle(struct pollfd *fds, struct client *client)
 			return;
 		}
 
-		if (!url_trim(client, buf)) {
-			if (!start_reply(fds, client, BAD_REQUEST, "bad request"))
-				return;
-			goodbye(fds, client);
-			return;
-		}
-
-		if ((path = url_start_of_request(buf)) == NULL) {
-			if (!start_reply(fds, client, BAD_REQUEST, "bad request"))
+		if (!trim_req_uri(buf) || !parse_uri(buf, &uri, &parse_err)) {
+			if (!start_reply(fds, client, BAD_REQUEST, parse_err))
 				return;
 			goodbye(fds, client);
 			return;
 		}
 
-		query = adjust_path(path);
-		LOGI(client, "GET %s%s%s", path,
-		    query ? "?" : "",
-		    query ? query : "");
+		LOGI(client, "GET %s%s%s",
+		    *uri.path ? uri.path : "/",
+		    *uri.query ? "?" : "",
+		    *uri.query ? uri.query : "");
 
-		send_file(path, query, fds, client);
+		send_file(uri.path, uri.query, fds, client);
 		break;
 
 	case S_INITIALIZING:
diff --git a/gmid.h b/gmid.h
index 62288a8..d8a050b 100644
--- a/gmid.h
+++ b/gmid.h
@@ -107,6 +107,17 @@ struct client {
 	struct in_addr	 addr;
 };
 
+
+struct uri {
+	char		*schema;
+	char		*host;
+	char		*port;
+	uint16_t	 port_no;
+	char		*path;
+	char		*query;
+	char		*fragment;
+};
+
 enum {
 	FILE_EXISTS,
 	FILE_EXECUTABLE,
@@ -114,35 +125,10 @@ enum {
 	FILE_MISSING,
 };
 
-struct etm {			/* file extension to mime */
-	const char	*mime;
-	const char	*ext;
-} filetypes[] = {
-	{"application/pdf",	"pdf"},
-
-	{"image/gif",		"gif"},
-	{"image/jpeg",		"jpg"},
-	{"image/jpeg",		"jpeg"},
-	{"image/png",		"png"},
-	{"image/svg+xml",	"svg"},
-
-	{"text/gemini",		"gemini"},
-	{"text/gemini",		"gmi"},
-	{"text/markdown",	"markdown"},
-	{"text/markdown",	"md"},
-	{"text/plain",		"txt"},
-	{"text/xml",		"xml"},
-
-	{NULL, NULL}
-};
-
+/* gmid.c */
 void		 siginfo_handler(int);
 int		 starts_with(const char*, const char*);
 
-char		*url_after_proto(char*);
-char		*url_start_of_request(char*);
-int		 url_trim(struct client*, char*);
-char		*adjust_path(char*);
 ssize_t		 filesize(int);
 
 int		 start_reply(struct pollfd*, struct client*, int, const char*);
@@ -167,4 +153,8 @@ void		 loop(struct tls*, int);
 
 void		 usage(const char*);
 
+/* uri.c */
+int		 parse_uri(char*, struct uri*, const char**);
+int		 trim_req_uri(char*);
+
 #endif
diff --git a/uri.c b/uri.c
new file mode 100644
index 0000000..245928a
--- /dev/null
+++ b/uri.c
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2020 Omar Polo <op@omarpolo.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <ctype.h>
+#include <string.h>
+
+#include "gmid.h"
+
+/*
+ * Notes from RFC3986
+ *
+ * => gemini://tanso.net/rfc/rfc3986.txt
+ *
+ *
+ * ABNF
+ * ====
+ *
+ * pct-encoded	= "%" HEXDIG HEXDIG
+ *
+ * reserved	= gen-delims / sub-delims
+ * gen-delims	= ":" / "/" / "?" / "#" / "[" / "]" / "@"
+ * sub-delims	= "!" / "$" / "&" / "'" / "(" / ")"
+ * 		/ "*" / "+" / "," / ";" / "="
+ *
+ * unreserved	= ALPHA / DIGIT / "-" / "." / "_" / "~"
+ *
+ * URI		= scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+ *
+ * hier-part	= "//" authority path-abempty
+ * 		/ path-absolute
+ * 		/ path-rootless
+ * 		/ path-empty
+ *
+ * scheme	= ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+ *
+ * authority	= [ userinfo "@" ] host [ ":" port ]
+ *
+ * (note that userinfo isn't used for Gemini URL)
+ *
+ * host		= IP-literal / IPv4address / reg-name
+ * reg-name	= *( unreserved / pct-encoded / sub-delims )
+ *
+ * port		= *DIGIT
+ *
+ * path		= path-abempty	; begins with "/" or is empty
+ * 		/ path-absolute	; begins with "/" but not "//"
+ * 		/ path-noscheme	; begins with a non-colon segment
+ * 		/ path-rootless ; begins with a segment
+ * 		/ path-empty	; zero characters
+ *
+ * path-abempty		= *( "/" segment )
+ * path-absolute	= "/" [ segment-nz *( "/" segment ) ]
+ * path-noscheme	= ; not used
+ * path-rootless	= ; not used
+ * path-empty		= ; not used
+ *
+ * segment		= *pchar
+ * segment-nz	= 1*pchar
+ * segment-nz-nc	= 1*( unreserved / pct-encoded / sub-delims / "@" )
+ * pchar		= unreserved / pct-encoded / sub-delims / ":" / "@"
+ *
+ * query		= *( pchar / "/" / "?" )
+ *
+ * fragment		= *( pchar / "/" / "?" )
+ *
+ *
+ * EXAMPLE
+ * =======
+ *
+ *    foo://example.com:8042/over/there?name=ferret#nose
+ *    \_/   \______________/\_________/ \_________/ \__/
+ *     |           |            |            |        |
+ *  scheme     authority       path        query   fragment
+ *
+ */
+
+struct parser {
+	char		*uri;
+	struct uri	*parsed;
+	const char	*err;
+};
+
+/* XXX: these macros expand their argument multiple times */
+
+#define UNRESERVED(p)				\
+	(isalnum(p)				\
+	    || p == '-'				\
+	    || p == '.'				\
+	    || p == '_'				\
+	    || p == '~')
+
+#define SUB_DELIMITERS(p)			\
+	(p == '!'				\
+	    || p == '$'				\
+	    || p == '&'				\
+	    || p == '\''			\
+	    || p == '('				\
+	    || p == ')'				\
+	    || p == '*'				\
+	    || p == '+'				\
+	    || p == ','				\
+	    || p == ';'				\
+	    || p == '=')
+
+static int
+parse_pct_encoded(struct parser *p)
+{
+	if (*p->uri != '%')
+		return 0;
+
+	if (!isxdigit(*(p->uri+1)) || !isxdigit(*(p->uri+2))) {
+		p->err = "illegal percent-encoding";
+		return 0;
+	}
+
+	sscanf(p->uri+1, "%2hhx", p->uri);
+	memmove(p->uri+1, p->uri+3, strlen(p->uri+3)+1);
+
+	return 1;
+}
+
+/* ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) "://" */
+static int
+parse_scheme(struct parser *p)
+{
+	p->parsed->schema = p->uri;
+
+	if (!isalpha(*p->uri)) {
+		p->err = "illegal character in scheme";
+		return 0;
+	}
+
+	p->uri++;
+	while (isalnum(*p->uri)
+	    || *p->uri == '+'
+	    || *p->uri == '-'
+	    || *p->uri == '.')
+		p->uri++;
+
+	if (*p->uri != ':') {
+		p->err = "illegal character in scheme";
+		return 0;
+	}
+
+	*p->uri = '\0';
+	if (*(++p->uri) != '/' || *(++p->uri) != '/') {
+		p->err = "invalid marker after scheme";
+		return 0;
+	}
+
+	p->uri++;
+	return 1;
+}
+
+/* *DIGIT */
+static int
+parse_port(struct parser *p)
+{
+	uint32_t i = 0;
+
+	p->parsed->port = p->uri;
+
+	for (; isdigit(*p->uri); p->uri++) {
+		i = i * 10 + *p->uri - '0';
+		if (i > UINT16_MAX) {
+			p->err = "port number too large";
+			return 0;
+		}
+	}
+
+	if (*p->uri != '/' && *p->uri != '\0') {
+		p->err = "illegal character in port number";
+		return 0;
+	}
+
+	p->parsed->port_no = i;
+
+	if (*p->uri != '\0') {
+		*p->uri = '\0';
+		p->uri++;
+	}
+
+	return 1;
+}
+
+/* TODO: add support for ip-literal and ipv4addr ? */
+/* *( unreserved / sub-delims / pct-encoded ) */
+static int
+parse_authority(struct parser *p)
+{
+	p->parsed->host = p->uri;
+
+	while (UNRESERVED(*p->uri)
+	    || SUB_DELIMITERS(*p->uri)
+	    || parse_pct_encoded(p))
+		p->uri++;
+
+	if (*p->uri == ':') {
+		*p->uri = '\0';
+		p->uri++;
+		return parse_port(p);
+	}
+
+	if (*p->uri == '/') {
+		*p->uri = '\0';
+		p->uri++;
+		return 1;
+	}
+
+	if (*p->uri == '\0')
+		return 1;
+
+	p->err = "illegal character in authority section";
+	return 0;
+}
+
+/* Routine for path_clean.  Elide the .. pointed to by i together with
+ * the preceding element.  Return 0 if it's not possible.  incr is the
+ * length of the increment, 3 for ../ and 2 for .. */
+static int
+path_elide_dotdot(char *path, char *i, int incr)
+{
+	char *j;
+
+	if (i == path)
+		return 0;
+	for (j = i-2; j != path && *j != '/'; j--)
+                /* noop */ ;
+	if (*j == '/')
+		j++;
+	i += incr;
+	memmove(j, i, strlen(i)+1);
+	return 1;
+}
+
+/*
+ * Use an algorithm similar to the one implemented in Go's path.Clean:
+ *
+ * 1. Replace multiple slashes with a single slash
+ * 2. Eliminate each . path name element
+ * 3. Eliminate each inner .. along with the non-.. element that precedes it
+ * 4. Eliminate trailing .. if possible or error (Go would only discard)
+ *
+ * Unlike path.Clean, this function returns the empty string if the
+ * original path is equivalent to "/".
+ */
+static int
+path_clean(char *path)
+{
+	char *i;
+
+	/* 1. replace multiple slashes with a single one */
+	for (i = path; *i; ++i) {
+		if (*i == '/' && *(i+1) == '/') {
+			memmove(i, i+1, strlen(i)); /* move also the \0 */
+			i--;
+		}
+	}
+
+	/* 2. eliminate each . path name element */
+	for (i = path; *i; ++i) {
+		if ((i == path || *i == '/') && *(i+1) == '.' &&
+		    *(i+2) == '/') {
+			/* move also the \0 */
+			memmove(i, i+2, strlen(i)-1);
+			i--;
+		}
+	}
+	if (!strcmp(path, ".") || !strcmp(path, "/.")) {
+		*path = '\0';
+		return 1;
+	}
+
+	/* 3. eliminate each inner .. along with the preceding non-.. */
+	for (i = strstr(path, "../"); i != NULL; i = strstr(path, ".."))
+		if (!path_elide_dotdot(path, i, 3))
+			return 0;
+
+	/* 4. eliminate trailing ..*/
+	if ((i = strstr(path, "..")) != NULL)
+		if (!path_elide_dotdot(path, i, 2))
+			return 0;
+
+	return 1;
+}
+
+static int
+parse_query(struct parser *p)
+{
+	p->parsed->query = p->uri;
+	if (*p->uri == '\0')
+		return 1;
+
+	while (UNRESERVED(*p->uri)
+	    || SUB_DELIMITERS(*p->uri)
+	    || *p->uri == '/'
+	    || *p->uri == '?'
+	    || parse_pct_encoded(p))
+		p->uri++;
+
+	if (*p->uri != '\0' && *p->uri != '#') {
+		p->err = "illegal character in query";
+		return 0;
+	}
+
+	if (*p->uri != '\0') {
+		*p->uri = '\0';
+		p->uri++;
+	}
+
+	return 1;
+}
+
+/* don't even bother */
+static int
+parse_fragment(struct parser *p)
+{
+	p->parsed->fragment = p->uri;
+	return 1;
+}
+
+/* XXX: is it too broad? */
+/* *(pchar / "/") */
+static int
+parse_path(struct parser *p)
+{
+	char c;
+
+	p->parsed->path = p->uri;
+	if (*p->uri == '\0') {
+		p->parsed->query = p->parsed->fragment = p->uri;
+		return 1;
+	}
+
+	while (UNRESERVED(*p->uri)
+	    || SUB_DELIMITERS(*p->uri)
+	    || *p->uri == '/'
+	    || parse_pct_encoded(p))
+		p->uri++;
+
+	if (*p->uri != '\0' && *p->uri != '?' && *p->uri != '#') {
+		p->err = "illegal character in path";
+		return 0;
+	}
+
+	if (*p->uri != '\0') {
+		c = *p->uri;
+		*p->uri = '\0';
+		p->uri++;
+
+		if (c == '#') {
+			if (!parse_fragment(p))
+				return 0;
+		} else
+			if (!parse_query(p) || !parse_fragment(p))
+				return 0;
+	}
+
+	if (!path_clean(p->parsed->path)) {
+		p->err = "illegal path";
+		return 0;
+	}
+
+	return 1;
+}
+
+int
+parse_uri(char *uri, struct uri *ret, const char **err_ret)
+{
+	char *end;
+	struct parser p = {uri, ret, NULL};
+
+	bzero(ret, sizeof(*ret));
+
+	/* initialize optional stuff to the empty string */
+	end = uri + strlen(uri);
+	p.parsed->port = end;
+	p.parsed->path = end;
+	p.parsed->query = end;
+	p.parsed->fragment = end;
+
+	if (!parse_scheme(&p) || !parse_authority(&p) || !parse_path(&p)) {
+		*err_ret = p.err;
+		return 0;
+	}
+
+	*err_ret = NULL;
+	return 1;
+}
+
+int
+trim_req_uri(char *uri)
+{
+	char *i;
+
+	if ((i = strstr(uri, "\r\n")) == NULL)
+		return 0;
+	*i = '\0';
+	return 1;
+}
diff --git a/uri_test.c b/uri_test.c
new file mode 100644
index 0000000..c6521f6
--- /dev/null
+++ b/uri_test.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2020 Omar Polo <op@omarpolo.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <err.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "gmid.h"
+
+#define TEST(uri, fail, exp, descr)				\
+	if (!run_test(uri, fail, exp)) {			\
+		fprintf(stderr, "%s:%d: error: %s\n",		\
+		    __FILE__, __LINE__, descr);			\
+		exit(1);					\
+	}
+
+#define URI(schema, host, port, path, query, frag)		\
+	((struct uri){schema, host, port, 0, path, query, frag})
+
+#define DIFF(wanted, got, field)					\
+	if (wanted->field == NULL || got->field == NULL ||		\
+	    strcmp(wanted->field, got->field)) {			\
+		fprintf(stderr, #field ":\n\tgot: %s\n\twanted: %s\n",	\
+		    got->field, wanted->field);				\
+		return 0;						\
+	}
+
+#define PASS 0
+#define FAIL 1
+
+int
+diff_uri(struct uri *p, struct uri *exp)
+{
+        DIFF(p, exp, schema);
+        DIFF(p, exp, host);
+        DIFF(p, exp, port);
+        DIFF(p, exp, path);
+        DIFF(p, exp, query);
+        DIFF(p, exp, fragment);
+	return 1;
+}
+
+int
+run_test(const char *uri, int should_fail, struct uri expected)
+{
+	int failed, ok = 1;
+	char *uri_copy;
+	struct uri parsed;
+	const char *error;
+
+	if ((uri_copy = strdup(uri)) == NULL)
+		err(1, "strdup");
+
+	fprintf(stderr, "=> %s\n", uri);
+	failed = !parse_uri(uri_copy, &parsed, &error);
+
+	if (failed && should_fail)
+		goto done;
+
+	if (error != NULL)
+		fprintf(stderr, "> %s\n", error);
+
+	ok = !failed && !should_fail;
+	if (ok)
+		ok = diff_uri(&expected, &parsed);
+
+done:
+	free(uri_copy);
+	return ok;
+}
+
+int
+main(void)
+{
+	struct uri empty = {"", "", "", PASS, "", "", ""};
+
+	TEST("http://omarpolo.com",
+	    PASS,
+	    URI("http", "omarpolo.com", "", "", "", ""),
+	    "can parse uri with empty path");
+
+	/* schema */
+	TEST("omarpolo.com", FAIL, empty, "FAIL when the schema is missing");
+	TEST("gemini:/omarpolo.com", FAIL, empty, "FAIL with invalid marker");
+	TEST("gemini//omarpolo.com", FAIL, empty, "FAIL with invalid marker");
+	TEST("h!!p://omarpolo.com", FAIL, empty, "FAIL with invalid schema");
+
+	/* authority */
+	TEST("gemini://omarpolo.com",
+	    PASS,
+	    URI("gemini", "omarpolo.com", "", "", "", ""),
+	    "can parse authority with empty path");
+	TEST("gemini://omarpolo.com/",
+	    PASS,
+	    URI("gemini", "omarpolo.com", "", "", "", ""),
+	    "can parse authority with empty path (alt)")
+	TEST("gemini://omarpolo.com:1965",
+	    PASS,
+	    URI("gemini", "omarpolo.com", "1965", "", "", ""),
+	    "can parse with port and empty path");
+	TEST("gemini://omarpolo.com:1965/",
+	    PASS,
+	    URI("gemini", "omarpolo.com", "1965", "", "", ""),
+	    "can parse with port and empty path")
+	TEST("gemini://omarpolo.com:196s",
+	    FAIL,
+	    empty,
+	    "FAIL with invalid port number");
+
+	/* path */
+	TEST("gemini://omarpolo.com/foo/bar/baz",
+	    PASS,
+	    URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""),
+	    "parse simple paths");
+	TEST("gemini://omarpolo.com/foo//bar///baz",
+	    PASS,
+	    URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""),
+	    "parse paths with multiple slashes");
+	TEST("gemini://omarpolo.com/foo/./bar/./././baz",
+	    PASS,
+	    URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""),
+	    "parse paths with . elements");
+	TEST("gemini://omarpolo.com/foo/bar/../bar/baz",
+	    PASS,
+	    URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""),
+	    "parse paths with .. elements");
+	TEST("gemini://omarpolo.com/foo/../foo/bar/../bar/baz/../baz",
+	    PASS,
+	    URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""),
+	    "parse paths with multiple .. elements");
+	TEST("gemini://omarpolo.com/foo/..",
+	    PASS,
+            URI("gemini", "omarpolo.com", "", "", "", ""),
+	    "parse paths with a trailing ..");
+	TEST("gemini://omarpolo.com/foo/../",
+	    PASS,
+            URI("gemini", "omarpolo.com", "", "", "", ""),
+	    "parse paths with a trailing ..");
+	TEST("gemini://omarpolo.com/foo/../..",
+	    FAIL,
+            empty,
+	    "reject paths that would escape the root");
+
+	/* query */
+	TEST("foo://example.com/foo/?gne",
+	    PASS,
+	    URI("foo", "example.com", "", "foo/", "gne", ""),
+	    "parse query strings");
+	TEST("foo://example.com/foo/?gne&foo",
+	    PASS,
+	    URI("foo", "example.com", "", "foo/", "gne&foo", ""),
+	    "parse query strings");
+	TEST("foo://example.com/foo/?gne%2F",
+	    PASS,
+	    URI("foo", "example.com", "", "foo/", "gne/", ""),
+	    "parse query strings");
+
+	/* fragment */
+	TEST("foo://bar.co/#foo",
+	    PASS,
+	    URI("foo", "bar.co", "", "", "", "foo"),
+	    "can recognize fragments");
+
+	/* percent encoding */
+	TEST("foo://bar.com/caf%C3%A8.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "cafè.gmi", "", ""),
+	    "can decode");
+
+	return 0;
+}
author	Omar Polo <op@omarpolo.com>	2020-12-25 13:13:12 +0100
committer	Omar Polo <op@omarpolo.com>	2020-12-25 13:13:12 +0100
commit	33d32d1fd66a577f22f3f33f238e8dac44ec9995 (patch)
tree	f9010d36f92d9239d0b80c87d9b57ee10fd4776d
parent	d5aba4c791266e35cf79cec02dcd15267fb62f62 (diff)