switch to Bjoern Hoehrmann UTF-8 decoder

It's correct, while my hacked valid_multibyte_utf8 would allow things that aren't technically UTF8.
author: Omar Polo <op@omarpolo.com> 2021-01-09 20:32:23 +0000
committer: Omar Polo <op@omarpolo.com> 2021-01-09 20:32:23 +0000
commit: ef04b55160759b22db67f14c703a4343c4741e8b (patch)
tree: fff26b0d56a0506b2ef0ebb84ffa50f2d655ebe6 /utf8.c
parent: 578ba2d81b2fa5f839314190f42bb9116069db7a (diff)
1 files changed, 79 insertions, 0 deletions
diff --git a/utf8.c b/utf8.c
new file mode 100644
index 0000000..81a7584
--- /dev/null
+++ b/utf8.c
@@ -0,0 +1,79 @@
+/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "gmid.h"
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 1
+
+static const uint8_t utf8d[] = {
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+static uint32_t inline
+utf8_decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
+	uint32_t type = utf8d[byte];
+
+	*codep = (*state != UTF8_ACCEPT) ?
+		(byte & 0x3fu) | (*codep << 6) :
+		(0xff >> type) & (byte);
+
+	*state = utf8d[256 + *state*16 + type];
+	return *state;
+}
+
+/* for the iri parser.  Modelled after printCodePoints */
+int
+valid_multibyte_utf8(struct parser *p)
+{
+	uint32_t cp = 0, state = 0;
+
+        for (; *p->uri; p->uri++)
+		if (!utf8_decode(&state, &cp, *p->uri))
+			break;
+
+	/* reject the ASCII range */
+	if (state || cp <= 0x7F) {
+		/* XXX: do some error recovery? */
+		if (state)
+			p->err = "invalid UTF-8 character";
+		return 0;
+	}
+	return 1;
+}
author	Omar Polo <op@omarpolo.com>	2021-01-09 20:32:23 +0000
committer	Omar Polo <op@omarpolo.com>	2021-01-09 20:32:23 +0000
commit	ef04b55160759b22db67f14c703a4343c4741e8b (patch)
tree	fff26b0d56a0506b2ef0ebb84ffa50f2d655ebe6 /utf8.c
parent	578ba2d81b2fa5f839314190f42bb9116069db7a (diff)