implement UTF-8 conversion

i have no idea whether this works, the only thing i have is PTSD
2021-07-14 16:30:19 +02:00 · 2021-07-14 16:30:19 +02:00 · 30a29150c3
commit 30a29150c3
parent bd9297febe
5 changed files with 345 additions and 1 deletions
--- a/include/neo/_types.h
+++ b/include/neo/_types.h
@ -18,6 +18,9 @@ typedef __UINT64_TYPE__		u64;
 typedef __SIZE_TYPE__		usize;
 typedef __PTRDIFF_TYPE__	isize;

+/** A single Unicode character (32 bits) */
+typedef u32			nchar;
+
 typedef float		f32;
 typedef double		f64;
 typedef long double	f128;
@ -35,12 +38,15 @@ struct _neo_nref {
 };
 /**
 * A basic reference counter for data structures.
- * Embed this into your data structure as the field `__neo_nref` and
+ * Embed this into your data structure as the field `__neo_nref`, initialize
+ * it using `nref_init`, and use `nget` and `nput` to increment/decrement the
+ * reference counter.
 */
 typedef struct _neo_nref nref_t;
 #define NREF_FIELD nref_t __neo_nref

 struct _neo_string {
+	/* The *amount of Unicode code points*, NOT amount of bytes */
 	NLEN_FIELD(_len);
 	NREF_FIELD;
 	usize _capacity;
--- a/include/neo/utf.h
+++ b/include/neo/utf.h
@ -0,0 +1,102 @@
+/** See the end of this file for copyright and license terms. */
+
+/**
+ * @file Conversion utilities for raw UTF.
+ *
+ * Note that libneo strings already have native UTF-8 support, so you only
+ * really need this if you explicitly need to deal with different encodings or
+ * raw bytes, in which case i am very sorry.
+ *
+ * Special attention must be paid to methods reading raw UTF from `char *`
+ * parameters, as they may read a few bytes beyond the memory region allocated
+ * for them if the string contains a malformed UTF sequence.
+ *
+ * Believe me when i say you do not want to use these.
+ */
+
+#pragma once
+
+#include "neo/_types.h"
+
+/**
+ * Check whether a NUL terminated string is valid UTF-8.
+ *
+ * If the string contains any malformed code sequences, an error is yeeted.
+ *
+ * @param s: String to validate
+ * @param err: Error pointer
+ * @returns The number of UTF-8 code points (i.e. number of Unicode characters)
+ *	excluding the terminating NUL byte; undefined on error
+ */
+usize utf8_check(const char *restrict s, error *err);
+
+/**
+ * Compute the length of a raw UTF-8 encoded, NUL terminated string.
+ *
+ * The string is *not* checked for malformed code sequences,
+ * use `utf8_check` for that.
+ *
+ * @param s: String to get the length of
+ * @returns String length as in Unicode code points (not bytes),
+ *	excluding the terminating NUL byte
+ */
+usize utf8_strlen(const char *restrict s);
+
+/**
+ * Get the amount of bytes a Unicode character takes up in UTF-8.
+ *
+ * If the character is outside of the Unicode range (`0x00000000`~`0x0010ffff`),
+ * an error is yeeted.
+ *
+ * @param c: The character
+ * @param err: Error pointer
+ * @returns The amount of bytes needed to store the character in UTF-8 encoding,
+ *	which is always between 1 and 4 except on errors
+ */
+usize utf8_chrsize(nchar c, error *err);
+
+/**
+ * UTF-8 encode a Unicode character and store it in `dest` without terminator.
+ *
+ * The buffer needs to hold at least 4 bytes.  If the character is outside of
+ * the Unicode range (`0x00000000`~`0x0010ffff`), an error is yeeted and the
+ * buffer is not modified.
+ *
+ * @param dest: Where to store the encoded character (*not* NUL terminated)
+ * @param c: Character to encode
+ * @param err: Error pointer
+ * @returns The amount of bytes taken up by the character,
+ *	which is always between 1 and 4 except on errors
+ */
+usize utf8_from_nchr(char *restrict dest, nchar c, error *err);
+
+/**
+ * Decode a UTF-8 character and store it in `c`.
+ *
+ * If the character encoding is malformed, an error is yeeted and `c` is set to
+ * the ASCII NUL character.  The encoded character does not need to be NUL
+ * terminated.  The amount of bytes read from the buffer depends on the first
+ * byte which marks the beginning of the Unicode code point.  Keep in mind that
+ * this may cause the method to read up to 3 bytes over the end of the buffer
+ * if the code sequence is malformed.
+ *
+ * @param c: Where to store the decoded character
+ * @param utf8chr: UTF-8 encoded character sequence
+ * @param err: Error pointer
+ * @returns The amount of bytes the character took up when encoded as UTF-8,
+ *	which is always between 1 and 4 except on errors
+ */
+usize utf8_to_nchr(nchar *c, const char *restrict utf8chr, error *err);
+
+/*
+ * This file is part of libneo.
+ * Copyright (c) 2021 Fefie <owo@fef.moe>.
+ *
+ * libneo is non-violent software: you may only use, redistribute,
+ * and/or modify it under the terms of the CNPLv6+ as found in
+ * the LICENSE file in the source code root directory or at
+ * <https://git.pixie.town/thufie/CNPL>.
+ *
+ * libneo comes with ABSOLUTELY NO WARRANTY, to the extent
+ * permitted by applicable law.  See the CNPLv6+ for details.
+ */
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -30,6 +30,8 @@ target_sources(neo PRIVATE
    ./nref.c
 )

+include(./string/string.cmake)
+
 # This file is part of libneo.
 # Copyright (c) 2021 Fefie <owo@fef.moe>.
 #
--- a/src/string/string.cmake
+++ b/src/string/string.cmake
@ -0,0 +1,3 @@
+target_sources(neo PRIVATE
+    ./string/utf.c
+)
--- a/src/string/utf.c
+++ b/src/string/utf.c
@ -0,0 +1,231 @@
+/** See the end of this file for copyright and license terms. */
+
+/*
+ * The UTF-8 conversion functions are based on the branchless UTF-8 decoder by
+ * Christopher Wellons, which is in the public domain.  For the original, see
+ * <https://github.com/skeeto/branchless-utf8/blob/f2d0e24c3864d726cd009901726df4778ad3e0d5/utf8.h>
+ */
+
+#include <errno.h>
+#include <stdint.h>
+
+#include "neo/_error.h"
+#include "neo/_nalloc.h"
+#include "neo/_types.h"
+#include "neo/utf.h"
+
+/*
+ * Walk a NUL terminated string one UTF-8 sequence at a time and count the
+ * sequences.  Decoding is delegated to utf8_to_nchr(); the walk stops at the
+ * first sequence for which it reports an error through `err`.  The sequence
+ * being decoded is counted before it is validated, so the return value is
+ * only meaningful if no error was reported.
+ */
+usize utf8_check(const char *restrict s, error *err)
+{
+	usize count;
+	usize step;
+	nchar decoded;
+
+	for (count = 0; *s != '\0'; s += step) {
+		count++;
+		/* the decoded value is discarded, only the error state matters */
+		step = utf8_to_nchr(&decoded, s, err);
+		catch(err) {
+			break;
+		}
+	}
+
+	return count;
+}
+
+/*
+ * Count the code points in a NUL terminated UTF-8 string without validating
+ * it.  Every byte that is not a continuation byte (0b10xxxxxx) is treated as
+ * the start of a new code point.
+ */
+usize utf8_strlen(const char *restrict s)
+{
+	usize count = 0;
+	const char *pos;
+
+	for (pos = s; *pos != '\0'; pos++) {
+		/* continuation bytes belong to the code point before them */
+		if ((*pos & 0xc0) != 0x80)
+			count++;
+	}
+
+	return count;
+}
+
+/*
+ * Get the amount of bytes `c` takes up when encoded as UTF-8.
+ *
+ * This is the definition for the `utf8_chrsize` prototype in neo/utf.h.
+ *
+ * @param c: The character
+ * @param err: Error pointer; EINVAL is yeeted if `c` is above 0x0010ffff
+ * @returns A value between 1 and 4, or 0 if `c` is out of range
+ */
+usize utf8_chrsize(nchar c, error *err)
+{
+	usize ret;
+
+	if (c > 0x0010ffff) {
+		ret = 0;
+		yeet(err, EINVAL, "Character code not within Unicode range");
+	} else {
+		/* one byte, plus one for every encoding threshold that is exceeded */
+		ret = 1;
+		ret += c > 0x7f;
+		ret += c > 0x07ff;
+		ret += c > 0xffff;
+		neat(err);
+	}
+
+	return ret;
+}
+
+/*
+ * Alternative spelling of utf8_chrsize(), kept because the other functions
+ * in this file call it under this name.
+ */
+usize utf8_nchr_size(nchar c, error *err)
+{
+	return utf8_chrsize(c, err);
+}
+
+/*
+ * From RFC 3629, Section 3:
+ *
+ * Char. number range  |        UTF-8 octet sequence
+ *    (hexadecimal)    |              (binary)
+ * --------------------+---------------------------------------------
+ * 0000 0000-0000 007F | 0xxxxxxx
+ * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+ * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+ * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * See <https://datatracker.ietf.org/doc/html/rfc3629#section-3>
+ */
+
+/*
+ * UTF-8 encode `c` into `dest`.
+ *
+ * Exactly as many bytes as the return value says are written (at most 4) and
+ * no NUL terminator is appended, so a 4 byte buffer is always sufficient.
+ * If utf8_nchr_size() reports an error for `c`, the buffer is left untouched.
+ *
+ * @param dest: Where to store the encoded character (*not* NUL terminated)
+ * @param c: Character to encode
+ * @param err: Error pointer, passed on to utf8_nchr_size()
+ * @returns The amount of bytes written, or 0 on error
+ */
+usize utf8_from_nchr(char *restrict dest, nchar c, error *err)
+{
+	/* Marker bits of the start byte, indexed by sequence length */
+	static const unsigned char prefixes[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
+
+	usize utf8_size = utf8_nchr_size(c, err);
+	catch(err) {
+		return 0;
+	}
+
+	/* start at the last byte of the sequence and fill it back to front */
+	dest += utf8_size - 1;
+
+	switch (utf8_size) {
+	case 4:
+		*dest-- = (char)( 0x80 | (c & 0x3f) );
+		c >>= 6;
+		/* fall through */
+	case 3:
+		*dest-- = (char)( 0x80 | (c & 0x3f) );
+		c >>= 6;
+		/* fall through */
+	case 2:
+		*dest-- = (char)( 0x80 | (c & 0x3f) );
+		c >>= 6;
+		/* fall through */
+	case 1:
+		/*
+		 * we don't need a bitmask for c here because utf8_nchr_size
+		 * already did the validation work for us so we know that c
+		 * doesn't have any upper bits it shouldn't
+		 */
+		*dest = (char)( prefixes[utf8_size] | c );
+		break;
+	}
+
+	return utf8_size;
+}
+
+/*
+ * TODO: This (almost) branchless implementation is all fancy and shit, but it
+ *       may read up to three bytes beyond the memory area allocated for the
+ *       input buffer if it is passed a malformed code sequence.  Someone
+ *       (hopefully not me lmao) should decide whether potentially overreading
+ *       3 bytes is worth the considerable speed gain from this design.
+ */
+
+/*
+ * Decode a single UTF-8 sequence starting at `utf8chr` and store the result
+ * in `dest`.
+ *
+ * The sequence length is derived from the start byte alone, and all bytes of
+ * that length are read unconditionally before anything is validated.  If the
+ * start byte is invalid, a tail byte does not begin with 0b10, or the value
+ * is stored in more bytes than necessary, EINVAL is yeeted and `dest` is set
+ * to 0.
+ *
+ * NOTE(review): surrogates (U+D800..U+DFFF) and values above U+10FFFF (start
+ * bytes 0xf5..0xf7) are *not* rejected here even though RFC 3629 forbids
+ * them; decide whether the decoder should be that strict.
+ *
+ * @param dest: Where to store the decoded character
+ * @param utf8chr: UTF-8 encoded character sequence (no terminator required)
+ * @param err: Error pointer
+ * @returns The sequence length according to the start byte (1 to 4),
+ *	or 0 if the start byte is invalid
+ */
+usize utf8_to_nchr(nchar *dest, const char *restrict utf8chr, error *err)
+{
+	/* Expected sequence length per the 5 MSBs of the start byte */
+	static const uint_fast8_t lengths[] = {
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0,
+	};
+	/* Payload bitmask for first byte in the sequence per sequence length */
+	static const unsigned char cmasks[] = {    0x00, 0x7f, 0x1f,  0x0f,    0x07 };
+	/*
+	 * Minimum Unicode values per sequence length.  Length 0 is reported
+	 * through the "invalid start byte" flag, not as non-canonical.
+	 */
+	static const nchar        mins[]   = {       0,    0, 0x80, 0x800, 0x10000 };
+	/* Bits of eflags that are meaningful per sequence length */
+	static const uint_fast8_t emasks[] = {    0x03, 0x03, 0x0f,  0x3f,    0xff };
+
+	/*
+	 * Look at the input as unsigned bytes: where plain char is signed,
+	 * bytes >= 0x80 would be negative and index far outside of `lengths`.
+	 */
+	const unsigned char *bytes = (const unsigned char *)utf8chr;
+
+	uint_fast8_t cshift = 0;
+	/*
+	 * 0xff bitmask just in case we are on a really odd
+	 * architecture where char is wider than 8 bits
+	 */
+	uint_fast8_t len = lengths[(bytes[0] & 0xff) >> 3];
+	/*
+	 * 7-6: two MSBs of the fourth byte in the sequence, must be 0b10
+	 * 5-4: two MSBs of the third byte in the sequence, must be 0b10
+	 * 3-2: two MSBs of the second byte in the sequence, must be 0b10
+	 * 1:   whether the start byte is invalid
+	 * 0:   whether non-canonical encoding is used
+	 */
+	uint_fast8_t eflags = 0;
+	nchar c = '\0';
+
+	switch (len) {
+	case 4:
+		c = (nchar)(bytes[3] & 0x3f);			/* 10xx xxxx */
+		cshift += 6;
+		eflags |= bytes[3] & 0xc0;
+		/* fall through */
+	case 3:
+		c |= (nchar)(bytes[2] & 0x3f) << cshift;	/* 10xx xxxx */
+		cshift += 6;
+		eflags |= (bytes[2] & 0xc0) >> 2;
+		/* fall through */
+	case 2:
+		c |= (nchar)(bytes[1] & 0x3f) << cshift;	/* 10xx xxxx */
+		cshift += 6;
+		eflags |= (bytes[1] & 0xc0) >> 4;
+		/* fall through */
+	case 1:
+		c |= (nchar)(bytes[0] & cmasks[len]) << cshift;
+		break;
+	case 0:
+		eflags |= 0x02;
+		break;
+	}
+
+	/* UTF-8 mandates each char be stored in as few bytes as possible */
+	eflags |= c < mins[len];
+
+	/*
+	 * Bits 7-2 in eflags store the respective 2 MSBs of each tail byte
+	 * in the sequence which must all start with 0b10, therefore eflags
+	 * should be 0b101010xx if we have a four byte sequence.  Toggling the
+	 * bits we expect to be one (0xa8) zeroes bits 7-2 if they are correct,
+	 * then we just need to mask out the bits that are unused if the
+	 * sequence is less than 4 bytes (emasks).
+	 */
+	eflags ^= 0xa8;
+	eflags &= emasks[len];
+
+	if (eflags != 0) {
+		/*
+		 * Errors are expected to be rare, so it's okay to use a bunch
+		 * of if statements in favor of accurate error descriptions
+		 * (and yeet is slow af anyway because it uses vsnprintf)
+		 */
+		*dest = '\0';
+		if ((eflags & 0x01) != 0) {
+			yeet(err, EINVAL,
+			     "Non canonical UTF-8 encoding: %zu byte character stored in %u bytes",
+			     utf8_nchr_size(c, nil), (unsigned int)len);
+		} else if ((eflags & 0x02) != 0) {
+			yeet(err, EINVAL, "Illegal UTF-8 sequence start byte: 0x%02x",
+			     (unsigned int)bytes[0]);
+		} else if ((eflags & 0x0c) != 0) {
+			yeet(err, EINVAL, "Byte 2 in UTF-8 sequence invalid: 0x%02x",
+			     (unsigned int)bytes[1]);
+		} else if ((eflags & 0x30) != 0) {
+			yeet(err, EINVAL, "Byte 3 in UTF-8 sequence invalid: 0x%02x",
+			     (unsigned int)bytes[2]);
+		} else if ((eflags & 0xc0) != 0) {
+			yeet(err, EINVAL, "Byte 4 in UTF-8 sequence invalid: 0x%02x",
+			     (unsigned int)bytes[3]);
+		} else {
+			yeet(err, EINVAL, "Unexpected decoding error");
+		}
+	} else {
+		*dest = c;
+		neat(err);
+	}
+
+	return len;
+}
+
+/*
+ * This file is part of libneo.
+ * Copyright (c) 2021 Fefie <owo@fef.moe>.
+ *
+ * libneo is non-violent software: you may only use, redistribute,
+ * and/or modify it under the terms of the CNPLv6+ as found in
+ * the LICENSE file in the source code root directory or at
+ * <https://git.pixie.town/thufie/CNPL>.
+ *
+ * libneo comes with ABSOLUTELY NO WARRANTY, to the extent
+ * permitted by applicable law.  See the CNPLv6+ for details.
+ */