From 30a29150c3e047258c7d754e93be7078dde0653c Mon Sep 17 00:00:00 2001 From: fef Date: Wed, 14 Jul 2021 16:30:19 +0200 Subject: [PATCH] implement UTF-8 conversion i have no idea whether this works, the only thing i have is PTSD --- include/neo/_types.h | 8 +- include/neo/utf.h | 102 ++++++++++++++++++ src/CMakeLists.txt | 2 + src/string/string.cmake | 3 + src/string/utf.c | 231 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 345 insertions(+), 1 deletion(-) create mode 100644 include/neo/utf.h create mode 100644 src/string/string.cmake create mode 100644 src/string/utf.c diff --git a/include/neo/_types.h b/include/neo/_types.h index 0bd0e79..95ab0a9 100644 --- a/include/neo/_types.h +++ b/include/neo/_types.h @@ -18,6 +18,9 @@ typedef __UINT64_TYPE__ u64; typedef __SIZE_TYPE__ usize; typedef __PTRDIFF_TYPE__ isize; +/** A single Unicode character (32 bits) */ +typedef u32 nchar; + typedef float f32; typedef double f64; typedef long double f128; @@ -35,12 +38,15 @@ struct _neo_nref { }; /** * A basic reference counter for data structures. - * Embed this into your data structure as the field `__neo_nref` and + * Embed this into your data structure as the field `__neo_nref`, initialize + * it using `nref_init`, and use `nget` and `nput` to increment/decrement the + * reference counter. */ typedef struct _neo_nref nref_t; #define NREF_FIELD nref_t __neo_nref struct _neo_string { + /* The *amount of Unicode code points*, NOT amount of bytes */ NLEN_FIELD(_len); NREF_FIELD; usize _capacity; diff --git a/include/neo/utf.h b/include/neo/utf.h new file mode 100644 index 0000000..7694518 --- /dev/null +++ b/include/neo/utf.h @@ -0,0 +1,102 @@ +/** See the end of this file for copyright and license terms. */ + +/** + * @file Conversion utilities for raw UTF. + * + * Note that libneo strings already have native UTF-8 support, so you only + * really need this if you explicitly need to deal with different encodings or + * raw bytes, in which case i am very sorry. 
+ * + * Special attention must be paid to methods reading raw UTF from `char *` + * parameters, as they may read a few bytes beyond the memory region allocated + * for them if the string contains a malformed UTF sequence. + * + * Believe me when i say you do not want to use these. + */ + +#pragma once + +#include "neo/_types.h" + +/** + * Check whether a NUL terminated string is valid UTF-8. + * + * If the string contains any malformed code sequences, an error is yeeted. + * + * @param s: String to validate + * @param err: Error pointer + * @returns The number of UTF-8 code points (i.e. number of Unicode characters) + * excluding the terminating NUL byte; undefined on error + */ +usize utf8_check(const char *restrict s, error *err); + +/** + * Compute the length of a raw UTF-8 encoded, NUL terminated string. + * + * The string is *not* checked for malformed code sequences, + * use `utf8_check` for that. + * + * @param s: String to get the length of + * @returns String length as in Unicode code points (not bytes), + * excluding the terminating NUL byte + */ +usize utf8_strlen(const char *restrict s); + +/** + * Get the amount of bytes a Unicode character takes up in UTF-8. + * + * If the character is outside of the Unicode range (`0x00000000`~`0x0010ffff`), + * an error is yeeted. + * + * @param c: The character + * @param err: Error pointer + * @returns The amount of bytes needed to store the character in UTF-8 encoding, + * which is always between 1 and 4, or 0 on error + */ +usize utf8_nchr_size(nchar c, error *err); + +/** + * UTF-8 encode a Unicode character and store it in `dest` without terminator. + * + * The buffer needs to hold at least 4 bytes. If the character is outside of + * the Unicode range (`0x00000000`~`0x0010ffff`), an error is yeeted and the + * buffer is not modified. 
+ * + * @param dest: Where to store the encoded character (*not* NUL terminated) + * @param c: Character to encode + * @param err: Error pointer + * @returns The amount of bytes taken up by the character, + * which is always between 1 and 4, or 0 if the character is out of range + */ +usize utf8_from_nchr(char *restrict dest, nchar c, error *err); + +/** + * Decode a UTF-8 character and store it in `c`. + * + * If the character encoding is malformed, an error is yeeted and `c` is set to + * the ASCII NUL character. The encoded character does not need to be NUL + * terminated. The amount of bytes read from the buffer depends on the first + * byte which marks the beginning of the Unicode code point. Keep in mind that + * this may cause the method to read up to 3 bytes over the end of the buffer + * if the code sequence is malformed. + * + * @param c: Where to store the decoded character + * @param utf8chr: UTF-8 encoded character sequence + * @param err: Error pointer + * @returns The sequence length announced by the first byte, which is always + * between 1 and 4, or 0 if the first byte is not a valid start byte + */ +usize utf8_to_nchr(nchar *c, const char *restrict utf8chr, error *err); + +/* + * This file is part of libneo. + * Copyright (c) 2021 Fefie . + * + * libneo is non-violent software: you may only use, redistribute, + * and/or modify it under the terms of the CNPLv6+ as found in + * the LICENSE file in the source code root directory or at + * . + * + * libneo comes with ABSOLUTELY NO WARRANTY, to the extent + * permitted by applicable law. See the CNPLv6+ for details. + */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 522b30c..3d8a927 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,6 +30,8 @@ target_sources(neo PRIVATE ./nref.c ) +include(./string/string.cmake) + # This file is part of libneo. # Copyright (c) 2021 Fefie . 
# diff --git a/src/string/string.cmake b/src/string/string.cmake new file mode 100644 index 0000000..993935b --- /dev/null +++ b/src/string/string.cmake @@ -0,0 +1,3 @@ +target_sources(neo PRIVATE + ./string/utf.c +) diff --git a/src/string/utf.c b/src/string/utf.c new file mode 100644 index 0000000..0d67dd5 --- /dev/null +++ b/src/string/utf.c @@ -0,0 +1,231 @@ +/** See the end of this file for copyright and license terms. */ + +/* + * The UTF-8 conversion functions are based on the branchless UTF-8 decoder by + * Christopher Wellons, which is in the public domain. For the original, see + * <https://github.com/skeeto/branchless-utf8> + */ + +#include <errno.h> +#include <stdint.h> + +#include "neo/_error.h" +#include "neo/_nalloc.h" +#include "neo/_types.h" +#include "neo/utf.h"
+
+/*
+ * Validate a NUL terminated UTF-8 string by decoding it one character at a
+ * time, and count the characters along the way.
+ *
+ * Decoding stops at the first character that fails to decode.  The character
+ * that failed is already included in the returned count, which is why the
+ * return value is only meaningful if no error was yeeted.
+ *
+ * NOTE(review): utf8_to_nchr reads as many bytes as the start byte announces,
+ * so a sequence that is cut off by the terminating NUL is read past the end
+ * of the string before the error is detected.
+ */
+usize utf8_check(const char *restrict s, error *err)
+{
+	usize count = 0;
+	nchar c;
+
+	while (*s != '\0') {
+		usize advance;
+
+		count++;
+		advance = utf8_to_nchr(&c, s, err);
+		catch(err) {
+			break;
+		}
+		if (advance == 0) {
+			/*
+			 * The decoder consumed nothing, which only happens for
+			 * an illegal start byte.  Without this guard the loop
+			 * would spin on the same byte forever if the decoder
+			 * did not raise an error that catch() can see.
+			 */
+			yeet(err, EINVAL, "Illegal UTF-8 sequence start byte: 0x%02x",
+			     (unsigned int)(unsigned char)*s);
+			break;
+		}
+		s += advance;
+	}
+
+	return count;
+}
+
+/*
+ * Count the code points in a NUL terminated UTF-8 string without validating
+ * it.  Every byte that is not a continuation byte (10xxxxxx) starts a new
+ * code point, so only those are counted.
+ */
+usize utf8_strlen(const char *restrict s)
+{
+	usize len = 0;
+
+	for (; *s != '\0'; s++) {
+		if ((*s & 0xc0) != 0x80)
+			len++;
+	}
+
+	return len;
+}
+
+/*
+ * Return the number of bytes (1 to 4) needed to encode `c` in UTF-8.
+ * If `c` is greater than 0x10ffff, EINVAL is yeeted and 0 is returned.
+ */
+usize utf8_nchr_size(nchar c, error *err)
+{
+	usize ret;
+
+	if (c > 0x0010ffff) {
+		ret = 0;
+		yeet(err, EINVAL, "Character code not within Unicode range");
+	} else {
+		/* one byte, plus one for every range boundary c exceeds */
+		ret = 1;
+		ret += c > 0x7f;
+		ret += c > 0x07ff;
+		ret += c > 0xffff;
+		neat(err);
+	}
+
+	return ret;
+}
+
+/* + * From RFC 3629, Section 3: + * + * Char. 
number range | UTF-8 octet sequence + * (hexadecimal) | (binary) + * --------------------+--------------------------------------------- + * 0000 0000-0000 007F | 0xxxxxxx + * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx + * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * See <https://www.rfc-editor.org/rfc/rfc3629#section-3> + */
+
+/*
+ * Encode `c` as UTF-8 and write the resulting 1 to 4 bytes to `dest`.
+ *
+ * No NUL terminator is written, so a 4 byte buffer is always sufficient.
+ * If `c` is out of range, the error raised by utf8_nchr_size is passed on,
+ * `dest` is left untouched and 0 is returned.
+ */
+usize utf8_from_nchr(char *restrict dest, nchar c, error *err)
+{
+	/* Marker bits of the start byte, indexed by sequence length */
+	static const unsigned char prefixes[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
+
+	usize utf8_size = utf8_nchr_size(c, err);
+	catch(err) {
+		return 0;
+	}
+	/* utf8_nchr_size returns 0 exactly when it rejected the character */
+	if (utf8_size == 0)
+		return 0;
+
+	/* Fill the buffer back to front, 6 payload bits per tail byte */
+	char *pos = dest + utf8_size - 1;
+
+	switch (utf8_size) {
+	case 4:
+		*pos-- = (char)( 0x80 | (c & 0x3f) );
+		c >>= 6;
+		/* fall through */
+	case 3:
+		*pos-- = (char)( 0x80 | (c & 0x3f) );
+		c >>= 6;
+		/* fall through */
+	case 2:
+		*pos-- = (char)( 0x80 | (c & 0x3f) );
+		c >>= 6;
+		/* fall through */
+	case 1:
+		/*
+		 * we don't need a bitmask for c here because utf8_nchr_size
+		 * already did the validation work for us so we know that c
+		 * doesn't have any upper bits it shouldn't
+		 */
+		*pos = (char)( prefixes[utf8_size] | c );
+		break;
+	}
+
+	return utf8_size;
+}
+
+/* + * TODO: This (almost) branchless implementation is all fancy and shit, but it + * may read up to three bytes beyond the memory area allocated for the input + * buffer if it is passed a malformed code sequence. Someone (hopefully + * not me lmao) should decide whether potentially overreading 3 bytes is + * worth the considerable speed gain from this design. 
+ */
+
+/*
+ * Decode one UTF-8 sequence starting at `utf8chr` into `*dest`.
+ *
+ * The start byte determines how many bytes are read (1 to 4); tail bytes are
+ * read unconditionally, so a truncated sequence is read past its end.
+ * On any error `*dest` is set to 0 and EINVAL is yeeted.
+ *
+ * Returns the sequence length announced by the start byte, or 0 if the start
+ * byte is not a legal UTF-8 start byte.
+ *
+ * NOTE(review): UTF-16 surrogates (0xd800-0xdfff) are not rejected, matching
+ * what utf8_nchr_size accepts; RFC 3629 forbids them -- decide separately.
+ */
+usize utf8_to_nchr(nchar *dest, const char *restrict utf8chr, error *err)
+{
+	/* Expected sequence length per the 5 MSBs of the start byte */
+	static const uint_fast8_t lengths[32] = {
+		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+		0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0,
+	};
+	/* Payload bitmask for first byte in the sequence per sequence length */
+	static const unsigned char cmasks[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
+	/* Minimum Unicode values per sequence length (none for length 0) */
+	static const nchar mins[] = { 0, 0, 0x80, 0x800, 0x10000 };
+	/* eflags bits that carry meaning per sequence length */
+	static const uint_fast8_t emasks[] = { 0x03, 0x03, 0x0f, 0x3f, 0xff };
+
+	/*
+	 * char may be signed, in which case shifting a start byte >= 0x80
+	 * yields a negative value and an out of bounds table index.
+	 * Looking at the input as unsigned char keeps the index in 0-31.
+	 */
+	const unsigned char *bytes = (const unsigned char *)utf8chr;
+
+	uint_fast8_t cshift = 0;
+	uint_fast8_t len = lengths[bytes[0] >> 3];
+	/*
+	 * 7-6: two MSBs of the fourth byte in the sequence, must be 0b10
+	 * 5-4: two MSBs of the third byte in the sequence, must be 0b10
+	 * 3-2: two MSBs of the second byte in the sequence, must be 0b10
+	 *   1: whether the start byte is invalid
+	 *   0: whether non-canonical encoding is used
+	 */
+	uint_fast8_t eflags = 0;
+	nchar c = 0;
+
+	switch (len) {
+	case 4:
+		c = (nchar)(bytes[3] & 0x3f); /* 10xx xxxx */
+		cshift += 6;
+		eflags |= bytes[3] & 0xc0;
+		/* fall through */
+	case 3:
+		c |= (nchar)(bytes[2] & 0x3f) << cshift; /* 10xx xxxx */
+		cshift += 6;
+		eflags |= (bytes[2] & 0xc0) >> 2;
+		/* fall through */
+	case 2:
+		c |= (nchar)(bytes[1] & 0x3f) << cshift; /* 10xx xxxx */
+		cshift += 6;
+		eflags |= (bytes[1] & 0xc0) >> 4;
+		/* fall through */
+	case 1:
+		c |= (nchar)(bytes[0] & cmasks[len]) << cshift;
+		break;
+	default:
+		/* len == 0: not a start byte */
+		eflags |= 0x02;
+		break;
+	}
+
+	/*
+	 * UTF-8 mandates each char be stored in as few bytes as possible.
+	 * This must be OR'ed in; assigning would drop the flags from above.
+	 */
+	eflags |= c < mins[len];
+
+	/*
+	 * Bits 7-2 in eflags store the respective 2 MSBs of each tail byte
+	 * in the sequence which must all start with 0b10, therefore eflags
+	 * should be 0b101010xx if we have a four byte sequence.  Toggling the
+	 * bits we expect to be one (0xa8) zeroes bits 7-2 if they are correct,
+	 * then we just need to mask out the bits that are unused if the
+	 * sequence is less than 4 bytes (emasks).
+	 */
+	eflags ^= 0xa8;
+	eflags &= emasks[len];
+
+	/* A well-formed 4 byte sequence can still encode up to 0x1fffff */
+	if (eflags == 0 && c <= 0x0010ffff) {
+		*dest = c;
+		neat(err);
+		return len;
+	}
+
+	/*
+	 * Errors are expected to be rare, so it's okay to use a bunch
+	 * of if statements in favor of accurate error descriptions
+	 * (and yeet is slow af anyway because it uses vsnprintf).
+	 * Structural errors are reported first because the decoded value
+	 * is meaningless if any byte of the sequence is bogus.
+	 */
+	*dest = '\0';
+	if ((eflags & 0x02) != 0) {
+		yeet(err, EINVAL, "Illegal UTF-8 sequence start byte: 0x%02x",
+		     (unsigned int)bytes[0]);
+	} else if ((eflags & 0x0c) != 0) {
+		yeet(err, EINVAL, "Byte 2 in UTF-8 sequence invalid: 0x%02x",
+		     (unsigned int)bytes[1]);
+	} else if ((eflags & 0x30) != 0) {
+		yeet(err, EINVAL, "Byte 3 in UTF-8 sequence invalid: 0x%02x",
+		     (unsigned int)bytes[2]);
+	} else if ((eflags & 0xc0) != 0) {
+		yeet(err, EINVAL, "Byte 4 in UTF-8 sequence invalid: 0x%02x",
+		     (unsigned int)bytes[3]);
+	} else if ((eflags & 0x01) != 0) {
+		yeet(err, EINVAL,
+		     "Non canonical UTF-8 encoding: %zu byte character stored in %u bytes",
+		     utf8_nchr_size(c, nil), (unsigned int)len);
+	} else {
+		yeet(err, EINVAL, "Character code not within Unicode range");
+	}
+
+	return len;
+}
+
+/* + * This file is part of libneo. + * Copyright (c) 2021 Fefie . + * + * libneo is non-violent software: you may only use, redistribute, + * and/or modify it under the terms of the CNPLv6+ as found in + * the LICENSE file in the source code root directory or at + * . + * + * libneo comes with ABSOLUTELY NO WARRANTY, to the extent + * permitted by applicable law. See the CNPLv6+ for details. + */