implement UTF-8 conversion

i have no idea whether this works, the only thing i have is PTSD
This commit is contained in:
anna 2021-07-14 16:30:19 +02:00
parent bd9297febe
commit 30a29150c3
Signed by: fef
GPG key ID: EC22E476DC2D3D84
5 changed files with 345 additions and 1 deletions

View file

@ -18,6 +18,9 @@ typedef __UINT64_TYPE__ u64;
typedef __SIZE_TYPE__ usize;
typedef __PTRDIFF_TYPE__ isize;
/** A single Unicode character (32 bits) */
typedef u32 nchar;
typedef float f32;
typedef double f64;
typedef long double f128;
@ -35,12 +38,15 @@ struct _neo_nref {
};
/**
* A basic reference counter for data structures.
 * Embed this into your data structure as the field `__neo_nref`, initialize
 * it using `nref_init`, and use `nget` and `nput` to increment/decrement the
 * reference counter.
*/
typedef struct _neo_nref nref_t;
#define NREF_FIELD nref_t __neo_nref
struct _neo_string {
/* The *amount of Unicode code points*, NOT amount of bytes */
NLEN_FIELD(_len);
NREF_FIELD;
usize _capacity;

102
include/neo/utf.h Normal file
View file

@ -0,0 +1,102 @@
/** See the end of this file for copyright and license terms. */
/**
* @file Conversion utilities for raw UTF.
*
* Note that libneo strings already have native UTF-8 support, so you only
* really need this if you explicitly need to deal with different encodings or
* raw bytes, in which case i am very sorry.
*
* Special attention must be paid to methods reading raw UTF from `char *`
* parameters, as they may read a few bytes beyond the memory region allocated
* for them if the string contains a malformed UTF sequence.
*
* Believe me when i say you do not want to use these.
*/
#pragma once
#include "neo/_types.h"
/**
 * Check whether a NUL terminated string is valid UTF-8.
 *
 * The string is decoded one code point at a time using `utf8_to_nchr`, and
 * validation stops at the first malformed code sequence, at which point an
 * error is yeeted.  Because decoding is delegated to `utf8_to_nchr`, a string
 * ending in a truncated code sequence may cause a few bytes beyond the
 * terminating NUL byte to be read.
 *
 * @param s: String to validate
 * @param err: Error pointer
 * @returns The number of UTF-8 code points (i.e. number of Unicode characters)
 *	excluding the terminating NUL byte; undefined on error
 */
usize utf8_check(const char *restrict s, error *err);
/**
 * Compute the length of a raw UTF-8 encoded, NUL terminated string.
 *
 * Every byte that is not a continuation byte (`10xxxxxx`) is counted as the
 * start of one code point.  The string is *not* checked for malformed code
 * sequences, use `utf8_check` for that.
 *
 * @param s: String to get the length of
 * @returns String length as in Unicode code points (not bytes),
 *	excluding the terminating NUL byte
 */
usize utf8_strlen(const char *restrict s);
/**
 * Get the amount of bytes a Unicode character takes up in UTF-8.
 *
 * Following RFC 3629, characters up to `0x7f` take 1 byte, characters up to
 * `0x07ff` take 2 bytes, characters up to `0xffff` take 3 bytes and everything
 * else up to `0x0010ffff` takes 4 bytes.
 *
 * If the character is outside of the Unicode range (`0x00000000`~`0x0010ffff`),
 * an error is yeeted.
 *
 * @param c: The character
 * @param err: Error pointer
 * @returns The amount of bytes needed to store the character in UTF-8 encoding,
 *	which is always between 1 and 4 except on errors
 */
usize utf8_chrsize(nchar c, error *err);
/**
 * UTF-8 encode a Unicode character and store it in `dest` without terminator.
 *
 * The buffer needs to hold at least 4 bytes.  If the character is outside of
 * the Unicode range (`0x00000000`~`0x0010ffff`), an error is yeeted and the
 * buffer is not modified.
 *
 * NOTE(review): this is the contract callers are told to rely on; confirm
 * that the implementation never writes more than 4 bytes and leaves `dest`
 * untouched on error.
 *
 * @param dest: Where to store the encoded character (*not* NUL terminated)
 * @param c: Character to encode
 * @param err: Error pointer
 * @returns The amount of bytes taken up by the character, which is always
 *	between 1 and 4 on success, and 0 if an error was yeeted
 */
usize utf8_from_nchr(char *restrict dest, nchar c, error *err);
/**
 * Decode a UTF-8 character and store it in `c`.
 *
 * If the character encoding is malformed, an error is yeeted and `c` is set to
 * the ASCII NUL character.  The encoded character does not need to be NUL
 * terminated.  The amount of bytes read from the buffer depends on the first
 * byte which marks the beginning of the Unicode code point.  Keep in mind that
 * this may cause the method to read up to 3 bytes over the end of the buffer
 * if the code sequence is malformed.
 *
 * @param c: Where to store the decoded character
 * @param utf8chr: UTF-8 encoded character sequence
 * @param err: Error pointer
 * @returns The amount of bytes the character took up when encoded as UTF-8,
 *	which is always between 1 and 4 except on errors.  The value is
 *	derived from the first byte of the sequence alone and is returned
 *	even if an error is yeeted; it is 0 if the first byte cannot start
 *	a UTF-8 sequence.
 */
usize utf8_to_nchr(nchar *c, const char *restrict utf8chr, error *err);
/*
* This file is part of libneo.
* Copyright (c) 2021 Fefie <owo@fef.moe>.
*
* libneo is non-violent software: you may only use, redistribute,
* and/or modify it under the terms of the CNPLv6+ as found in
* the LICENSE file in the source code root directory or at
* <https://git.pixie.town/thufie/CNPL>.
*
* libneo comes with ABSOLUTELY NO WARRANTY, to the extent
* permitted by applicable law. See the CNPLv6+ for details.
*/

View file

@ -30,6 +30,8 @@ target_sources(neo PRIVATE
./nref.c
)
include(./string/string.cmake)
# This file is part of libneo.
# Copyright (c) 2021 Fefie <owo@fef.moe>.
#

3
src/string/string.cmake Normal file
View file

@ -0,0 +1,3 @@
# Source files of the string module that get compiled into the neo target.
target_sources(neo PRIVATE ./string/utf.c)

231
src/string/utf.c Normal file
View file

@ -0,0 +1,231 @@
/** See the end of this file for copyright and license terms. */
/*
* The UTF-8 conversion functions are based on the branchless UTF-8 decoder by
* Christopher Wellons, which is in the public domain. For the original, see
* <https://github.com/skeeto/branchless-utf8/blob/f2d0e24c3864d726cd009901726df4778ad3e0d5/utf8.h>
*/
#include <errno.h>
#include <stdint.h>
#include "neo/_error.h"
#include "neo/_nalloc.h"
#include "neo/_types.h"
#include "neo/utf.h"
/*
 * Walk over the NUL terminated string `s` one code point at a time and let
 * utf8_to_nchr() do the actual validation.  The walk ends at the terminating
 * NUL byte or as soon as the decoder yeets an error, whichever comes first.
 *
 * The counter is bumped *before* a code point is decoded, so on error the
 * returned value already includes the offending sequence.
 */
usize utf8_check(const char *restrict s, error *err)
{
	usize count = 0;
	nchar decoded;

	for (;;) {
		if (*s == '\0')
			break;

		count++;
		/* the decoder tells us how many bytes to skip */
		s += utf8_to_nchr(&decoded, s, err);
		catch(err) {
			break;
		}
	}

	return count;
}
/*
 * Count the code points in the NUL terminated string `s` without validating
 * it.  Continuation bytes have the form 10xxxxxx; every other byte marks the
 * beginning of a new code point, so those are the only ones we count.
 */
usize utf8_strlen(const char *restrict s)
{
	usize count = 0;
	const char *pos;

	for (pos = s; *pos != '\0'; pos++) {
		if ((*pos & 0xc0) != 0x80)
			count++;
	}

	return count;
}
usize utf8_nchr_size(nchar c, error *err)
{
usize ret;
if (c > 0x0010ffff) {
ret = 0;
yeet(err, EINVAL, "Character code not within Unicode range");
} else {
ret = 1;
ret += c > 0x7f;
ret += c > 0x07ff;
ret += c > 0xffff;
neat(err);
}
return ret;
}
/*
* From RFC 3629, Section 3:
*
* Char. number range | UTF-8 octet sequence
* (hexadecimal) | (binary)
* --------------------+---------------------------------------------
* 0000 0000-0000 007F | 0xxxxxxx
* 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
* 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* See <https://datatracker.ietf.org/doc/html/rfc3629#section-3>
*/
/*
 * UTF-8 encode `c` into `dest` and return the number of bytes written.
 *
 * The contract documented in neo/utf.h is that `dest` only needs to hold
 * 4 bytes, that the output is *not* NUL terminated, and that `dest` is left
 * untouched if `c` is not a valid Unicode character.  This function therefore
 * writes exactly as many bytes as the encoding needs and nothing else; in
 * particular, it must not append a terminator, because that would be a fifth
 * byte for characters beyond 0xffff.
 *
 * Returns 0 if utf8_nchr_size() rejects the character.
 */
usize utf8_from_nchr(char *restrict dest, nchar c, error *err)
{
	/*
	 * Start byte prefixes per sequence length.  These are unsigned so
	 * that values >= 0x80 don't depend on whether plain char is signed.
	 */
	static const unsigned char prefixes[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };

	usize utf8_size = utf8_nchr_size(c, err);
	catch(err) {
		return 0;
	}

	/*
	 * The sequence is filled in back to front: every tail byte takes the
	 * 6 least significant bits that are left in c.  Entering the switch
	 * at case N writes dest[N - 1] first and then falls through down to
	 * dest[0], so the array indices can be hardcoded per case.
	 */
	switch (utf8_size) {
	case 4:
		dest[3] = (char)( 0x80 | (c & 0x3f) );
		c >>= 6;
		/* fall through */
	case 3:
		dest[2] = (char)( 0x80 | (c & 0x3f) );
		c >>= 6;
		/* fall through */
	case 2:
		dest[1] = (char)( 0x80 | (c & 0x3f) );
		c >>= 6;
		/* fall through */
	case 1:
		/*
		 * we don't need a bitmask for c here because utf8_nchr_size
		 * already did the validation work for us so we know that c
		 * doesn't have any upper bits it shouldn't
		 */
		dest[0] = (char)( prefixes[utf8_size] | c );
		break;
	default:
		/* size 0 means the character was rejected; write nothing */
		break;
	}

	return utf8_size;
}
/*
 * TODO: This (almost) branchless implementation is all fancy and shit, but it
 *       may read up to three bytes beyond the memory area allocated for the
 *       input buffer if it is passed a malformed code sequence (two bytes
 *       beyond the terminator if the input is NUL terminated).  Someone
 *       (hopefully not me lmao) should decide whether potentially overreading
 *       3 bytes is worth the considerable speed gain from this design.
 */
/*
 * Decode a single UTF-8 sequence starting at `utf8chr` and store the
 * resulting character in `*dest`.
 *
 * The length of the sequence is taken from the start byte alone, and all
 * tail bytes that length calls for are read unconditionally.  Problems are
 * collected as bits in `eflags` while decoding and only evaluated once at the
 * very end.  If anything is wrong, `*dest` is set to NUL and EINVAL is yeeted.
 *
 * The return value is the sequence length according to the start byte (0 if
 * the start byte is invalid), regardless of whether decoding succeeded.
 */
usize utf8_to_nchr(nchar *dest, const char *restrict utf8chr, error *err)
{
	/* Expected sequence length per the 5 MSBs of the start byte */
	static const uint_fast8_t lengths[] = {
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0,
	};
	/* Payload bitmask for first byte in the sequence per sequence length */
	static const unsigned char cmasks[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
	/* Minimum Unicode values per sequence length */
	static const nchar mins[] = { 0xffffffff, 0, 0x80, 0x800, 0x10000 };
	/*
	 * Bits in eflags that are meaningful per sequence length:
	 * length 0 only has the "invalid start byte" flag (the dummy minimum
	 * would always trip the non-canonical flag, so that one is ignored),
	 * length 1 has both status flags but no tail bytes, and every
	 * additional tail byte adds its own two bits.
	 */
	static const uint_fast8_t emasks[] = { 0x02, 0x03, 0x0f, 0x3f, 0xff };

	/*
	 * Plain char may be signed, in which case every byte >= 0x80 would be
	 * negative.  Right shifting or printing such a value does not yield
	 * what we want (and indexing `lengths` with it would run way past the
	 * end of the table), so treat the input as unsigned bytes throughout.
	 */
	const unsigned char *bytes = (const unsigned char *)utf8chr;

	uint_fast8_t cshift = 0;
	/*
	 * 0x1f bitmask just in case we are on a really odd architecture
	 * where char is more than 8 bits wide, so the index always stays
	 * within the 32 entries of the table
	 */
	uint_fast8_t len = lengths[(bytes[0] >> 3) & 0x1f];

	/*
	 * 7-6: two MSBs of the fourth byte in the sequence, must be 0b10
	 * 5-4: two MSBs of the third byte in the sequence, must be 0b10
	 * 3-2: two MSBs of the second byte in the sequence, must be 0b10
	 *   1: whether the start byte is invalid
	 *   0: whether non-canonical encoding is used
	 */
	uint_fast8_t eflags = 0;

	nchar c = 0;

	switch (len) {
	case 4:
		c = (nchar)(bytes[3] & 0x3f); /* 10xx xxxx */
		cshift += 6;
		eflags |= bytes[3] & 0xc0;
		/* fall through */
	case 3:
		c |= (nchar)(bytes[2] & 0x3f) << cshift; /* 10xx xxxx */
		cshift += 6;
		eflags |= (bytes[2] & 0xc0) >> 2;
		/* fall through */
	case 2:
		c |= (nchar)(bytes[1] & 0x3f) << cshift; /* 10xx xxxx */
		cshift += 6;
		eflags |= (bytes[1] & 0xc0) >> 4;
		/* fall through */
	case 1:
		c |= (nchar)(bytes[0] & cmasks[len]) << cshift;
		break;
	default:
		/* length 0: this byte cannot start a sequence */
		eflags |= 0x02;
		break;
	}

	/*
	 * UTF-8 mandates each char be stored in as few bytes as possible.
	 * This must be OR-ed in, the flags gathered above are still needed.
	 */
	eflags |= c < mins[len];

	/*
	 * Bits 7-2 in eflags store the respective 2 MSBs of each tail byte
	 * in the sequence which must all start with 0b10, therefore eflags
	 * should be 0b101010xx if we have a four byte sequence.  Toggling the
	 * bits we expect to be one (0xa8) zeroes bits 7-2 if they are correct,
	 * then we just need to mask out the bits that are unused if the
	 * sequence is less than 4 bytes (emasks).
	 */
	eflags ^= 0xa8;
	eflags &= emasks[len];

	/*
	 * A well-formed four byte sequence can still encode values beyond the
	 * last Unicode code point (RFC 3629 stops at U+10FFFF).  There is no
	 * bit left in eflags for that, so it is checked separately.
	 */
	if (eflags != 0 || c > 0x0010ffff) {
		/*
		 * Errors are expected to be rare, so it's okay to use a bunch
		 * of if statements in favor of accurate error descriptions
		 * (and yeet is slow af anyway because it uses vsnprintf)
		 */
		*dest = '\0';
		if ((eflags & 0x01) != 0) {
			yeet(err, EINVAL,
			     "Non canonical UTF-8 encoding: %lu byte character stored in %u bytes",
			     (unsigned long)utf8_nchr_size(c, nil), (unsigned int)len);
		} else if ((eflags & 0x02) != 0) {
			yeet(err, EINVAL, "Illegal UTF-8 sequence start byte: 0x%02x",
			     (unsigned int)bytes[0]);
		} else if ((eflags & 0x0c) != 0) {
			yeet(err, EINVAL, "Byte 2 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)bytes[1]);
		} else if ((eflags & 0x30) != 0) {
			yeet(err, EINVAL, "Byte 3 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)bytes[2]);
		} else if ((eflags & 0xc0) != 0) {
			yeet(err, EINVAL, "Byte 4 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)bytes[3]);
		} else {
			/* all flags are clean, so the value itself is too large */
			yeet(err, EINVAL, "Character code not within Unicode range");
		}
	} else {
		*dest = c;
		neat(err);
	}

	return len;
}
/*
* This file is part of libneo.
* Copyright (c) 2021 Fefie <owo@fef.moe>.
*
* libneo is non-violent software: you may only use, redistribute,
* and/or modify it under the terms of the CNPLv6+ as found in
* the LICENSE file in the source code root directory or at
* <https://git.pixie.town/thufie/CNPL>.
*
* libneo comes with ABSOLUTELY NO WARRANTY, to the extent
* permitted by applicable law. See the CNPLv6+ for details.
*/