implement UTF-8 conversion
i have no idea whether this works, the only thing i have is PTSD
This commit is contained in:
parent
bd9297febe
commit
30a29150c3
5 changed files with 345 additions and 1 deletions
|
@ -18,6 +18,9 @@ typedef __UINT64_TYPE__ u64;
|
|||
typedef __SIZE_TYPE__ usize;
|
||||
typedef __PTRDIFF_TYPE__ isize;
|
||||
|
||||
/** A single Unicode character (32 bits) */
|
||||
typedef u32 nchar;
|
||||
|
||||
typedef float f32;
|
||||
typedef double f64;
|
||||
typedef long double f128;
|
||||
|
@ -35,12 +38,15 @@ struct _neo_nref {
|
|||
};
|
||||
/**
|
||||
* A basic reference counter for data structures.
|
||||
* Embed this into your data structure as the field `__neo_nref` and
|
||||
* Embed this into your data structure as the field `__neo_nref`, initialize
|
||||
* it using `nref_init`, and use `nget` and `nput` to increment/decrement the
|
||||
* reference counter.
|
||||
*/
|
||||
typedef struct _neo_nref nref_t;
|
||||
#define NREF_FIELD nref_t __neo_nref
|
||||
|
||||
struct _neo_string {
|
||||
/* The *amount of Unicode code points*, NOT amount of bytes */
|
||||
NLEN_FIELD(_len);
|
||||
NREF_FIELD;
|
||||
usize _capacity;
|
||||
|
|
102
include/neo/utf.h
Normal file
102
include/neo/utf.h
Normal file
|
@ -0,0 +1,102 @@
|
|||
/** See the end of this file for copyright and license terms. */
|
||||
|
||||
/**
|
||||
* @file Conversion utilities for raw UTF.
|
||||
*
|
||||
* Note that libneo strings already have native UTF-8 support, so you only
|
||||
* really need this if you explicitly need to deal with different encodings or
|
||||
* raw bytes, in which case i am very sorry.
|
||||
*
|
||||
* Special attention must be paid to methods reading raw UTF from `char *`
|
||||
* parameters, as they may read a few bytes beyond the memory region allocated
|
||||
* for them if the string contains a malformed UTF sequence.
|
||||
*
|
||||
* Believe me when i say you do not want to use these.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "neo/_types.h"
|
||||
|
||||
/**
|
||||
* Check whether a NUL terminated string is valid UTF-8.
|
||||
*
|
||||
* If the string contains any malformed code sequences, an error is yeeted.
|
||||
*
|
||||
* @param s: String to validate
|
||||
* @param err: Error pointer
|
||||
* @returns The number of UTF-8 code points (i.e. number of Unicode characters)
|
||||
* excluding the terminating NUL byte; undefined on error
|
||||
*/
|
||||
usize utf8_check(const char *restrict s, error *err);
|
||||
|
||||
/**
|
||||
* Compute the length of a raw UTF-8 encoded, NUL terminated string.
|
||||
*
|
||||
* The string is *not* checked for malformed code sequences,
|
||||
* use `utf8_check` for that.
|
||||
*
|
||||
* @param s: String to get the length of
|
||||
 * @returns String length as in Unicode code points (not bytes),
|
||||
* excluding the terminating NUL byte
|
||||
*/
|
||||
usize utf8_strlen(const char *restrict s);
|
||||
|
||||
/**
|
||||
* Get the amount of bytes a Unicode character takes up in UTF-8.
|
||||
*
|
||||
* If the character is outside of the Unicode range (`0x00000000`~`0x0010ffff`),
|
||||
* an error is yeeted.
|
||||
*
|
||||
* @param c: The character
|
||||
* @param err: Error pointer
|
||||
* @returns The amount of bytes needed to store the character in UTF-8 encoding,
|
||||
* which is always between 1 and 4 except on errors
|
||||
*/
|
||||
usize utf8_chrsize(nchar c, error *err);
|
||||
|
||||
/**
|
||||
* UTF-8 encode a Unicode character and store it in `dest` without terminator.
|
||||
*
|
||||
* The buffer needs to hold at least 4 bytes. If the character is outside of
|
||||
* the Unicode range (`0x00000000`~`0x0010ffff`), an error is yeeted and the
|
||||
* buffer is not modified.
|
||||
*
|
||||
* @param dest: Where to store the encoded character (*not* NUL terminated)
|
||||
* @param c: Character to encode
|
||||
* @param err: Error pointer
|
||||
* @returns The amount of bytes taken up by the character,
|
||||
* which is always between 1 and 4 except on errors
|
||||
*/
|
||||
usize utf8_from_nchr(char *restrict dest, nchar c, error *err);
|
||||
|
||||
/**
|
||||
* Decode a UTF-8 character and store it in `c`.
|
||||
*
|
||||
* If the character encoding is malformed, an error is yeeted and `c` is set to
|
||||
* the ASCII NUL character. The encoded character does not need to be NUL
|
||||
* terminated. The amount of bytes read from the buffer depends on the first
|
||||
* byte which marks the beginning of the Unicode code point. Keep in mind that
|
||||
* this may cause the method to read up to 3 bytes over the end of the buffer
|
||||
* if the code sequence is malformed.
|
||||
*
|
||||
* @param c: Where to store the decoded character
|
||||
* @param utf8chr: UTF-8 encoded character sequence
|
||||
* @param err: Error pointer
|
||||
* @returns The amount of bytes the character took up when encoded as UTF-8,
|
||||
* which is always between 1 and 4 except on errors
|
||||
*/
|
||||
usize utf8_to_nchr(nchar *c, const char *restrict utf8chr, error *err);
|
||||
|
||||
/*
|
||||
* This file is part of libneo.
|
||||
* Copyright (c) 2021 Fefie <owo@fef.moe>.
|
||||
*
|
||||
* libneo is non-violent software: you may only use, redistribute,
|
||||
* and/or modify it under the terms of the CNPLv6+ as found in
|
||||
* the LICENSE file in the source code root directory or at
|
||||
* <https://git.pixie.town/thufie/CNPL>.
|
||||
*
|
||||
* libneo comes with ABSOLUTELY NO WARRANTY, to the extent
|
||||
* permitted by applicable law. See the CNPLv6+ for details.
|
||||
*/
|
|
@ -30,6 +30,8 @@ target_sources(neo PRIVATE
|
|||
./nref.c
|
||||
)
|
||||
|
||||
include(./string/string.cmake)
|
||||
|
||||
# This file is part of libneo.
|
||||
# Copyright (c) 2021 Fefie <owo@fef.moe>.
|
||||
#
|
||||
|
|
3
src/string/string.cmake
Normal file
3
src/string/string.cmake
Normal file
|
@ -0,0 +1,3 @@
|
|||
target_sources(neo PRIVATE
|
||||
./string/utf.c
|
||||
)
|
231
src/string/utf.c
Normal file
231
src/string/utf.c
Normal file
|
@ -0,0 +1,231 @@
|
|||
/** See the end of this file for copyright and license terms. */
|
||||
|
||||
/*
|
||||
* The UTF-8 conversion functions are based on the branchless UTF-8 decoder by
|
||||
* Christopher Wellons, which is in the public domain. For the original, see
|
||||
* <https://github.com/skeeto/branchless-utf8/blob/f2d0e24c3864d726cd009901726df4778ad3e0d5/utf8.h>
|
||||
*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "neo/_error.h"
|
||||
#include "neo/_nalloc.h"
|
||||
#include "neo/_types.h"
|
||||
#include "neo/utf.h"
|
||||
|
||||
/*
 * Validate a NUL terminated UTF-8 string by decoding it one code point at a
 * time with utf8_to_nchr().
 *
 * @param s: String to validate
 * @param err: Error pointer; receives whatever utf8_to_nchr() reports for the
 *	first malformed sequence
 * @returns The number of code points that were decoded successfully before
 *	the terminating NUL byte or the first malformed sequence
 */
usize utf8_check(const char *restrict s, error *err)
{
	usize ret = 0;
	nchar c;

	/*
	 * An empty string never enters the loop, so mark the call as
	 * successful up front instead of leaving err untouched.
	 */
	neat(err);

	while (*s != '\0') {
		usize consumed = utf8_to_nchr(&c, s, err);
		catch(err) {
			break;
		}

		/*
		 * utf8_to_nchr() returns 0 for an illegal start byte.  If the
		 * error was not caught above for whatever reason, advancing by
		 * zero bytes would loop forever, so bail out explicitly.
		 */
		if (consumed == 0) {
			break;
		}

		s += consumed;
		/* only count code points that actually decoded */
		ret++;
	}

	return ret;
}
|
||||
|
||||
/*
 * Count the code points in a NUL terminated UTF-8 string without validating
 * it: every byte that is not a continuation byte (0b10xxxxxx) is taken to
 * start a new code point.
 *
 * @param s: String to measure
 * @returns Number of non-continuation bytes before the terminating NUL
 */
usize utf8_strlen(const char *restrict s)
{
	usize count = 0;
	const char *cursor;

	for (cursor = s; *cursor != '\0'; cursor++) {
		if ((*cursor & 0xc0) != 0x80) {
			count++;
		}
	}

	return count;
}
|
||||
|
||||
usize utf8_nchr_size(nchar c, error *err)
|
||||
{
|
||||
usize ret;
|
||||
|
||||
if (c > 0x0010ffff) {
|
||||
ret = 0;
|
||||
yeet(err, EINVAL, "Character code not within Unicode range");
|
||||
} else {
|
||||
ret = 1;
|
||||
ret += c > 0x7f;
|
||||
ret += c > 0x07ff;
|
||||
ret += c > 0xffff;
|
||||
neat(err);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* From RFC 3629, Section 3:
|
||||
*
|
||||
* Char. number range | UTF-8 octet sequence
|
||||
* (hexadecimal) | (binary)
|
||||
* --------------------+---------------------------------------------
|
||||
* 0000 0000-0000 007F | 0xxxxxxx
|
||||
* 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
|
||||
* 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
|
||||
* 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*
|
||||
* See <https://datatracker.ietf.org/doc/html/rfc3629#section-3>
|
||||
*/
|
||||
|
||||
/*
 * UTF-8 encode a Unicode character and store it in `dest`.
 *
 * As documented in neo/utf.h, the output is *not* NUL terminated and `dest`
 * only needs to hold 4 bytes; exactly the returned number of bytes is
 * written.  If `c` is rejected by utf8_nchr_size(), `dest` is left untouched.
 *
 * @param dest: Where to store the encoded character (at least 4 bytes)
 * @param c: Character to encode
 * @param err: Error pointer, set by utf8_nchr_size()
 * @returns The amount of bytes written (1 to 4), or 0 on error
 */
usize utf8_from_nchr(char *restrict dest, nchar c, error *err)
{
	/* Start byte tag per sequence length; unsigned so 0xc0+ fit portably */
	static const unsigned char prefixes[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };

	usize utf8_size = utf8_nchr_size(c, err);
	catch(err) {
		return 0;
	}
	/* utf8_nchr_size returns 0 only on error; never index with it */
	if (utf8_size == 0) {
		return 0;
	}

	/* fill the sequence back to front, starting at its last byte */
	char *pos = dest + (utf8_size - 1);

	switch (utf8_size) {
	case 4:
		*pos-- = (char)( 0x80 | (c & 0x3f) );
		c >>= 6;
		/* fall through */
	case 3:
		*pos-- = (char)( 0x80 | (c & 0x3f) );
		c >>= 6;
		/* fall through */
	case 2:
		*pos-- = (char)( 0x80 | (c & 0x3f) );
		c >>= 6;
		/* fall through */
	case 1:
		/*
		 * we don't need a bitmask for c here because utf8_nchr_size
		 * already did the validation work for us so we know that c
		 * doesn't have any upper bits it shouldn't
		 */
		*pos = (char)( prefixes[utf8_size] | c );
		break;
	default:
		break;
	}

	return utf8_size;
}
|
||||
|
||||
/*
|
||||
* TODO: This (almost) branchless implementation is all fancy and shit, but it
|
||||
 *       may read up to three bytes beyond the memory area allocated for the input
|
||||
* buffer if it is passed a malformed code sequence. Someone (hopefully
|
||||
* not me lmao) should decide whether potentially overreading 3 bytes is
|
||||
* worth the considerable speed gain from this design.
|
||||
*/
|
||||
|
||||
/*
 * Decode a single UTF-8 sequence into a Unicode character.
 *
 * The sequence length is taken from the start byte alone and all bytes of
 * the sequence are read unconditionally, so a truncated or malformed
 * sequence can make this read up to 3 bytes past the end of the input.
 *
 * Rejected with EINVAL: illegal start bytes, tail bytes that do not look
 * like 0b10xxxxxx, non-canonical (overlong) encodings, and values above
 * 0x0010ffff.  Surrogate code points are not rejected, matching what
 * utf8_nchr_size() accepts for encoding.
 *
 * @param dest: Where to store the decoded character; set to NUL on error
 * @param utf8chr: UTF-8 encoded sequence (does not need to be terminated)
 * @param err: Error pointer
 * @returns The sequence length derived from the start byte (1 to 4),
 *	or 0 if the start byte is illegal
 */
usize utf8_to_nchr(nchar *dest, const char *restrict utf8chr, error *err)
{
	/* Expected sequence length per the 5 MSBs of the start byte */
	static const uint_fast8_t lengths[32] = {
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0,
	};
	/* Payload bitmask for first byte in the sequence per sequence length */
	static const unsigned char cmasks[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
	/* Minimum Unicode values per sequence length */
	static const nchar mins[] = { 0, 0, 0x80, 0x800, 0x10000 };
	/*
	 * Which eflags bits are meaningful per sequence length: bits 1-0 are
	 * always checked, plus two bits for every tail byte in the sequence
	 */
	static const uint_fast8_t emasks[] = { 0x03, 0x03, 0x0f, 0x3f, 0xff };

	/*
	 * Plain char may be signed, in which case every non-ASCII byte would
	 * be negative and break both the table index and the bit tests below.
	 */
	const unsigned char *bytes = (const unsigned char *)utf8chr;

	uint_fast8_t cshift = 0;
	/* the 0x1f mask keeps the index inside the 32 entry table */
	uint_fast8_t len = lengths[(bytes[0] >> 3) & 0x1f];
	/*
	 * 7-6: two MSBs of the fourth byte in the sequence, must be 0b10
	 * 5-4: two MSBs of the third byte in the sequence, must be 0b10
	 * 3-2: two MSBs of the second byte in the sequence, must be 0b10
	 *   1: whether the start byte is invalid
	 *   0: whether non-canonical encoding is used
	 */
	uint_fast8_t eflags = 0;
	nchar c = 0;

	switch (len) {
	case 4:
		c = (nchar)(bytes[3] & 0x3f); /* 10xx xxxx */
		cshift += 6;
		eflags |= (uint_fast8_t)(bytes[3] & 0xc0);
		/* fall through */
	case 3:
		c |= (nchar)(bytes[2] & 0x3f) << cshift; /* 10xx xxxx */
		cshift += 6;
		eflags |= (uint_fast8_t)((bytes[2] & 0xc0) >> 2);
		/* fall through */
	case 2:
		c |= (nchar)(bytes[1] & 0x3f) << cshift; /* 10xx xxxx */
		cshift += 6;
		eflags |= (uint_fast8_t)((bytes[1] & 0xc0) >> 4);
		/* fall through */
	case 1:
		c |= (nchar)(bytes[0] & cmasks[len]) << cshift;
		break;
	default:
		/* len == 0: the start byte is a tail byte or 0xf8 and up */
		eflags |= 0x02;
		break;
	}

	/*
	 * UTF-8 mandates each char be stored in as few bytes as possible.
	 * This must be OR-ed in; a plain assignment would throw away the
	 * flags collected in the switch above.
	 */
	eflags |= (uint_fast8_t)(c < mins[len]);

	/*
	 * Bits 7-2 in eflags store the respective 2 MSBs of each tail byte
	 * in the sequence which must all start with 0b10, therefore eflags
	 * should be 0b101010xx if we have a four byte sequence.  Toggling the
	 * bits we expect to be one (0xa8) zeroes bits 7-2 if they are correct,
	 * then we just need to mask out the bits that are unused if the
	 * sequence is less than 4 bytes (emasks).
	 */
	eflags ^= 0xa8;
	eflags &= emasks[len];

	if (eflags != 0 || c > 0x0010ffff) {
		/*
		 * Errors are expected to be rare, so it's okay to use a bunch
		 * of if statements in favor of accurate error descriptions
		 * (and yeet is slow af anyway because it uses vsnprintf).
		 *
		 * Structural errors are reported first because c is garbage
		 * when the start byte or a tail byte is broken, which would
		 * make the canonical/range diagnostics misleading.
		 */
		*dest = '\0';
		if ((eflags & 0x02) != 0) {
			yeet(err, EINVAL, "Illegal UTF-8 sequence start byte: 0x%02x",
			     (unsigned int)bytes[0]);
		} else if ((eflags & 0x0c) != 0) {
			yeet(err, EINVAL, "Byte 2 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)bytes[1]);
		} else if ((eflags & 0x30) != 0) {
			yeet(err, EINVAL, "Byte 3 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)bytes[2]);
		} else if ((eflags & 0xc0) != 0) {
			yeet(err, EINVAL, "Byte 4 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)bytes[3]);
		} else if ((eflags & 0x01) != 0) {
			/* smallest encoding that could have held c */
			unsigned int needed = 1u + (c > 0x7f) + (c > 0x07ff) + (c > 0xffff);
			yeet(err, EINVAL,
			     "Non canonical UTF-8 encoding: %u byte character stored in %u bytes",
			     needed, (unsigned int)len);
		} else {
			/* only a four byte sequence can exceed the range */
			yeet(err, EINVAL, "Character code not within Unicode range");
		}
	} else {
		*dest = c;
		neat(err);
	}

	return len;
}
|
||||
|
||||
/*
|
||||
* This file is part of libneo.
|
||||
* Copyright (c) 2021 Fefie <owo@fef.moe>.
|
||||
*
|
||||
* libneo is non-violent software: you may only use, redistribute,
|
||||
* and/or modify it under the terms of the CNPLv6+ as found in
|
||||
* the LICENSE file in the source code root directory or at
|
||||
* <https://git.pixie.town/thufie/CNPL>.
|
||||
*
|
||||
* libneo comes with ABSOLUTELY NO WARRANTY, to the extent
|
||||
* permitted by applicable law. See the CNPLv6+ for details.
|
||||
*/
|
Loading…
Reference in a new issue