You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

259 lines
6.9 KiB
C

/* See the end of this file for copyright and license terms. */
/*
* The UTF-8 conversion functions are based on the branchless UTF-8 decoder by
* Christopher Wellons, which is in the public domain. For the original, see
* <https://github.com/skeeto/branchless-utf8/blob/f2d0e24c3864d726cd009901726df4778ad3e0d5/utf8.h>
*/
#include <errno.h>
#include <stdint.h>
#include "neo/_error.h"
#include "neo/_nalloc.h"
#include "neo/_types.h"
#include "neo/utf.h"
/*
 * Walk the NUL terminated string `s` one code sequence at a time and count
 * how many sequences were handed to utf8_to_nchr().
 *
 * The counter is bumped before each decode attempt, so a sequence that fails
 * to decode is still included in the return value.  Decoding stops as soon as
 * catch(err) fires.  If `s` is empty, nothing gets decoded and neat(err) is
 * called here instead.
 */
usize utf8_check(const char *restrict s, error *err)
{
	/* 0xffffffff is no Unicode character; utf8_to_nchr() always overwrites it */
	nchar last = 0xffffffff;
	usize count = 0;

	for (const char *pos = s; *pos != '\0'; ) {
		count++;
		pos += utf8_to_nchr(&last, pos, err);
		catch(err) {
			break;
		}
	}

	/* an untouched sentinel means the loop body never ran */
	if (last == 0xffffffff)
		neat(err);

	return count;
}
/*
 * Like utf8_check(), but inspect at most `maxsize` bytes of `s`.
 *
 * Returns the number of code sequences handed to utf8_to_nchr(); the counter
 * is bumped before each decode attempt, so a sequence that fails to decode is
 * included.  Decoding stops at the first NUL byte, once `maxsize` bytes have
 * been consumed, or as soon as catch(err) fires.  If nothing was decoded at
 * all (empty string or `maxsize` of 0), neat(err) is called here.
 *
 * NOTE(review): utf8_to_nchr() itself is not length-aware, so a multi-byte
 * sequence that starts inside the limit but extends past it is still decoded
 * from the bytes behind the limit.  This function only guarantees that the
 * scan does not continue after such a sequence.
 */
usize utf8_ncheck(const char *restrict s, usize maxsize, error *err)
{
	usize ret = 0;
	nchar c = 0xffffffff; /* not a valid Unicode character */

	/* test the budget first so *s is never read once it is used up */
	while (maxsize != 0 && *s != '\0') {
		ret++;
		usize size = utf8_to_nchr(&c, s, err);
		s += size;
		/*
		 * Saturate at zero: maxsize is unsigned, so subtracting a
		 * sequence length larger than the remaining budget would wrap
		 * around and effectively disable the limit.
		 */
		maxsize -= (size < maxsize) ? size : maxsize;
		catch(err) {
			break;
		}
	}

	if (c == 0xffffffff) /* loop hasn't executed at all */
		neat(err);

	return ret;
}
/*
 * Count the bytes in the NUL terminated string `s` that are not UTF-8
 * continuation bytes (i.e. whose two MSBs are not 0b10).  No validation of
 * the byte sequence is performed.
 */
usize utf8_strlen(const char *restrict s)
{
	usize count = 0;

	for (const char *p = s; *p != '\0'; p++) {
		/* 10xx xxxx is a tail byte and belongs to the previous character */
		if ((*p & 0xc0) != 0x80)
			count++;
	}

	return count;
}
/*
 * Compute how many bytes the UTF-8 encoding of `c` occupies:
 *
 *   c <= 0x7f     -> 1
 *   c <= 0x07ff   -> 2
 *   c <= 0xffff   -> 3
 *   c <= 0x10ffff -> 4
 *
 * Anything above 0x10ffff yeets EINVAL and yields 0; otherwise neat(err) is
 * called before returning.
 */
usize utf8_chrsize(nchar c, error *err)
{
	if (c > 0x0010ffff) {
		yeet(err, EINVAL, "Character code not within Unicode range");
		return 0;
	}

	/* one byte minimum, plus one for every range boundary that c exceeds */
	usize nbytes = 1;
	if (c > 0x7f)
		nbytes++;
	if (c > 0x07ff)
		nbytes++;
	if (c > 0xffff)
		nbytes++;

	neat(err);
	return nbytes;
}
/*
* From RFC 3629, Section 3:
*
* Char. number range | UTF-8 octet sequence
* (hexadecimal) | (binary)
* --------------------+---------------------------------------------
* 0000 0000-0000 007F | 0xxxxxxx
* 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
* 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* See <https://datatracker.ietf.org/doc/html/rfc3629#section-3>
*/
/*
 * Encode the code point `c` as a NUL terminated UTF-8 sequence in `dest`.
 *
 * The sequence length is obtained from utf8_chrsize(); `dest` receives that
 * many bytes plus the terminating NUL (so up to 5 bytes are written).  If
 * catch(err) fires after the size lookup, only a single NUL byte is stored
 * and 0 is returned.  Otherwise the return value is the sequence length
 * without the NUL terminator.
 */
usize utf8_from_nchr(char *restrict dest, nchar c, error *err)
{
	/* Marker bits of the start byte, indexed by sequence length */
	static const u8 lead_markers[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };

	usize nbytes = utf8_chrsize(c, err);
	catch(err) {
		*dest = '\0';
		return 0;
	}

	usize pos = nbytes;
	dest[pos] = '\0';

	/* tail bytes are filled back to front, 6 payload bits apiece */
	while (pos > 1) {
		pos--;
		dest[pos] = (char)( 0x80 | (c & 0x3f) );
		c >>= 6;
	}

	/*
	 * No payload mask is needed for the start byte: utf8_chrsize()
	 * already validated the range of c, so after shifting out the tail
	 * bits there is nothing left in c that could clash with the marker.
	 */
	if (pos == 1)
		dest[0] = (char)( lead_markers[nbytes] | c );

	return nbytes;
}
/*
 * TODO: This (almost) branchless implementation is all fancy and shit, but it
 *       unconditionally reads as many tail bytes as the start byte announces.
 *       If it is passed a truncated code sequence, it may therefore read up
 *       to two bytes beyond the NUL terminator of the input buffer (or up to
 *       three bytes beyond a buffer that is not NUL terminated).  Someone
 *       (hopefully not me lmao) should decide whether this potential
 *       overread is worth the considerable speed gain from this design.
 */
/*
 * Decode the UTF-8 code sequence starting at `_utf8chr` into `*dest`.
 *
 * The sequence length is derived from the start byte alone, and that many
 * bytes are read without any further bounds checking.  The return value is
 * this length (0 if the start byte is illegal), both on success and on
 * failure.
 *
 * On success the code point is stored in `*dest` and neat(err) is called.
 * On failure `*dest` is set to 0 and EINVAL is yeeted.  Rejected input:
 *  - an illegal start byte (10xx xxxx or 1111 1xxx),
 *  - a tail byte that does not start with 0b10,
 *  - non-canonical (overlong) encodings,
 *  - decoded values above 0x10ffff, which utf8_chrsize() rejects as well.
 */
usize utf8_to_nchr(nchar *dest, const char *restrict _utf8chr, error *err)
{
	/* Expected sequence length per the 5 MSBs of the start byte */
	static const uint_fast8_t lengths[] = {
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0,
	};
	/* Payload bitmask for first byte in the sequence per sequence length */
	static const char cmasks[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
	/* Minimum Unicode values per sequence length */
	static const nchar mins[] = { 0x00, 0x00, 0x80, 0x800, 0x10000 };
	/*
	 * Bits of eflags that are meaningful per sequence length: bits 1-0
	 * always, plus one bit pair for every tail byte that is part of the
	 * sequence.  A two byte sequence must therefore keep both bits 3-2
	 * (0x0f); masking with 0x07 would let second bytes of the form
	 * 00xx xxxx (including NUL) slip through.
	 */
	static const uint_fast8_t emasks[] = { 0x03, 0x03, 0x0f, 0x3f, 0xff };

	/* signed bitshifts are a bad idea, trust me */
	const u8 *restrict utf8chr = (const u8 *restrict)_utf8chr;
	uint_fast8_t cshift = 0;

	/* the 5 MSBs of the start byte select one of the 32 table entries */
	uint_fast8_t len = lengths[utf8chr[0] >> 3];

	/*
	 * 7-6: two MSBs of the fourth byte in the sequence, must be 0b10
	 * 5-4: two MSBs of the third byte in the sequence, must be 0b10
	 * 3-2: two MSBs of the second byte in the sequence, must be 0b10
	 *   1: whether the start byte is invalid
	 *   0: whether non-canonical encoding is used
	 */
	uint_fast8_t eflags = 0;
	nchar c = '\0';

	/* assemble the payload from the last byte of the sequence to the first */
	switch (len) {
	case 4:
		c = (nchar)(utf8chr[3] & 0x3f); /* 10xx xxxx */
		cshift += 6;
		eflags |= utf8chr[3] & 0xc0;
		/* fall through */
	case 3:
		c |= (nchar)(utf8chr[2] & 0x3f) << cshift; /* 10xx xxxx */
		cshift += 6;
		eflags |= (utf8chr[2] & 0xc0) >> 2;
		/* fall through */
	case 2:
		c |= (nchar)(utf8chr[1] & 0x3f) << cshift; /* 10xx xxxx */
		cshift += 6;
		eflags |= (utf8chr[1] & 0xc0) >> 4;
		/* fall through */
	case 1:
		c |= (nchar)(utf8chr[0] & cmasks[len]) << cshift;
		break;
	case 0:
		eflags |= 0x02;
		break;
	}

	/* UTF-8 mandates each char be stored in as few bytes as possible */
	eflags |= c < mins[len];

	/*
	 * Bits 7-2 in eflags store the respective 2 MSBs of each tail byte
	 * in the sequence which must all start with 0b10, therefore eflags
	 * should be 0b101010xx if we have a four byte sequence.  Toggling the
	 * bits we expect to be one (0xa8) zeroes bits 7-2 if they are correct,
	 * then we just need to mask out the bits that are unused if the
	 * sequence is less than 4 bytes (emasks).
	 */
	eflags ^= 0xa8;
	eflags &= emasks[len];

	if (eflags != 0) {
		/*
		 * Errors are expected to be rare, so it's okay to use a bunch
		 * of if statements in favor of accurate error descriptions
		 * (and yeet is slow af anyway because it uses vsnprintf)
		 */
		*dest = '\0';
		if ((eflags & 0x01) != 0) {
			/*
			 * c is below mins[len] here and thus always within
			 * the Unicode range, so utf8_chrsize() has nothing
			 * to report through its (nil) error parameter
			 */
			yeet(err, EINVAL,
			     "Non canonical UTF-8 encoding: %zu byte character stored in %u bytes",
			     utf8_chrsize(c, nil), (unsigned int)len);
		} else if ((eflags & 0x02) != 0) {
			yeet(err, EINVAL, "Illegal UTF-8 sequence start byte: 0x%02x",
			     (unsigned int)utf8chr[0]);
		} else if ((eflags & 0x0c) != 0) {
			yeet(err, EINVAL, "Byte 2 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)utf8chr[1]);
		} else if ((eflags & 0x30) != 0) {
			yeet(err, EINVAL, "Byte 3 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)utf8chr[2]);
		} else if ((eflags & 0xc0) != 0) {
			yeet(err, EINVAL, "Byte 4 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)utf8chr[3]);
		} else {
			yeet(err, EINVAL, "Unexpected decoding error");
		}
	} else if (c > 0x0010ffff) {
		/*
		 * Four byte sequences can carry up to 21 payload bits, which
		 * is more than the Unicode range allows (see RFC 3629); use
		 * the same limit and message as utf8_chrsize()
		 */
		*dest = '\0';
		yeet(err, EINVAL, "Character code not within Unicode range");
	} else {
		*dest = c;
		neat(err);
	}

	return len;
}
/*
* This file is part of libneo.
* Copyright (c) 2021 Fefie <owo@fef.moe>.
*
* libneo is non-violent software: you may only use, redistribute,
* and/or modify it under the terms of the CNPLv6+ as found in
* the LICENSE file in the source code root directory or at
* <https://git.pixie.town/thufie/CNPL>.
*
* libneo comes with ABSOLUTELY NO WARRANTY, to the extent
* permitted by applicable law. See the CNPLv6+ for details.
*/