You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
259 lines
6.9 KiB
C
259 lines
6.9 KiB
C
/* See the end of this file for copyright and license terms. */
|
|
|
|
/*
|
|
* The UTF-8 conversion functions are based on the branchless UTF-8 decoder by
|
|
* Christopher Wellons, which is in the public domain. For the original, see
|
|
* <https://github.com/skeeto/branchless-utf8/blob/f2d0e24c3864d726cd009901726df4778ad3e0d5/utf8.h>
|
|
*/
|
|
|
|
#include <errno.h>
|
|
#include <stdint.h>
|
|
|
|
#include "neo/_error.h"
|
|
#include "neo/_nalloc.h"
|
|
#include "neo/_types.h"
|
|
#include "neo/utf.h"
|
|
|
|
/*
 * Walk a NUL terminated string and decode it one UTF-8 sequence at a time.
 *
 * s:   NUL terminated string to examine
 * err: error pointer; left in whatever state the last utf8_to_nchr() call
 *      put it in, or neated if the string is empty
 *
 * Returns the number of decode attempts that were made.  Decoding stops at
 * the first sequence utf8_to_nchr() reports an error for, and that failed
 * sequence is included in the count.
 */
usize utf8_check(const char *restrict s, error *err)
{
	/* lies above the Unicode range, so a decode never produces it */
	const nchar untouched = 0xffffffff;
	nchar decoded = untouched;
	usize nchars = 0;

	for (const char *cursor = s; *cursor != '\0'; ) {
		nchars++;
		cursor += utf8_to_nchr(&decoded, cursor, err);
		catch(err) {
			break;
		}
	}

	/*
	 * utf8_to_nchr() always stores something in its output parameter, so
	 * the sentinel only survives if the string was empty.  Nobody has
	 * touched err in that case, which means we have to do it ourselves.
	 */
	if (decoded == untouched) {
		neat(err);
	}

	return nchars;
}
|
|
|
|
/*
 * Like utf8_check(), but examine at most `maxsize` bytes of the string.
 *
 * s:       string to examine; scanning stops early at a NUL terminator
 * maxsize: maximum number of bytes to consume from `s`
 * err:     error pointer; left in whatever state the last utf8_to_nchr()
 *          call put it in, or neated if nothing was decoded at all
 *
 * Returns the number of decode attempts that were made.  Decoding stops at
 * the first sequence utf8_to_nchr() reports an error for, and that failed
 * sequence is included in the count.
 *
 * NOTE(review): utf8_to_nchr() knows nothing about `maxsize`, so a multi-byte
 * sequence that starts within the limit but ends beyond it is still decoded
 * (and counted) using bytes past the limit.  Confirm whether such a truncated
 * sequence should be reported as an error instead.
 */
usize utf8_ncheck(const char *restrict s, usize maxsize, error *err)
{
	usize ret = 0;
	nchar c = 0xffffffff; /* not a valid Unicode character */

	/*
	 * The size check comes first so that *s is never evaluated once the
	 * byte budget is used up; s may point past the caller's buffer then.
	 */
	while (maxsize != 0 && *s != '\0') {
		ret++;
		usize size = utf8_to_nchr(&c, s, err);
		catch(err) {
			break;
		}

		s += size;
		/*
		 * maxsize is unsigned: a sequence reaching past the limit must
		 * clamp the budget to zero rather than wrap around to a huge
		 * value, which would keep the loop running far beyond maxsize.
		 */
		if (size >= maxsize) {
			maxsize = 0;
		} else {
			maxsize -= size;
		}
	}

	if (c == 0xffffffff) /* loop hasn't executed at all */
		neat(err);

	return ret;
}
|
|
|
|
/*
 * Count the characters in a NUL terminated UTF-8 string.
 *
 * Every byte that is not a continuation byte (10xxxxxx) is counted as the
 * start of one character.  The string is not validated in any way.
 *
 * s: NUL terminated string
 *
 * Returns the number of non-continuation bytes before the terminator.
 */
usize utf8_strlen(const char *restrict s)
{
	usize count = 0;

	for (; *s != '\0'; s++) {
		if ((*s & 0xc0) != 0x80) {
			count++;
		}
	}

	return count;
}
|
|
|
|
/*
 * Tell how many bytes the UTF-8 encoding of a code point occupies.
 *
 * c:   code point to measure
 * err: error pointer; yeeted with EINVAL if `c` is greater than 0x10ffff,
 *      neated otherwise
 *
 * Returns a size from 1 to 4, or 0 if `c` is out of range.
 */
usize utf8_chrsize(nchar c, error *err)
{
	if (c > 0x0010ffff) {
		yeet(err, EINVAL, "Character code not within Unicode range");
		return 0;
	}

	neat(err);

	/* thresholds are the upper bounds of the ranges in RFC 3629, section 3 */
	if (c <= 0x7f) {
		return 1;
	}
	if (c <= 0x07ff) {
		return 2;
	}
	if (c <= 0xffff) {
		return 3;
	}
	return 4;
}
|
|
|
|
/*
|
|
* From RFC 3629, Section 3:
|
|
*
|
|
* Char. number range | UTF-8 octet sequence
|
|
* (hexadecimal) | (binary)
|
|
* --------------------+---------------------------------------------
|
|
* 0000 0000-0000 007F | 0xxxxxxx
|
|
* 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
|
|
* 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
|
|
* 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
*
|
|
* See <https://datatracker.ietf.org/doc/html/rfc3629#section-3>
|
|
*/
|
|
|
|
/*
 * Encode a code point as a NUL terminated UTF-8 sequence.
 *
 * dest: output buffer; receives the sequence bytes followed by a NUL
 *       terminator, i.e. up to 5 bytes in total
 * c:    code point to encode
 * err:  error pointer, handed to utf8_chrsize() which does the validation
 *
 * Returns the number of sequence bytes written, not counting the terminator.
 * If utf8_chrsize() reports an error, `dest` is set to the empty string and
 * 0 is returned.
 */
usize utf8_from_nchr(char *restrict dest, nchar c, error *err)
{
	/* marker bits of the start byte, indexed by sequence length */
	static const u8 lead_bits[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };

	const usize nbytes = utf8_chrsize(c, err);
	catch(err) {
		dest[0] = '\0';
		return 0;
	}

	dest[nbytes] = '\0';

	/* tail bytes are written back to front, 6 payload bits at a time */
	usize pos = nbytes;
	while (pos > 1) {
		pos--;
		dest[pos] = (char)( 0x80 | (c & 0x3f) );
		c >>= 6;
	}

	/*
	 * No bitmask is needed for what is left of c: utf8_chrsize() already
	 * validated it, so the remaining bits fit into the start byte's
	 * payload.  A size of zero stores nothing but the terminator.
	 */
	if (nbytes != 0) {
		dest[0] = (char)( lead_bits[nbytes] | c );
	}

	return nbytes;
}
|
|
|
|
/*
 * TODO: This (almost) branchless implementation is fast, but it fetches the
 *       tail bytes based on the start byte alone, before anything has been
 *       validated.  If it is passed a truncated or malformed sequence, it
 *       may therefore read up to two bytes past the NUL terminator of the
 *       input string (or up to three bytes past the end of a buffer that is
 *       not NUL terminated).  Someone should decide whether this potential
 *       overread is worth the considerable speed gain from this design.
 */
|
|
|
|
/*
 * Decode the UTF-8 sequence starting at `_utf8chr` into one code point.
 *
 * dest:     receives the decoded code point, or '\0' if decoding failed
 * _utf8chr: first byte of the sequence; depending on the start byte, up to
 *           three bytes following it are read before any validation happens
 * err:      error pointer; yeeted with EINVAL if the sequence is malformed,
 *           non-canonical or encodes a value above 0x10ffff, neated otherwise
 *
 * Returns the sequence length announced by the start byte (1 to 4), or 0 if
 * the start byte is illegal.  The length is returned even if a later check
 * failed, so callers must test `err` before advancing by it.
 */
usize utf8_to_nchr(nchar *dest, const char *restrict _utf8chr, error *err)
{
	/* Expected sequence length per the 5 MSBs of the start byte */
	static const uint_fast8_t lengths[] = {
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
		0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0,
	};
	/* Payload bitmask for first byte in the sequence per sequence length */
	static const char cmasks[] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07 };
	/* Minimum Unicode values per sequence length */
	static const nchar mins[] = { 0x00, 0x00, 0x80, 0x800, 0x10000 };
	/*
	 * Bits of eflags that are meaningful per sequence length.  Bits 1-0
	 * always are; each tail byte that is part of the sequence adds its
	 * own bit pair (3-2 for byte 2, 5-4 for byte 3, 7-6 for byte 4).
	 * The entry for length 2 must be 0x0f rather than 0x07, or bit 3 is
	 * masked away and a second byte of the form 00xxxxxx (which includes
	 * the NUL terminator) would slip through as valid.
	 */
	static const uint_fast8_t emasks[] = { 0x03, 0x03, 0x0f, 0x3f, 0xff };

	/* signed bitshifts are a bad idea, trust me */
	const u8 *restrict utf8chr = (const u8 *restrict)_utf8chr;

	uint_fast8_t cshift = 0;
	/*
	 * 0xff bitmask just in case we are on a really odd
	 * architecture where char is more than one byte;
	 * it also keeps the table index within 0-31
	 */
	uint_fast8_t len = lengths[(utf8chr[0] & 0xff) >> 3];
	/*
	 * 7-6: two MSBs of the fourth byte in the sequence, must be 0b10
	 * 5-4: two MSBs of the third byte in the sequence, must be 0b10
	 * 3-2: two MSBs of the second byte in the sequence, must be 0b10
	 *   1: whether the start byte is invalid
	 *   0: whether non-canonical encoding is used
	 */
	uint_fast8_t eflags = 0;
	nchar c = '\0';

	/* assemble the code point back to front, starting at the last byte */
	switch (len) {
	case 4:
		c = (nchar)(utf8chr[3] & 0x3f); /* 10xx xxxx */
		cshift += 6;
		eflags |= utf8chr[3] & 0xc0;
		/* fall through */
	case 3:
		c |= (nchar)(utf8chr[2] & 0x3f) << cshift; /* 10xx xxxx */
		cshift += 6;
		eflags |= (utf8chr[2] & 0xc0) >> 2;
		/* fall through */
	case 2:
		c |= (nchar)(utf8chr[1] & 0x3f) << cshift; /* 10xx xxxx */
		cshift += 6;
		eflags |= (utf8chr[1] & 0xc0) >> 4;
		/* fall through */
	case 1:
		c |= (nchar)(utf8chr[0] & cmasks[len]) << cshift;
		break;
	case 0:
		eflags |= 0x02;
		break;
	}

	/* UTF-8 mandates each char be stored in as few bytes as possible */
	eflags |= c < mins[len];

	/*
	 * Bits 7-2 in eflags store the respective 2 MSBs of each tail byte
	 * in the sequence which must all start with 0b10, therefore eflags
	 * should be 0b101010xx if we have a four byte sequence.  Toggling the
	 * bits we expect to be one (0xa8) zeroes bits 7-2 if they are correct,
	 * then we just need to mask out the bits that are unused if the
	 * sequence is less than 4 bytes (emasks).
	 */
	eflags ^= 0xa8;
	eflags &= emasks[len];

	/*
	 * A four byte sequence has 21 payload bits and can therefore encode
	 * values beyond the Unicode range.  utf8_chrsize() refuses to encode
	 * those, so the decoder must not hand them out either.
	 */
	const int out_of_range = c > 0x0010ffff;

	if (eflags != 0 || out_of_range) {
		/*
		 * Errors are expected to be rare, so it's okay to use a bunch
		 * of if statements in favor of accurate error descriptions
		 * (and yeet is slow anyway because it uses vsnprintf)
		 */
		*dest = '\0';
		if ((eflags & 0x01) != 0) {
			yeet(err, EINVAL,
			     "Non canonical UTF-8 encoding: %zu byte character stored in %u bytes",
			     utf8_chrsize(c, nil), (unsigned int)len);
		} else if ((eflags & 0x02) != 0) {
			yeet(err, EINVAL, "Illegal UTF-8 sequence start byte: 0x%02x",
			     (unsigned int)utf8chr[0]);
		} else if ((eflags & 0x0c) != 0) {
			yeet(err, EINVAL, "Byte 2 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)utf8chr[1]);
		} else if ((eflags & 0x30) != 0) {
			yeet(err, EINVAL, "Byte 3 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)utf8chr[2]);
		} else if ((eflags & 0xc0) != 0) {
			yeet(err, EINVAL, "Byte 4 in UTF-8 sequence invalid: 0x%02x",
			     (unsigned int)utf8chr[3]);
		} else if (out_of_range) {
			yeet(err, EINVAL, "Character code not within Unicode range");
		} else {
			yeet(err, EINVAL, "Unexpected decoding error");
		}
	} else {
		*dest = c;
		neat(err);
	}

	return len;
}
|
|
|
|
/*
|
|
* This file is part of libneo.
|
|
* Copyright (c) 2021 Fefie <owo@fef.moe>.
|
|
*
|
|
* libneo is non-violent software: you may only use, redistribute,
|
|
* and/or modify it under the terms of the CNPLv6+ as found in
|
|
* the LICENSE file in the source code root directory or at
|
|
* <https://git.pixie.town/thufie/CNPL>.
|
|
*
|
|
* libneo comes with ABSOLUTELY NO WARRANTY, to the extent
|
|
* permitted by applicable law. See the CNPLv6+ for details.
|
|
*/
|