From 7169a434bf1046e342f38e983010241d269291de Mon Sep 17 00:00:00 2001
From: fef
Date: Fri, 16 Jul 2021 13:59:11 +0200
Subject: [PATCH] utf: add utf8_ncheck

---
 include/neo/utf.h               |  18 ++++
 src/string/nstr.c               |   2 +-
 src/string/utf.c                |  28 ++++++
 test/string/utf/utf.cmake       |   1 +
 test/string/utf/utf8_ncheck.cpp | 149 ++++++++++++++++++++++++++++++++
 5 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 test/string/utf/utf8_ncheck.cpp

diff --git a/include/neo/utf.h b/include/neo/utf.h
index 7444ace..efb39bc 100644
--- a/include/neo/utf.h
+++ b/include/neo/utf.h
@@ -35,6 +35,24 @@ extern "C" {
  */
 usize utf8_check(const char *__restrict s, error *err);
 
+/**
+ * Check whether a NUL terminated string is valid UTF-8, but read at most
+ * `maxsize + 3` bytes (this function uses `utf8_to_nchr` internally).
+ *
+ * If a NUL terminator is encountered before `maxsize` bytes, reading stops
+ * before the specified size.  A multi-byte sequence that starts within the
+ * first `maxsize` bytes is always decoded in full, which is where the up to
+ * 3 extra bytes come from.  If the string contains any malformed code
+ * sequences, an error is yeeted.
+ *
+ * @param s: String to validate
+ * @param maxsize: Maximum number of bytes to read from `s`
+ * @param err: Error pointer
+ * @returns The number of UTF-8 code points (i.e. number of Unicode characters)
+ *	excluding the terminating NUL byte; undefined on error
+ */
+usize utf8_ncheck(const char *__restrict s, usize maxsize, error *err);
+
 /**
  * Compute the length of a raw UTF-8 encoded, NUL terminated string.
  *
diff --git a/src/string/nstr.c b/src/string/nstr.c
index 488035a..2b0b69c 100644
--- a/src/string/nstr.c
+++ b/src/string/nstr.c
@@ -21,7 +21,7 @@ static void nstr_destroy(string *str)
 
 static string *nstr_unsafe(const char *restrict s, usize size_without_nul, error *err)
 {
-	usize len = utf8_check(s, err);
+	usize len = utf8_ncheck(s, size_without_nul, err);
 	catch(err) {
 		return nil;
 	}
diff --git a/src/string/utf.c b/src/string/utf.c
index b0d3861..e5d7ce6 100644
--- a/src/string/utf.c
+++ b/src/string/utf.c
@@ -30,6 +30,34 @@ usize utf8_check(const char *restrict s, error *err)
 	return ret;
 }
 
+usize utf8_ncheck(const char *restrict s, usize maxsize, error *err)
+{
+	usize ret = 0;
+	nchar c;
+
+	/* check maxsize first: s must not be read once the limit is reached */
+	while (maxsize != 0 && *s != '\0') {
+		ret++;
+		usize size = utf8_to_nchr(&c, s, err);
+		catch(err) {
+			break;
+		}
+
+		/*
+		 * maxsize is unsigned: a sequence running up to or past the
+		 * limit must end the loop here instead of wrapping maxsize
+		 */
+		if (size >= maxsize) {
+			break;
+		}
+
+		s += size;
+		maxsize -= size;
+	}
+
+	return ret;
+}
+
 usize utf8_strlen(const char *restrict s)
 {
 	usize len = 0;
diff --git a/test/string/utf/utf.cmake b/test/string/utf/utf.cmake
index 26c2b41..c3f5c0d 100644
--- a/test/string/utf/utf.cmake
+++ b/test/string/utf/utf.cmake
@@ -4,6 +4,7 @@ target_sources(neo_test PRIVATE
 	string/utf/utf8_check.cpp
 	string/utf/utf8_chrsize.cpp
 	string/utf/utf8_from_nchr.cpp
+	string/utf/utf8_ncheck.cpp
 	string/utf/utf8_strlen.cpp
 	string/utf/utf8_to_nchr.cpp
 )
diff --git a/test/string/utf/utf8_ncheck.cpp b/test/string/utf/utf8_ncheck.cpp
new file mode 100644
index 0000000..ba60b45
--- /dev/null
+++ b/test/string/utf/utf8_ncheck.cpp
@@ -0,0 +1,149 @@
+/** See the end of this file for copyright and license terms. */
+
+#include
+#include
+
+#include
+#include
+
+TEST_CASE( "utf8_ncheck: ASCII string", "[string/utf.c]" )
+{
+	error err;
+	usize len = utf8_ncheck("i'm gay,,,", 10, &err);
+
+	REQUIRE( len == 10 );
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: String with 2-byte UTF-8 sequence", "[string/utf.c]" )
+{
+	error err;
+	/* U+03B1 Greek Smol Letter Alpha */
+	usize len = utf8_ncheck("i'm g\xce\xb1y,,,", 11, &err);
+
+	REQUIRE( len == 10 );
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: String with 3-byte UTF-8 sequence", "[string/utf.c]" )
+{
+	error err;
+	/* U+3042 Hiragana Letter A */
+	usize len = utf8_ncheck("i'm g\xe3\x81\x82y,,,", 12, &err);
+
+	REQUIRE( len == 10 );
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: String with 4-byte UTF-8 sequence", "[string/utf.c]" )
+{
+	error err;
+	/* U+1F97A The Bottom Emoji(TM) */
+	usize len = utf8_ncheck("i'm gay\xf0\x9f\xa5\xba,,,", 14, &err);
+
+	REQUIRE( len == 11 );
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: Don't overread", "[string/utf.c]" )
+{
+	error err;
+	usize len = utf8_ncheck("i'm gay,,,", 8, &err);
+
+	REQUIRE( len == 8 );
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: Stop when sequence crosses maxsize", "[string/utf.c]" )
+{
+	error err;
+	/* U+03B1 starts at the last allowed byte, nothing after it is read */
+	usize len = utf8_ncheck("i'm g\xce\xb1y,,,", 6, &err);
+
+	REQUIRE( len == 6 );
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: Error on malformed sequence start", "[string/utf.c]" )
+{
+	error err;
+	utf8_ncheck("\xff", 1, &err);
+
+	string *expected = nstr("Illegal UTF-8 sequence start byte: 0xff", nil);
+	string *actual = errmsg(&err);
+
+	REQUIRE( errnum(&err) == EINVAL );
+	REQUIRE( nstreq(expected, actual, nil) );
+	errput(&err);
+}
+
+TEST_CASE( "utf8_ncheck: Error on wrong second byte", "[string/utf.c]" )
+{
+	error err;
+	utf8_ncheck("\xce\xff", 2, &err);
+
+	string *expected = nstr("Byte 2 in UTF-8 sequence invalid: 0xff", nil);
+	string *actual = errmsg(&err);
+
+	REQUIRE( errnum(&err) == EINVAL );
+	REQUIRE( nstreq(expected, actual, nil) );
+	errput(&err);
+}
+
+TEST_CASE( "utf8_ncheck: Error on wrong third byte", "[string/utf.c]" )
+{
+	error err;
+	utf8_ncheck("\xe3\x81\xff", 3, &err);
+
+	string *expected = nstr("Byte 3 in UTF-8 sequence invalid: 0xff", nil);
+	string *actual = errmsg(&err);
+
+	REQUIRE( errnum(&err) == EINVAL );
+	REQUIRE( nstreq(expected, actual, nil) );
+	errput(&err);
+}
+
+TEST_CASE( "utf8_ncheck: Error on wrong fourth byte", "[string/utf.c]" )
+{
+	error err;
+	utf8_ncheck("\xf0\x9f\xa5\xff", 4, &err);
+
+	string *expected = nstr("Byte 4 in UTF-8 sequence invalid: 0xff", nil);
+	string *actual = errmsg(&err);
+
+	REQUIRE( errnum(&err) == EINVAL );
+	REQUIRE( nstreq(expected, actual, nil) );
+	errput(&err);
+}
+
+TEST_CASE( "utf8_ncheck: Error on non canonical encoding", "[string/utf.c]" )
+{
+	error err;
+	utf8_ncheck("\xf0\x80\x80\xa0", 4, &err);
+
+	string *expected = nstr("Non canonical UTF-8 encoding: 1 byte character stored in 4 bytes", nil);
+	string *actual = errmsg(&err);
+
+	REQUIRE( errnum(&err) == EINVAL );
+	REQUIRE( nstreq(expected, actual, nil) );
+	errput(&err);
+}
+
+/*
+ * This file is part of libneo.
+ * Copyright (c) 2021 Fefie .
+ *
+ * libneo is non-violent software: you may only use, redistribute,
+ * and/or modify it under the terms of the CNPLv6+ as found in
+ * the LICENSE file in the source code root directory or at
+ * .
+ *
+ * libneo comes with ABSOLUTELY NO WARRANTY, to the extent
+ * permitted by applicable law. See the CNPLv6+ for details.
+ */