From 7169a434bf1046e342f38e983010241d269291de Mon Sep 17 00:00:00 2001
From: fef <owo@fef.moe>
Date: Fri, 16 Jul 2021 13:59:11 +0200
Subject: [PATCH] utf: add utf8_ncheck

---
 include/neo/utf.h               |  16 ++++
 src/string/nstr.c               |   2 +-
 src/string/utf.c                |  18 +++++
 test/string/utf/utf.cmake       |   1 +
 test/string/utf/utf8_ncheck.cpp | 138 ++++++++++++++++++++++++++++++++
 5 files changed, 174 insertions(+), 1 deletion(-)
 create mode 100644 test/string/utf/utf8_ncheck.cpp

diff --git a/include/neo/utf.h b/include/neo/utf.h
index 7444ace..efb39bc 100644
--- a/include/neo/utf.h
+++ b/include/neo/utf.h
@@ -35,6 +35,22 @@ extern "C" {
  */
 usize utf8_check(const char *__restrict s, error *err);
 
+/**
+ * Check whether a NUL terminated string is valid UTF-8, but read at most
+ * `maxsize + 3` bytes (this function uses `utf8_to_nchr` internally).
+ *
+ * If a NUL terminator is encountered before `maxsize` bytes, reading stops
+ * before the specified size.  If the string contains any malformed code
+ * sequences, an error is yeeted.
+ *
+ * @param s: String to validate
+ * @param maxsize: Maximum number of bytes to read from `s`
+ * @param err: Error pointer
+ * @returns The number of UTF-8 code points (i.e. number of Unicode characters)
+ *	excluding the terminating NUL byte; undefined on error
+ */
+usize utf8_ncheck(const char *__restrict s, usize maxsize, error *err);
+
 /**
  * Compute the length of a raw UTF-8 encoded, NUL terminated string.
  *
diff --git a/src/string/nstr.c b/src/string/nstr.c
index 488035a..2b0b69c 100644
--- a/src/string/nstr.c
+++ b/src/string/nstr.c
@@ -21,7 +21,7 @@ static void nstr_destroy(string *str)
 
 static string *nstr_unsafe(const char *restrict s, usize size_without_nul, error *err)
 {
-	usize len = utf8_check(s, err);
+	usize len = utf8_ncheck(s, size_without_nul, err);
 	catch(err) {
 		return nil;
 	}
diff --git a/src/string/utf.c b/src/string/utf.c
index b0d3861..e5d7ce6 100644
--- a/src/string/utf.c
+++ b/src/string/utf.c
@@ -30,6 +30,24 @@ usize utf8_check(const char *restrict s, error *err)
 	return ret;
 }
 
+usize utf8_ncheck(const char *restrict s, usize maxsize, error *err)
+{
+	usize ret = 0;
+	nchar c;
+
+	while (maxsize != 0 && *s != '\0') { /* limit first: no read at 0 */
+		ret++;
+		usize size = utf8_to_nchr(&c, s, err);
+		s += size;
+		/* a sequence may run past maxsize: clamp so this cannot wrap */
+		maxsize -= size < maxsize ? size : maxsize;
+		catch(err) {
+			break;
+		}
+	}
+	return ret;
+}
+
 usize utf8_strlen(const char *restrict s)
 {
 	usize len = 0;
diff --git a/test/string/utf/utf.cmake b/test/string/utf/utf.cmake
index 26c2b41..c3f5c0d 100644
--- a/test/string/utf/utf.cmake
+++ b/test/string/utf/utf.cmake
@@ -4,6 +4,7 @@ target_sources(neo_test PRIVATE
     string/utf/utf8_check.cpp
     string/utf/utf8_chrsize.cpp
     string/utf/utf8_from_nchr.cpp
+    string/utf/utf8_ncheck.cpp
     string/utf/utf8_strlen.cpp
     string/utf/utf8_to_nchr.cpp
 )
diff --git a/test/string/utf/utf8_ncheck.cpp b/test/string/utf/utf8_ncheck.cpp
new file mode 100644
index 0000000..ba60b45
--- /dev/null
+++ b/test/string/utf/utf8_ncheck.cpp
@@ -0,0 +1,138 @@
+/** See the end of this file for copyright and license terms. */
+
+#include <catch2/catch.hpp>
+#include <errno.h>
+
+#include <neo.h>
+#include <neo/utf.h>
+
+TEST_CASE( "utf8_ncheck: ASCII string", "[string/utf.c]" )
+{
+	error err;
+	usize len = utf8_ncheck("i'm gay,,,", 10, &err); /* 10 bytes, limit 10 */
+
+	REQUIRE( len == 10 ); /* one code point per ASCII byte */
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: String with 2-byte UTF-8 sequence", "[string/utf.c]" )
+{
+	error err;
+	/* U+03B1 GREEK SMALL LETTER ALPHA: 2 bytes, 1 code point */
+	usize len = utf8_ncheck("i'm g\xce\xb1y,,,", 11, &err);
+
+	REQUIRE( len == 10 ); /* 11 bytes, but only 10 code points */
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: String with 3-byte UTF-8 sequence", "[string/utf.c]" )
+{
+	error err;
+	/* U+3042 HIRAGANA LETTER A: 3 bytes, 1 code point */
+	usize len = utf8_ncheck("i'm g\xe3\x81\x82y,,,", 12, &err);
+
+	REQUIRE( len == 10 ); /* 12 bytes, but only 10 code points */
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: String with 4-byte UTF-8 sequence", "[string/utf.c]" )
+{
+	error err;
+	/* U+1F97A FACE WITH PLEADING EYES: 4 bytes, 1 code point */
+	usize len = utf8_ncheck("i'm gay\xf0\x9f\xa5\xba,,,", 14, &err);
+
+	REQUIRE( len == 11 ); /* 14 bytes, but only 11 code points */
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: Don't overread", "[string/utf.c]" )
+{
+	error err;
+	usize len = utf8_ncheck("i'm gay,,,", 8, &err); /* limit 8 of 10 bytes */
+
+	REQUIRE( len == 8 ); /* counting stops at the limit, not at the NUL */
+	REQUIRE( errnum(&err) == 0 );
+	REQUIRE( errmsg(&err) == nil );
+}
+
+TEST_CASE( "utf8_ncheck: Error on malformed sequence start", "[string/utf.c]" )
+{
+	error err;
+	utf8_ncheck("\xff", 1, &err); /* count is undefined on error: not kept */
+
+	string *expected = nstr("Illegal UTF-8 sequence start byte: 0xff", nil);
+	string *actual = errmsg(&err);
+
+	REQUIRE( errnum(&err) == EINVAL );
+	REQUIRE( nstreq(expected, actual, nil) );
+	errput(&err);
+}
+
+TEST_CASE( "utf8_ncheck: Error on wrong second byte", "[string/utf.c]" )
+{
+	error err;
+	utf8_ncheck("\xce\xff", 2, &err); /* 0xff is not a continuation byte */
+
+	string *expected = nstr("Byte 2 in UTF-8 sequence invalid: 0xff", nil);
+	string *actual = errmsg(&err);
+
+	REQUIRE( errnum(&err) == EINVAL );
+	REQUIRE( nstreq(expected, actual, nil) );
+	errput(&err);
+}
+
+TEST_CASE( "utf8_ncheck: Error on wrong third byte", "[string/utf.c]" )
+{
+	error err;
+	utf8_ncheck("\xe3\x81\xff", 3, &err); /* fine up to byte 2, 0xff is byte 3 */
+
+	string *expected = nstr("Byte 3 in UTF-8 sequence invalid: 0xff", nil);
+	string *actual = errmsg(&err);
+
+	REQUIRE( errnum(&err) == EINVAL );
+	REQUIRE( nstreq(expected, actual, nil) );
+	errput(&err);
+}
+
+TEST_CASE( "utf8_ncheck: Error on wrong fourth byte", "[string/utf.c]" )
+{
+	error err;
+	utf8_ncheck("\xf0\x9f\xa5\xff", 4, &err); /* fine up to byte 3, 0xff is byte 4 */
+
+	string *expected = nstr("Byte 4 in UTF-8 sequence invalid: 0xff", nil);
+	string *actual = errmsg(&err);
+
+	REQUIRE( errnum(&err) == EINVAL );
+	REQUIRE( nstreq(expected, actual, nil) );
+	errput(&err);
+}
+
+TEST_CASE( "utf8_ncheck: Error on non canonical encoding", "[string/utf.c]" )
+{
+	error err;
+	utf8_ncheck("\xf0\x80\x80\xa0", 4, &err); /* overlong: 4 bytes carrying payload 0x20 */
+
+	string *expected = nstr("Non canonical UTF-8 encoding: 1 byte character stored in 4 bytes", nil);
+	string *actual = errmsg(&err);
+
+	REQUIRE( errnum(&err) == EINVAL );
+	REQUIRE( nstreq(expected, actual, nil) );
+	errput(&err);
+}
+
+/*
+ * This file is part of libneo.
+ * Copyright (c) 2021 Fefie <owo@fef.moe>.
+ *
+ * libneo is non-violent software: you may only use, redistribute,
+ * and/or modify it under the terms of the CNPLv6+ as found in
+ * the LICENSE file in the source code root directory or at
+ * <https://git.pixie.town/thufie/CNPL>.
+ *
+ * libneo comes with ABSOLUTELY NO WARRANTY, to the extent
+ * permitted by applicable law.  See the CNPLv6+ for details.
+ */