Some simple stdlib tests.

2026-02-27 12:01:16 +00:00 · 2022-11-20 16:38:18 +01:00
parent 2fefed5bda
commit 285299dcd5
3 changed files with 104 additions and 8 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -205,6 +205,11 @@ jobs:
          ../build/c3c compile-run examples/fannkuch-redux.c3
          ../build/c3c compile-run examples/contextfree/boolerr.c3

+      - name: Compile run stdlib tests
+        run: |
+          cd test
+          ../build/c3c compile-test stdlib/conv_tests.c3 -g1 --safe
+
      - name: Build testproject
        run: |
          cd resources/testproject
@@ -271,6 +276,11 @@ jobs:
          ../build/c3c compile-run examples/fannkuch-redux.c3
          ../build/c3c compile-run examples/contextfree/boolerr.c3

+      - name: Compile run stdlib tests
+        run: |
+          cd test
+          ../build/c3c compile-test stdlib/conv_tests.c3 -g1 --safe
+
      - name: Build testproject
        run: |
          cd resources/testproject
--- a/lib/std/core/conv.c3
+++ b/lib/std/core/conv.c3
@@ -19,27 +19,30 @@ fn usz! char32_to_utf8(Char32 c, char* output, usz available)
 	if (!available) return UnicodeResult.CONVERSION_FAILED!;
 	switch (true)
 	{
-		case c < 0x7f:
+		case c <= 0x7f:
 			output[0] = (char)c;
 			return 1;
-		case c < 0x7ff:
+		case c <= 0x7ff:
 	        if (available < 2) return UnicodeResult.CONVERSION_FAILED!;
 			output[0] = (char)(0xC0 | c >> 6);
            output[1] = (char)(0x80 | (c & 0x3F));
            return 2;
-        case c < 0xffff:
+        case c <= 0xffff:
 	        if (available < 3) return UnicodeResult.CONVERSION_FAILED!;
            output[0] = (char)(0xE0 | c >> 12);
            output[1] = (char)(0x80 | (c >> 6 & 0x3F));
            output[2] = (char)(0x80 | (c & 0x3F));
            return 3;
-        default:
+        case c <= 0x10ffff:
 	        if (available < 4) return UnicodeResult.CONVERSION_FAILED!;
 		    output[0] = (char)(0xF0 | c >> 18);
            output[1] = (char)(0x80 | (c >> 12 & 0x3F));
            output[2] = (char)(0x80 | (c >> 6 & 0x3F));
            output[3] = (char)(0x80 | (c & 0x3F));
            return 4;
+        default:
+            // Values above 0x10FFFF are not valid Unicode code points.
+            return UnicodeResult.CONVERSION_FAILED!;
 	}
 }

@@ -144,8 +147,9 @@ fn Char32! utf8_to_char32(char* ptr, usz* size)
 		if (max_size < 2) return UnicodeResult.INVALID_UTF8!;
        *size = 2;
        Char32 uc = (c & 0x1F) << 6;
-		c = *ptr;
-		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+        c = *ptr;
+        // Overlong sequence or invalid second.
+		if (!uc || c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
 		return uc + c & 0x3F;
    }
    if ((c & 0xF0) == 0xE0)
@@ -157,10 +161,12 @@ fn Char32! utf8_to_char32(char* ptr, usz* size)
 		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
 		uc += (c & 0x3F) << 6;
 		c = ptr++[0];
-		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+		// Overlong sequence or invalid last
+		if (!uc || c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
 		return uc + c & 0x3F;
    }
    if (max_size < 4) return UnicodeResult.INVALID_UTF8!;
+    if ((c & 0xF8) != 0xF0) return UnicodeResult.INVALID_UTF8!;
    *size = 4;
    Char32 uc = (c & 0x07) << 18;
 	c = ptr++[0];
@@ -170,7 +176,8 @@ fn Char32! utf8_to_char32(char* ptr, usz* size)
 	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
 	uc += (c & 0x3F) << 6;
 	c = ptr++[0];
-	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+	// Overlong sequence or invalid last
+	if (!uc || c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
 	return uc + c & 0x3F;
 }

--- a/test/stdlib/conv_tests.c3
+++ b/test/stdlib/conv_tests.c3
@@ -0,0 +1,79 @@
+module conv_tests;
+import std::io;
+
+fn void! comparison_helper_32_to_8(Char32 c32, char[] expected_output) // Encodes c32 to UTF-8 and asserts the bytes equal expected_output; encoder failures are rethrown.
+{
+	char[8] out;
+	usz len = conv::char32_to_utf8(c32, &out, 4)?; // Only 4 of the 8 bytes are offered to the encoder.
+	assert(len == expected_output.len, "Unexpected encoded length");
+	foreach (i, c : expected_output)
+	{
+		assert(out[i] == c, "Expected other value");
+	}
+}
+
+fn void! comparison_helper_8_to_32(char[] in, Char32 c32) // Decodes `in` as one UTF-8 sequence and checks both the decoded value and the reported length.
+{
+	usz consumed = in.len; // Starts as the input length; the decoder overwrites it through the pointer.
+	Char32 decoded = conv::utf8_to_char32(in.ptr, &consumed)?;
+	assert(consumed == in.len, "All len should be used.");
+	assert(decoded == c32, "Expected character match.");
+}
+
+fn void assert_utf8_is_error(char[] in) // Asserts that decoding `in` as UTF-8 produces an error instead of a value.
+{
+	usz avail = in.len;
+	assert(catch(conv::utf8_to_char32(in.ptr, &avail)), "Expected error");
+}
+
+fn void! test_char32_ut8_boundary() @test
+{
+	// Encoder boundary checks: each call encodes one code point and compares the resulting UTF-8 bytes.
+	// Smallest code point of each encoded length (1 to 4 bytes)
+	comparison_helper_32_to_8(0x00000000, { 0 })?;
+	comparison_helper_32_to_8(0x00000080, { 0xc2, 0x80 })?;
+	comparison_helper_32_to_8(0x00000800, { 0xe0, 0xa0, 0x80 })?;
+	comparison_helper_32_to_8(0x00010000, { 0xf0, 0x90, 0x80, 0x80 })?;
+	assert(catch(comparison_helper_32_to_8(0x10ffff + 1, { 0 })), "Expected error"); // One past the largest accepted value must fail to encode.
+	// Largest code point of each encoded length
+	comparison_helper_32_to_8(0x0000007f, { 0x7f })?;
+	comparison_helper_32_to_8(0x000007ff, { 0xdf, 0xbf })?;
+	comparison_helper_32_to_8(0x0000ffff, { 0xef, 0xbf, 0xbf })?;
+	comparison_helper_32_to_8(0x0010ffff, { 0xf4, 0x8f, 0xbf, 0xbf })?;
+	// Other boundaries: the values on either side of the surrogate range (0xd7ff, 0xe000) and 0xfffd
+	comparison_helper_32_to_8(0x0000d7ff, { 0xed, 0x9f, 0xbf})?;
+	comparison_helper_32_to_8(0x0000e000, { 0xee, 0x80, 0x80 })?;
+	comparison_helper_32_to_8(0x0000fffd, { 0xef, 0xbf, 0xbd })?;
+}
+
+fn void! test_utf8_to_char32_boundary() @test
+{
+	// Decoder boundary checks: valid sequences must decode to the expected code point, malformed ones must error.
+	// Smallest code point of each encoded length (1 to 4 bytes)
+	comparison_helper_8_to_32("\0", 0x0 )?;
+	comparison_helper_8_to_32({ 0xc2, 0x80 }, 0x80)?;
+	comparison_helper_8_to_32({ 0xe0, 0xa0, 0x80 }, 0x800, )?;
+	comparison_helper_8_to_32({ 0xf0, 0x90, 0x80, 0x80 }, 0x10000)?;
+	// Largest code point of each encoded length
+	comparison_helper_8_to_32({ 0x7f }, 0x7f)?;
+	comparison_helper_8_to_32({ 0xdf, 0xbf }, 0x7ff, )?;
+	comparison_helper_8_to_32({ 0xef, 0xbf, 0xbf }, 0xffff)?;
+	comparison_helper_8_to_32({ 0xf4, 0x8f, 0xbf, 0xbf }, 0x10ffff)?;
+	// Other boundaries: the values on either side of the surrogate range (0xd7ff, 0xe000) and 0xfffd
+	comparison_helper_8_to_32({ 0xed, 0x9f, 0xbf }, 0xd7ff)?;
+	comparison_helper_8_to_32({ 0xee, 0x80, 0x80 }, 0xe000)?;
+	comparison_helper_8_to_32({ 0xef, 0xbf, 0xbd }, 0xfffd)?;
+	// Lone continuation bytes (0x80, 0xbf) and bytes that can never start a sequence (0xfe, 0xff)
+	assert_utf8_is_error({ 0x80 });
+	assert_utf8_is_error({ 0xbf });
+	assert_utf8_is_error({ 0xfe });
+	assert_utf8_is_error({ 0xff });
+	assert_utf8_is_error({ 0xfe, 0xfe, 0xff, 0xff });
+	// Overlong encodings of 0x2f, including the obsolete 5- and 6-byte lead bytes 0xf8 and 0xfc
+	assert_utf8_is_error({ 0xc0, 0xaf });
+	assert_utf8_is_error({ 0xe0, 0x80, 0xaf });
+	assert_utf8_is_error({ 0xf0, 0x80, 0x80, 0xaf });
+	assert_utf8_is_error({ 0xf8, 0x80, 0x80, 0xaf });
+	assert_utf8_is_error({ 0xfc, 0x80, 0x80, 0x80, 0xaf });
+}
+