Add stdlib UTF-8 conversion tests and tighten conv.c3 validation.

- CI: run test/stdlib/conv_tests.c3 with `c3c compile-test` after the
  example runs (two places in the workflow).
- char32_to_utf8: use inclusive range bounds (0x7f, 0x7ff, 0xffff) and fail
  for values above 0x10ffff.
- utf8_to_char32: reject overlong 2/3/4-byte forms, lead bytes 0xf8..0xff,
  and 4-byte sequences that decode above 0x10ffff. The range checks run on
  `uc` before the final 6 payload bits are added; every threshold is a
  multiple of 0x40, so the result equals checking the finished value.

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f19ffaf6c..ba8d99cf0 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -205,6 +205,11 @@ jobs:
           ../build/c3c compile-run examples/fannkuch-redux.c3
           ../build/c3c compile-run examples/contextfree/boolerr.c3
 
+      - name: Compile run stdlib tests
+        run: |
+          cd test
+          ../build/c3c compile-test stdlib/conv_tests.c3 -g1 --safe
+
       - name: Build testproject
         run: |
           cd resources/testproject
@@ -271,6 +276,11 @@ jobs:
           ../build/c3c compile-run examples/fannkuch-redux.c3
           ../build/c3c compile-run examples/contextfree/boolerr.c3
 
+      - name: Compile run stdlib tests
+        run: |
+          cd test
+          ../build/c3c compile-test stdlib/conv_tests.c3 -g1 --safe
+
       - name: Build testproject
         run: |
           cd resources/testproject
diff --git a/lib/std/core/conv.c3 b/lib/std/core/conv.c3
index 2cab52169..625334674 100644
--- a/lib/std/core/conv.c3
+++ b/lib/std/core/conv.c3
@@ -19,27 +19,30 @@ fn usz! char32_to_utf8(Char32 c, char* output, usz available)
 	if (!available) return UnicodeResult.CONVERSION_FAILED!;
 	switch (true)
 	{
-		case c < 0x7f:
+		case c <= 0x7f:
 			output[0] = (char)c;
 			return 1;
-		case c < 0x7ff:
+		case c <= 0x7ff:
 			if (available < 2) return UnicodeResult.CONVERSION_FAILED!;
 			output[0] = (char)(0xC0 | c >> 6);
 			output[1] = (char)(0x80 | (c & 0x3F));
 			return 2;
-		case c < 0xffff:
+		case c <= 0xffff:
 			if (available < 3) return UnicodeResult.CONVERSION_FAILED!;
 			output[0] = (char)(0xE0 | c >> 12);
 			output[1] = (char)(0x80 | (c >> 6 & 0x3F));
 			output[2] = (char)(0x80 | (c & 0x3F));
 			return 3;
-		default:
+		case c <= 0x10ffff:
 			if (available < 4) return UnicodeResult.CONVERSION_FAILED!;
 			output[0] = (char)(0xF0 | c >> 18);
 			output[1] = (char)(0x80 | (c >> 12 & 0x3F));
 			output[2] = (char)(0x80 | (c >> 6 & 0x3F));
 			output[3] = (char)(0x80 | (c & 0x3F));
 			return 4;
+		default:
+			// Values above 0x10FFFF are not valid code points.
+			return UnicodeResult.CONVERSION_FAILED!;
 	}
 }
 
@@ -144,8 +147,9 @@ fn Char32! utf8_to_char32(char* ptr, usz* size)
 		if (max_size < 2) return UnicodeResult.INVALID_UTF8!;
 		*size = 2;
 		Char32 uc = (c & 0x1F) << 6;
-		c = *ptr;
-		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+		c = *ptr;
+		// Overlong (value below 0x80) or invalid second byte.
+		if (uc < 0x80 || c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
 		return uc + c & 0x3F;
 	}
 	if ((c & 0xF0) == 0xE0)
@@ -157,10 +161,12 @@ fn Char32! utf8_to_char32(char* ptr, usz* size)
 		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
 		uc += (c & 0x3F) << 6;
 		c = ptr++[0];
-		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+		// Overlong (value below 0x800) or invalid last byte.
+		if (uc < 0x800 || c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
 		return uc + c & 0x3F;
 	}
 	if (max_size < 4) return UnicodeResult.INVALID_UTF8!;
+	if ((c & 0xF8) != 0xF0) return UnicodeResult.INVALID_UTF8!;
 	*size = 4;
 	Char32 uc = (c & 0x07) << 18;
 	c = ptr++[0];
@@ -170,7 +176,8 @@ fn Char32! utf8_to_char32(char* ptr, usz* size)
 	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
 	uc += (c & 0x3F) << 6;
 	c = ptr++[0];
-	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+	// Overlong (below 0x10000), above 0x10FFFF, or invalid last byte.
+	if (uc < 0x10000 || uc > 0x10ffff || c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
 	return uc + c & 0x3F;
 }
 
diff --git a/test/stdlib/conv_tests.c3 b/test/stdlib/conv_tests.c3
new file mode 100644
index 000000000..7fd916156
--- /dev/null
+++ b/test/stdlib/conv_tests.c3
@@ -0,0 +1,88 @@
+module conv_tests;
+import std::io;
+
+// Encodes c32 and asserts that the resulting UTF-8 bytes equal expected_output.
+fn void! comparison_helper_32_to_8(Char32 c32, char[] expected_output)
+{
+	char[8] out;
+	usz len = conv::char32_to_utf8(c32, &out, 4)?;
+	assert(len == expected_output.len, "Unexpected encoded length");
+	foreach (i, c : expected_output)
+	{
+		assert(out[i] == c, "Expected other value");
+	}
+}
+
+// Decodes `in` and asserts that it yields c32 and reports every byte as used.
+fn void! comparison_helper_8_to_32(char[] in, Char32 c32)
+{
+	usz len = in.len;
+	Char32 res = conv::utf8_to_char32(in.ptr, &len)?;
+	assert(len == in.len, "All len should be used.");
+	assert(res == c32, "Expected character match.");
+}
+
+// Asserts that decoding `in` fails.
+fn void assert_utf8_is_error(char[] in)
+{
+	usz len = in.len;
+	assert(catch(conv::utf8_to_char32(in.ptr, &len)), "Expected error");
+}
+
+fn void! test_char32_ut8_boundary() @test
+{
+	// First sequence per len
+	comparison_helper_32_to_8(0x00000000, { 0 })?;
+	comparison_helper_32_to_8(0x00000080, { 0xc2, 0x80 })?;
+	comparison_helper_32_to_8(0x00000800, { 0xe0, 0xa0, 0x80 })?;
+	comparison_helper_32_to_8(0x00010000, { 0xf0, 0x90, 0x80, 0x80 })?;
+	assert(catch(comparison_helper_32_to_8(0x10ffff + 1, { 0 })), "Expected error");
+	// Last seq per len
+	comparison_helper_32_to_8(0x0000007f, { 0x7f })?;
+	comparison_helper_32_to_8(0x000007ff, { 0xdf, 0xbf })?;
+	comparison_helper_32_to_8(0x0000ffff, { 0xef, 0xbf, 0xbf })?;
+	comparison_helper_32_to_8(0x0010ffff, { 0xf4, 0x8f, 0xbf, 0xbf })?;
+	// Other boundaries
+	comparison_helper_32_to_8(0x0000d7ff, { 0xed, 0x9f, 0xbf})?;
+	comparison_helper_32_to_8(0x0000e000, { 0xee, 0x80, 0x80 })?;
+	comparison_helper_32_to_8(0x0000fffd, { 0xef, 0xbf, 0xbd })?;
+
+}
+
+fn void! test_utf8_to_char32_boundary() @test
+{
+	// First sequence per len
+	comparison_helper_8_to_32("\0", 0x0 )?;
+	comparison_helper_8_to_32({ 0xc2, 0x80 }, 0x80)?;
+	comparison_helper_8_to_32({ 0xe0, 0xa0, 0x80 }, 0x800, )?;
+	comparison_helper_8_to_32({ 0xf0, 0x90, 0x80, 0x80 }, 0x10000)?;
+	// Last seq per len
+	comparison_helper_8_to_32({ 0x7f }, 0x7f)?;
+	comparison_helper_8_to_32({ 0xdf, 0xbf }, 0x7ff, )?;
+	comparison_helper_8_to_32({ 0xef, 0xbf, 0xbf }, 0xffff)?;
+	comparison_helper_8_to_32({ 0xf4, 0x8f, 0xbf, 0xbf }, 0x10ffff)?;
+	// Other boundaries
+	comparison_helper_8_to_32({ 0xed, 0x9f, 0xbf }, 0xd7ff)?;
+	comparison_helper_8_to_32({ 0xee, 0x80, 0x80 }, 0xe000)?;
+	comparison_helper_8_to_32({ 0xef, 0xbf, 0xbd }, 0xfffd)?;
+
+	assert_utf8_is_error({ 0x80 });
+	assert_utf8_is_error({ 0xbf });
+	assert_utf8_is_error({ 0xfe });
+	assert_utf8_is_error({ 0xff });
+	assert_utf8_is_error({ 0xfe, 0xfe, 0xff, 0xff });
+
+	// Overlong
+	assert_utf8_is_error({ 0xc0, 0xaf });
+	assert_utf8_is_error({ 0xe0, 0x80, 0xaf });
+	assert_utf8_is_error({ 0xf0, 0x80, 0x80, 0xaf });
+	assert_utf8_is_error({ 0xf8, 0x80, 0x80, 0xaf });
+	assert_utf8_is_error({ 0xfc, 0x80, 0x80, 0x80, 0xaf });
+	// Overlong with non-zero leading payload bits
+	assert_utf8_is_error({ 0xc1, 0xbf });
+	assert_utf8_is_error({ 0xe0, 0x9f, 0xbf });
+	assert_utf8_is_error({ 0xf0, 0x8f, 0xbf, 0xbf });
+	// Beyond the last code point (0x10FFFF)
+	assert_utf8_is_error({ 0xf4, 0x90, 0x80, 0x80 });
+}
+