codepage: add single-byte code page support (#2891)

* codepage: add single-byte code page support Add std::encoding::codepage with a shared engine for converting between single-byte code pages and UTF-8 using table-driven mappings. Introduce generated tables and wrappers for several code pages[1] each exposing encode/decode helpers built on a common CodePageTable structure. The mapping data is generated by cpgen[2] from the Unicode Consortium’s published code page mapping files and follows the Unicode standard’s interpretation of control characters (abstract characters) rather than historical VGA glyph shapes. [1] Code page overview/groups: DOS/OEM code pages (legacy PC): cp437 cp737 cp775 cp850 cp852 cp855 cp857 cp860 cp861 cp862 cp863 cp864 cp865 cp866 cp869 cp874 Windows code pages (ANSI/Windows): cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258 ISO/IEC 8859 series (Latin/Regional): iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5 iso_8859_6 iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11 iso_8859_13 iso_8859_14 iso_8859_15 iso_8859_16 [2] github.com/konimarti/cpgen Signed-off-by: Koni Marti <koni.marti@gmail.com> * codepage: change encoding format, streamline api * Use enum to collect the data. --------- Signed-off-by: Koni Marti <koni.marti@gmail.com> Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
2026-02-27 03:51:18 +00:00 · 2026-02-11 01:10:12 +01:00
parent a80e40a798
commit 40e6a2c4a3
4 changed files with 2570 additions and 0 deletions
--- a/test/unit/stdlib/encoding/codepage.c3
+++ b/test/unit/stdlib/encoding/codepage.c3
@@ -0,0 +1,92 @@
+// Copyright (c) 2026 Koni Marti. All rights reserved.
+// Use of this source code is governed by the MIT license.
+module std::encoding::codepage_test;
+import std::encoding::codepage;
+
+// Decodes a fixed cp437 byte sequence (a box-drawing status panel) and checks
+// the result against the expected UTF-8 text, then encodes that text back to
+// cp437 and checks that the bytes match the original input.
+fn void test_cp437() @test
+{
+	String want = "╔══════════════════════════════════════╗"
+                      "║  SYSTEM STATUS: OK - 25°C ± 2°C      ║"
+                      "║  Café Menu: Crème Brûlée .... £5.00  ║"
+                      "╚══════════════════════════════════════╝";
+
+	char[] bytes =
+	x"C9CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDBB"
+	x"BA202053595354454D205354415455533A204F4B202D203235F84320F12032F843202020202020BA"
+	x"BA202043616682204D656E753A2043728A6D65204272966C8265202E2E2E2E209C352E30302020BA"
+	x"C8CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDBC";
+
+	@pool()
+	{
+		// Resolve the code page once and unwrap it up front, the same way
+		// check_roundtrip does, so a failed lookup is reported at the lookup
+		// instead of being folded into the decode/encode result.
+		CodePage code_page = codepage::by_name("cp437")!!;
+
+		String decoded = codepage::decode(tmem, bytes, code_page)!!;
+		assert(decoded == want, "cp437 decoding failed: got=%s, want=%s", decoded, want);
+
+		// Keep the encoded output in its own byte slice rather than reusing
+		// the String that holds the decoded UTF-8 text.
+		char[] encoded = codepage::encode(tmem, decoded[..], code_page)!!;
+		assert((String)encoded == (String)bytes, "cp437 encoding failed: got=%s, want=%s",
+			(String)encoded, (String)bytes);
+	};
+}
+
+// Round-trips a sample mixing box-drawing characters, an accented letter and
+// the symbols "π" and "≈" through cp437.
+fn void test_cp437_roundtrip() @test
+{
+	check_roundtrip("╔══ CP437: Café, π≈3.14 ══╗", "cp437");
+}
+
+// Round-trips a sample with French and German accented letters and the pound
+// sign through cp850.
+fn void test_cp850_roundtrip() @test
+{
+	check_roundtrip("CP850: Crème Brûlée, Frühstück, £10.50", "cp850");
+}
+
+// Round-trips a sample containing Cyrillic text through cp866.
+fn void test_cp866_roundtrip() @test
+{
+	check_roundtrip("CP866: Привет мир!", "cp866");
+}
+
+// Round-trips a sample with French accented letters through cp863.
+fn void test_cp863_roundtrip() @test
+{
+	check_roundtrip("CP863: Québec, érable, Noël", "cp863");
+}
+
+// Round-trips a sample of Western European accented letters, "ß" and the
+// pound sign through the code page looked up as "iso-8859-1".
+fn void test_iso8859_1() @test
+{
+	check_roundtrip("ISO-8859-1: Café, Ångström, Straße, Noël, £10.50", "iso-8859-1");
+}
+
+// Round-trips a Polish sample with diacritics through the code page looked
+// up as "iso-8859-2".
+fn void test_iso8859_2_polish() @test
+{
+	check_roundtrip("Polski: Zażółć gęślą jaźń", "iso-8859-2");
+}
+
+// Round-trips a Czech sample with diacritics through the code page looked
+// up as "iso-8859-2".
+fn void test_iso8859_2_czech() @test
+{
+	check_roundtrip("Česky: Příliš žluťoučký kůň úpěl ďábelské ódy", "iso-8859-2");
+}
+
+// Round-trips a Hungarian sample with diacritics through the code page looked
+// up as "iso-8859-2".
+fn void test_iso8859_2_hungarian() @test
+{
+	check_roundtrip("Magyar: Árvíztűrő tükörfúrógép", "iso-8859-2");
+}
+
+// Shared helper for the round-trip tests: looks up the code page named by
+// `charset`, encodes `utf8` into it, decodes the resulting bytes again and
+// asserts that the decoded text equals the input. Each step is unwrapped
+// with `!!`, and all work happens inside a single `@pool()` scope using `tmem`.
+fn void check_roundtrip(String utf8, String charset)
+{
+	@pool()
+	{
+		CodePage cp = codepage::by_name(charset)!!;
+
+		// UTF-8 -> code page bytes.
+		char[] encoded = codepage::encode(tmem, utf8[..], cp)!!;
+
+		// Code page bytes -> UTF-8, which must match the original text.
+		String decoded = codepage::decode(tmem, encoded, cp)!!;
+
+		assert(decoded == utf8, "roundtrip failed: got=%s, want=%s", decoded, utf8);
+	};
+}
+