codepage: add single-byte code page support (#2891)

* codepage: add single-byte code page support

Add std::encoding::codepage with a shared engine for converting between
single-byte code pages and UTF-8 using table-driven mappings.

Introduce generated tables and wrappers for several code pages[1], each
exposing encode/decode helpers built on a common CodePageTable
structure.

The mapping data is generated by cpgen[2] from the Unicode Consortium’s
published code page mapping files and follows the Unicode standard’s
interpretation of control characters (abstract characters) rather than
historical VGA glyph shapes.

[1] Code page overview/groups:

    DOS/OEM code pages (legacy PC):
    cp437 cp737 cp775 cp850 cp852 cp855 cp857 cp860 cp861 cp862 cp863
    cp864 cp865 cp866 cp869 cp874

    Windows code pages (ANSI/Windows):
    cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258

    ISO/IEC 8859 series (Latin/Regional):
    iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5 iso_8859_6
    iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11 iso_8859_13
    iso_8859_14 iso_8859_15 iso_8859_16

[2] github.com/konimarti/cpgen

Signed-off-by: Koni Marti <koni.marti@gmail.com>

* codepage: change encoding format, streamline api

* Use enum to collect the data.

---------

Signed-off-by: Koni Marti <koni.marti@gmail.com>
Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
This commit is contained in:
konimarti
2026-02-11 01:10:12 +01:00
committed by GitHub
parent a80e40a798
commit 40e6a2c4a3
4 changed files with 2570 additions and 0 deletions

View File

@@ -0,0 +1,92 @@
// Copyright (c) 2026 Koni Marti. All rights reserved.
// Use of this source code is governed by the MIT license.
module std::encoding::codepage_test;
import std::encoding::codepage;
// Decodes a fixed CP437 byte fixture into UTF-8 and compares it with the
// expected text, then encodes the decoded text back to CP437 and compares it
// with the original bytes.
fn void test_cp437() @test
{
	// Every row is exactly 40 characters wide so that it lines up with the
	// 40-byte rows of the CP437 fixture below. The padding spaces inside the
	// box are significant: 0x20 bytes in the fixture must appear here 1:1.
	String want = "╔══════════════════════════════════════╗"
	              "║  SYSTEM STATUS: OK - 25°C ± 2°C      ║"
	              "║  Café Menu: Crème Brûlée .... £5.00  ║"
	              "╚══════════════════════════════════════╝";
	// CP437 encoding of `want`, one 40-byte row per hex literal.
	char[] bytes =
		x"C9CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDBB"
		x"BA202053595354454D205354415455533A204F4B202D203235F84320F12032F843202020202020BA"
		x"BA202043616682204D656E753A2043728A6D65204272966C8265202E2E2E2E209C352E30302020BA"
		x"C8CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDBC";
	@pool()
	{
		// Resolve the code page once and reuse it for both directions.
		CodePage code_page = codepage::by_name("cp437")!!;

		// CP437 -> UTF-8.
		String got = codepage::decode(tmem, bytes, code_page)!!;
		assert(got == want, "cp437 decoding failed: got=%s, want=%s", got, want);

		// UTF-8 -> CP437; the result must reproduce the fixture bytes.
		got = (String)codepage::encode(tmem, got[..], code_page)!!;
		assert(got == (String)bytes, "cp437 encoding failed: got=%s, want=%s",
			got, (String)bytes);
	};
}
// Round-trips a string mixing box-drawing characters, an accented letter and
// math symbols through the "cp437" code page.
fn void test_cp437_roundtrip() @test
{
	check_roundtrip("╔══ CP437: Café, π≈3.14 ══╗", "cp437");
}
// Round-trips a string with accented Latin letters and a pound sign through
// the "cp850" code page.
fn void test_cp850_roundtrip() @test
{
	check_roundtrip("CP850: Crème Brûlée, Frühstück, £10.50", "cp850");
}
// Round-trips a Cyrillic string through the "cp866" code page.
fn void test_cp866_roundtrip() @test
{
	check_roundtrip("CP866: Привет мир!", "cp866");
}
// Round-trips a string with accented Latin letters through the "cp863"
// code page.
fn void test_cp863_roundtrip() @test
{
	check_roundtrip("CP863: Québec, érable, Noël", "cp863");
}
// Round-trips a string with accented Latin letters, a sharp s and a pound
// sign through the code page looked up as "iso-8859-1".
fn void test_iso8859_1() @test
{
	check_roundtrip("ISO-8859-1: Café, Ångström, Straße, Noël, £10.50", "iso-8859-1");
}
// Round-trips a Polish sample string through the code page looked up as
// "iso-8859-2".
fn void test_iso8859_2_polish() @test
{
	check_roundtrip("Polski: Zażółć gęślą jaźń", "iso-8859-2");
}
// Round-trips a Czech sample string through the code page looked up as
// "iso-8859-2".
fn void test_iso8859_2_czech() @test
{
	check_roundtrip("Česky: Příliš žluťoučký kůň úpěl ďábelské ódy", "iso-8859-2");
}
// Round-trips a Hungarian sample string through the code page looked up as
// "iso-8859-2".
fn void test_iso8859_2_hungarian() @test
{
	check_roundtrip("Magyar: Árvíztűrő tükörfúrógép", "iso-8859-2");
}
// Shared helper for the round-trip tests: looks up the code page named by
// `charset`, encodes `utf8` with it, decodes the resulting bytes again and
// asserts that the decoded text equals the input. Temporary allocations are
// made from `tmem` inside a `@pool()` scope.
fn void check_roundtrip(String utf8, String charset)
{
	@pool()
	{
		CodePage cp = codepage::by_name(charset)!!;

		// UTF-8 -> code page bytes.
		char[] encoded = codepage::encode(tmem, utf8[..], cp)!!;

		// Code page bytes -> UTF-8; must reproduce the input exactly.
		String decoded = codepage::decode(tmem, encoded, cp)!!;
		assert(decoded == utf8,
			"roundtrip failed: got=%s, want=%s", decoded, utf8);
	};
}