mirror of
https://github.com/c3lang/c3c.git
synced 2026-02-27 12:01:16 +00:00
* codepage: add single-byte code page support
Add std::encoding::codepage with a shared engine for converting between
single-byte code pages and UTF-8 using table-driven mappings.
Introduce generated tables and wrappers for several code pages[1], each
exposing encode/decode helpers built on a common CodePageTable
structure.
The mapping data is generated by cpgen[2] from the Unicode Consortium’s
published code page mapping files and follows the Unicode standard’s
interpretation of control characters (abstract characters) rather than
historical VGA glyph shapes.
[1] Code page overview/groups:
DOS/OEM code pages (legacy PC):
cp437 cp737 cp775 cp850 cp852 cp855 cp857 cp860 cp861 cp862 cp863
cp864 cp865 cp866 cp869 cp874
Windows code pages (ANSI/Windows):
cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258
ISO/IEC 8859 series (Latin/Regional):
iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5 iso_8859_6
iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11 iso_8859_13
iso_8859_14 iso_8859_15 iso_8859_16
[2] github.com/konimarti/cpgen
Signed-off-by: Koni Marti <koni.marti@gmail.com>
* codepage: change encoding format, streamline api
* Use enum to collect the data.
---------
Signed-off-by: Koni Marti <koni.marti@gmail.com>
Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
93 lines
2.8 KiB
Plaintext
93 lines
2.8 KiB
Plaintext
// Copyright (c) 2026 Koni Marti. All rights reserved.
|
||
// Use of this source code is governed by the MIT license.
|
||
module std::encoding::codepage_test;
|
||
import std::encoding::codepage;
|
||
|
||
// Decodes a fixed CP437 byte fixture and checks it against the expected UTF-8
// text, then encodes that text again and checks that the original bytes come
// back unchanged.
fn void test_cp437() @test
{
	// Every row of the box is exactly 40 cells wide. The padding spaces inside
	// the frame must match the 0x20 bytes of the fixture below one-for-one
	// (two leading spaces on the text rows, six / two trailing spaces).
	String want = "╔══════════════════════════════════════╗"
	              "║  SYSTEM STATUS: OK - 25°C ± 2°C      ║"
	              "║  Café Menu: Crème Brûlée .... £5.00  ║"
	              "╚══════════════════════════════════════╝";

	// The same four rows as raw CP437 bytes (40 bytes per row, no separators).
	char[] bytes =
		x"C9CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDBB"
		x"BA202053595354454D205354415455533A204F4B202D203235F84320F12032F843202020202020BA"
		x"BA202043616682204D656E753A2043728A6D65204272966C8265202E2E2E2E209C352E30302020BA"
		x"C8CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDBC";

	// Resolve the code page once, as check_roundtrip does; `!!` aborts the
	// test if the name is unknown.
	CodePage cp437 = codepage::by_name("cp437")!!;

	@pool()
	{
		// CP437 bytes -> UTF-8.
		String got = codepage::decode(tmem, bytes, cp437)!!;
		assert(got == want, "cp437 decoding failed: got=%s, want=%s", got, want);

		// UTF-8 -> CP437 bytes; compared as String so the byte runs can be
		// checked with == and printed with %s.
		String encoded = (String)codepage::encode(tmem, got[..], cp437)!!;
		assert(encoded == (String)bytes, "cp437 encoding failed: got=%s, want=%s",
			encoded, (String)bytes);
	};
}
|
||
|
||
// CP437 round trip over box-drawing characters, an accented letter and the
// symbols π and ≈.
fn void test_cp437_roundtrip() @test
{
	check_roundtrip("╔══ CP437: Café, π≈3.14 ══╗", "cp437");
}
|
||
|
||
// CP850 round trip over French and German accented letters and the pound sign.
fn void test_cp850_roundtrip() @test
{
	check_roundtrip("CP850: Crème Brûlée, Frühstück, £10.50", "cp850");
}
|
||
|
||
// CP866 round trip over a short Cyrillic phrase.
fn void test_cp866_roundtrip() @test
{
	check_roundtrip("CP866: Привет мир!", "cp866");
}
|
||
|
||
// CP863 round trip over French text containing é and ë.
fn void test_cp863_roundtrip() @test
{
	check_roundtrip("CP863: Québec, érable, Noël", "cp863");
}
|
||
|
||
// ISO-8859-1 round trip over Western European accented letters, ß and £.
fn void test_iso8859_1() @test
{
	check_roundtrip("ISO-8859-1: Café, Ångström, Straße, Noël, £10.50", "iso-8859-1");
}
|
||
|
||
// ISO-8859-2 round trip over a Polish phrase made up largely of diacritic letters.
fn void test_iso8859_2_polish() @test
{
	check_roundtrip("Polski: Zażółć gęślą jaźń", "iso-8859-2");
}
|
||
|
||
// ISO-8859-2 round trip over a Czech sentence rich in diacritic letters.
fn void test_iso8859_2_czech() @test
{
	check_roundtrip("Česky: Příliš žluťoučký kůň úpěl ďábelské ódy", "iso-8859-2");
}
|
||
|
||
// ISO-8859-2 round trip over a Hungarian phrase, including ő and ű.
fn void test_iso8859_2_hungarian() @test
{
	check_roundtrip("Magyar: Árvíztűrő tükörfúrógép", "iso-8859-2");
}
|
||
|
||
// Shared helper for the round-trip tests: encodes `utf8` into the code page
// named by `charset`, decodes the resulting bytes again and asserts that the
// text is unchanged. The lookup and both conversions are unwrapped with `!!`,
// so any failure aborts the test. Conversions allocate from `tmem` inside the
// enclosing `@pool()` scope.
fn void check_roundtrip(String utf8, String charset) => @pool()
{
	CodePage cp = codepage::by_name(charset)!!;

	// UTF-8 text -> single-byte code page representation.
	char[] encoded = codepage::encode(tmem, utf8[..], cp)!!;

	// Code page bytes -> UTF-8 text, which must equal the input.
	String decoded = codepage::decode(tmem, encoded, cp)!!;

	assert(decoded == utf8,
		"roundtrip failed: got=%s, want=%s", decoded, utf8);
}
|
||
|