Files
c3c/test/unit/stdlib/encoding/codepage.c3
konimarti 40e6a2c4a3 codepage: add single-byte code page support (#2891)
* codepage: add single-byte code page support

Add std::encoding::codepage with a shared engine for converting between
single-byte code pages and UTF-8 using table-driven mappings.

Introduce generated tables and wrappers for several code pages[1], each
exposing encode/decode helpers built on a common CodePageTable
structure.

The mapping data is generated by cpgen[2] from the Unicode Consortium’s
published code page mapping files and follows the Unicode standard’s
interpretation of control characters (abstract characters) rather than
historical VGA glyph shapes.

[1] Code page overview/groups:

    DOS/OEM code pages (legacy PC):
    cp437 cp737 cp775 cp850 cp852 cp855 cp857 cp860 cp861 cp862 cp863
    cp864 cp865 cp866 cp869 cp874

    Windows code pages (ANSI/Windows):
    cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258

    ISO/IEC 8859 series (Latin/Regional):
    iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5 iso_8859_6
    iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11 iso_8859_13
    iso_8859_14 iso_8859_15 iso_8859_16

[2] github.com/konimarti/cpgen

Signed-off-by: Koni Marti <koni.marti@gmail.com>

* codepage: change encoding format, streamline api

* Use enum to collect the data.

---------

Signed-off-by: Koni Marti <koni.marti@gmail.com>
Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
2026-02-11 01:10:12 +01:00

93 lines
2.8 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright (c) 2026 Koni Marti. All rights reserved.
// Use of this source code is governed by the MIT license.
module std::encoding::codepage_test;
import std::encoding::codepage;
// Decodes a four-row CP437 box-drawing banner to UTF-8 and compares it with
// the expected text, then encodes that text back to CP437 and compares the
// result with the original bytes.
fn void test_cp437() @test
{
	// Every row is exactly 40 characters wide. The padding inside the frame
	// must match the 0x20 bytes in `bytes` one-to-one, otherwise the decode
	// comparison below cannot succeed.
	String want = "╔══════════════════════════════════════╗"
	              "║  SYSTEM STATUS: OK - 25°C ± 2°C      ║"
	              "║  Café Menu: Crème Brûlée .... £5.00  ║"
	              "╚══════════════════════════════════════╝";
	// One 40-byte CP437 row per hex literal, in the same order as `want`.
	char[] bytes =
		x"C9CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDBB"
		x"BA202053595354454D205354415455533A204F4B202D203235F84320F12032F843202020202020BA"
		x"BA202043616682204D656E753A2043728A6D65204272966C8265202E2E2E2E209C352E30302020BA"
		x"C8CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDBC";
	@pool()
	{
		// Resolve the code page once, the same way check_roundtrip does.
		CodePage cp437 = codepage::by_name("cp437")!!;

		// CP437 bytes -> UTF-8.
		String decoded = codepage::decode(tmem, bytes, cp437)!!;
		assert(decoded == want, "cp437 decoding failed: got=%s, want=%s", decoded, want);

		// UTF-8 -> CP437 bytes; must reproduce the input exactly.
		String encoded = (String)codepage::encode(tmem, decoded[..], cp437)!!;
		assert(encoded == (String)bytes, "cp437 encoding failed: got=%s, want=%s",
			encoded, (String)bytes);
	};
}
// Round-trips a string mixing box-drawing characters, accented Latin letters
// and Greek/math symbols through the "cp437" code page.
fn void test_cp437_roundtrip() @test
{
	check_roundtrip("╔══ CP437: Café, π≈3.14 ══╗", "cp437");
}
// Round-trips French and German accented text plus a pound sign through the
// "cp850" code page.
fn void test_cp850_roundtrip() @test
{
	check_roundtrip("CP850: Crème Brûlée, Frühstück, £10.50", "cp850");
}
// Round-trips Cyrillic text through the "cp866" code page.
fn void test_cp866_roundtrip() @test
{
	check_roundtrip("CP866: Привет мир!", "cp866");
}
// Round-trips French accented text through the "cp863" code page.
fn void test_cp863_roundtrip() @test
{
	check_roundtrip("CP863: Québec, érable, Noël", "cp863");
}
// Round-trips Western European text (accents, ring, sharp s, diaeresis,
// pound sign) through the code page looked up as "iso-8859-1".
fn void test_iso8859_1() @test
{
	check_roundtrip("ISO-8859-1: Café, Ångström, Straße, Noël, £10.50", "iso-8859-1");
}
// Round-trips a Polish sentence with diacritics through the code page looked
// up as "iso-8859-2".
fn void test_iso8859_2_polish() @test
{
	check_roundtrip("Polski: Zażółć gęślą jaźń", "iso-8859-2");
}
// Round-trips a Czech sentence with diacritics through the code page looked
// up as "iso-8859-2".
fn void test_iso8859_2_czech() @test
{
	check_roundtrip("Česky: Příliš žluťoučký kůň úpěl ďábelské ódy", "iso-8859-2");
}
// Round-trips a Hungarian phrase with diacritics through the code page looked
// up as "iso-8859-2".
fn void test_iso8859_2_hungarian() @test
{
	check_roundtrip("Magyar: Árvíztűrő tükörfúrógép", "iso-8859-2");
}
// Shared helper for the round-trip tests: looks up the code page named by
// `charset`, encodes `utf8` into it, decodes the bytes back and asserts that
// the decoded text equals the input. Any failing step aborts via `!!`.
// All work is done with the `tmem` allocator inside an `@pool()` scope.
fn void check_roundtrip(String utf8, String charset)
{
	@pool()
	{
		CodePage page = codepage::by_name(charset)!!;

		// UTF-8 -> code page bytes.
		char[] encoded = codepage::encode(tmem, utf8[..], page)!!;

		// Code page bytes -> UTF-8, which must match the original text.
		String decoded = codepage::decode(tmem, encoded, page)!!;
		assert(decoded == utf8,
			"roundtrip failed: got=%s, want=%s", decoded, utf8);
	};
}