mirror of
https://github.com/c3lang/c3c.git
synced 2026-02-27 03:51:18 +00:00
codepage: add single-byte code page support (#2891)
* codepage: add single-byte code page support
Add std::encoding::codepage with a shared engine for converting between
single-byte code pages and UTF-8 using table-driven mappings.
Introduce generated tables and wrappers for several code pages[1] each
exposing encode/decode helpers built on a common CodePageTable
structure.
The mapping data is generated by cpgen[2] from the Unicode Consortium’s
published code page mapping files and follows the Unicode standard’s
interpretation of control characters (abstract characters) rather than
historical VGA glyph shapes.
[1] Code page overview/groups:
DOS/OEM code pages (legacy PC):
cp437 cp737 cp775 cp850 cp852 cp855 cp857 cp860 cp861 cp862 cp863
cp864 cp865 cp866 cp869 cp874
Windows code pages (ANSI/Windows):
cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258
ISO/IEC 8859 series (Latin/Regional):
iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5 iso_8859_6
iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11 iso_8859_13
iso_8859_14 iso_8859_15 iso_8859_16
[2] github.com/konimarti/cpgen
Signed-off-by: Koni Marti <koni.marti@gmail.com>
* codepage: change encoding format, streamline api
* Use enum to collect the data.
---------
Signed-off-by: Koni Marti <koni.marti@gmail.com>
Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
This commit is contained in:
243
lib/std/encoding/codepage.c3
Normal file
243
lib/std/encoding/codepage.c3
Normal file
@@ -0,0 +1,243 @@
|
|||||||
|
// Copyright (c) 2026 Koni Marti. All rights reserved.
|
||||||
|
// Use of this source code is governed by the MIT license.
|
||||||
|
<*
|
||||||
|
Module providing generic single‑byte code page to UTF‑8 conversion.
|
||||||
|
|
||||||
|
This module implements a compact, table‑driven approach for single-byte
|
||||||
|
(8‑bit) encodings (e.g. CP437, CP850, CP866, CP125x). It is designed so
|
||||||
|
that each concrete code page only needs to supply a small, static
|
||||||
|
mapping table; the conversion logic is shared.
|
||||||
|
|
||||||
|
The design has two main goals:
|
||||||
|
|
||||||
|
- Fast decode from code page to UTF‑8 with a single table lookup per byte.
|
||||||
|
- Memory‑efficient encode from UTF‑8 to code page without a large
|
||||||
|
Unicode‑to‑byte array (no 64k reverse map per code page).
|
||||||
|
|
||||||
|
The design of CodePageTable and the packed reverse mapping is conceptually
|
||||||
|
similar to golang.org/x/text/encoding/charmap.
|
||||||
|
|
||||||
|
*>
|
||||||
|
module std::encoding::codepage;
|
||||||
|
import std::sort;
|
||||||
|
|
||||||
|
<*
 Fallback byte written by the encoder when a Unicode scalar has no
 representation in the target single-byte code page.

 0x1A is SUB (substitute), the ASCII/IBM control character conventionally
 used for this purpose.
*>
const char REPLACEMENT_CHAR = 0x1A;
|
||||||
|
|
||||||
|
<*
|
||||||
|
CodePageTable contains the bidirectional mapping tables for a single‑byte code
|
||||||
|
page in a compact packed form.
|
||||||
|
|
||||||
|
to_codepoint is the forward map from code‑page byte (0x00–0xFF) to its UTF‑8
|
||||||
|
sequence. The array index is the raw byte value, each entry occupying 4 bytes:
|
||||||
|
|
||||||
|
- Byte 0 is the length of the UTF‑8 sequence (0–3, since only 3 payload bytes remain)
|
||||||
|
- Bytes 1:len are the UTF‑8 bytes for the mapped Unicode scalar
|
||||||
|
|
||||||
|
The table therefore uses 256 * 4 bytes and is stored as a flat
|
||||||
|
char[1024] array, where entry i starts at offset i*4.
|
||||||
|
|
||||||
|
from_codepoint is the reverse map from Unicode scalar value to code‑page byte,
|
||||||
|
also stored as a packed char[1024] array. It contains 256 entries of 4 bytes
|
||||||
|
each, where each 4‑byte chunk is interpreted as a little‑endian uint
|
||||||
|
with the following packing scheme:
|
||||||
|
|
||||||
|
high 8 bits = code‑page byte value (0x00–0xFF)
|
||||||
|
low 24 bits = Unicode scalar value (code point)
|
||||||
|
|
||||||
|
In other words:
|
||||||
|
|
||||||
|
entry = (byte_value << 24) | codepoint;
|
||||||
|
|
||||||
|
Ordering:
|
||||||
|
The 256 packed uint entries in from_codepoint are sorted by the low 24 bits
|
||||||
|
(code points). This allows binary search over Unicode scalar values without
|
||||||
|
a 64k reverse‑lookup table. For any given code page, there are at most
|
||||||
|
256 mappings, so a log2(256) or 8 step search is sufficient.
|
||||||
|
*>
|
||||||
|
struct CodePageTable
{
	// Forward map: 256 entries of 4 bytes, indexed by the code-page byte.
	// Each entry is a length byte followed by the UTF-8 bytes.
	char[256 * 4] to_codepoint;
	// Reverse map: 256 packed little-endian uint entries
	// ((byte_value << 24) | codepoint), sorted by code point.
	char[256 * 4] from_codepoint;
}
|
||||||
|
|
||||||
|
<*
 The supported single-byte code pages.

 Each value carries the charset name matched by by_name (lower case,
 '-' as separator) and a pointer to its mapping table. The tables
 themselves are defined outside this file.
*>
enum CodePage : (String name, CodePageTable* table)
{
	CP1250      = { "cp1250",      &codepage::CP1250 },
	CP1251      = { "cp1251",      &codepage::CP1251 },
	CP1252      = { "cp1252",      &codepage::CP1252 },
	CP1253      = { "cp1253",      &codepage::CP1253 },
	CP1254      = { "cp1254",      &codepage::CP1254 },
	CP1255      = { "cp1255",      &codepage::CP1255 },
	CP1256      = { "cp1256",      &codepage::CP1256 },
	CP1257      = { "cp1257",      &codepage::CP1257 },
	CP1258      = { "cp1258",      &codepage::CP1258 },
	CP437       = { "cp437",       &codepage::CP437 },
	CP737       = { "cp737",       &codepage::CP737 },
	CP775       = { "cp775",       &codepage::CP775 },
	CP850       = { "cp850",       &codepage::CP850 },
	CP852       = { "cp852",       &codepage::CP852 },
	CP855       = { "cp855",       &codepage::CP855 },
	CP857       = { "cp857",       &codepage::CP857 },
	CP860       = { "cp860",       &codepage::CP860 },
	CP861       = { "cp861",       &codepage::CP861 },
	CP862       = { "cp862",       &codepage::CP862 },
	CP863       = { "cp863",       &codepage::CP863 },
	CP864       = { "cp864",       &codepage::CP864 },
	CP865       = { "cp865",       &codepage::CP865 },
	CP866       = { "cp866",       &codepage::CP866 },
	CP869       = { "cp869",       &codepage::CP869 },
	CP874       = { "cp874",       &codepage::CP874 },
	ISO_8859_1  = { "iso-8859-1",  &codepage::ISO_8859_1 },
	ISO_8859_10 = { "iso-8859-10", &codepage::ISO_8859_10 },
	ISO_8859_11 = { "iso-8859-11", &codepage::ISO_8859_11 },
	ISO_8859_13 = { "iso-8859-13", &codepage::ISO_8859_13 },
	ISO_8859_14 = { "iso-8859-14", &codepage::ISO_8859_14 },
	ISO_8859_15 = { "iso-8859-15", &codepage::ISO_8859_15 },
	ISO_8859_16 = { "iso-8859-16", &codepage::ISO_8859_16 },
	ISO_8859_2  = { "iso-8859-2",  &codepage::ISO_8859_2 },
	ISO_8859_3  = { "iso-8859-3",  &codepage::ISO_8859_3 },
	ISO_8859_4  = { "iso-8859-4",  &codepage::ISO_8859_4 },
	ISO_8859_5  = { "iso-8859-5",  &codepage::ISO_8859_5 },
	ISO_8859_6  = { "iso-8859-6",  &codepage::ISO_8859_6 },
	ISO_8859_7  = { "iso-8859-7",  &codepage::ISO_8859_7 },
	ISO_8859_8  = { "iso-8859-8",  &codepage::ISO_8859_8 },
	ISO_8859_9  = { "iso-8859-9",  &codepage::ISO_8859_9 },
	US_ASCII    = { "us-ascii",    &codepage::US_ASCII },
}
|
||||||
|
|
||||||
|
<*
 Returns a CodePage for the given charset name.

 The name is normalized before the lookup: every '_' is replaced by '-'
 and the result is lower-cased, so "ISO_8859_1" and "iso-8859-1" resolve
 to the same code page.

 @param [in] charset_name : "A name, case insensitive, using _ or - for separator"
 @return "The CodePage for the name"

 @return? NOT_FOUND : "If the charset is unknown or unsupported"
*>
fn CodePage? by_name(String charset_name) => @pool()
{
	// Work on a temporary, normalized copy so the caller's string is untouched.
	String name = charset_name.treplace("_", "-");
	name.convert_to_lower();
	foreach (page : CodePage.values)
	{
		// Compare against the normalized name, not the raw argument;
		// otherwise the case/separator normalization above has no effect.
		if (page.name == name) return page;
	}
	return NOT_FOUND~;
}
|
||||||
|
|
||||||
|
<*
 Decode a code-page byte buffer into a newly allocated UTF-8 string.

 The output buffer is sized with decode_len and filled by decode_buffer.

 @param allocator : "Allocator used for the output buffer."
 @param src : "Input byte array in the given code page."
 @param code_page : "Code page for this encoding."
 @return "String in UTF-8."
*>
fn String? decode(Allocator allocator, char[] src, CodePage code_page)
{
	usz needed = decode_len(src, code_page);
	char[] dst = allocator::alloc_array(allocator, char, needed);
	return decode_buffer(src, dst, code_page);
}
|
||||||
|
|
||||||
|
<*
 Decode a code-page byte buffer into UTF-8, writing into a caller-supplied
 buffer.

 @param src : "Input byte array in the given code page."
 @param dst : "Destination output string in UTF-8."
 @param code_page : "Code page for this encoding."
 @return "String in UTF-8."
*>
fn String? decode_buffer(char[] src, char[] dst, CodePage code_page)
{
	CodePageTable* table = code_page.table;
	usz written = 0;
	for (usz i = 0; i < src.len; i++)
	{
		// Each forward-table entry is 4 bytes: a length byte followed by
		// that many UTF-8 bytes.
		usz entry = (usz)src[i] * 4;
		char count = table.to_codepoint[entry];

		dst[written:count] = table.to_codepoint[entry + 1:count];
		written += count;
	}
	return (String)dst[:written];
}
|
||||||
|
|
||||||
|
<*
 Encode a UTF-8 byte buffer into a newly allocated single-byte code page
 buffer.

 The output buffer is sized with encode_len and filled by encode_buffer.

 @param allocator : "Allocator used for the output buffer."
 @param src : "Input byte array in UTF-8"
 @param code_page : "Code page for this encoding."
 @param replacement : "Byte to emit when Unicode scalar cannot be represented in the target code page."
 @return "Byte array in the given code page."
*>
fn char[]? encode(Allocator allocator, char[] src, CodePage code_page, char replacement = REPLACEMENT_CHAR)
{
	usz needed = encode_len(src);
	char[] dst = allocator::alloc_array(allocator, char, needed);
	return encode_buffer(src, dst, code_page, replacement);
}
|
||||||
|
|
||||||
|
// Selects the low 24 bits (the code point) of a packed reverse-map entry.
const uint MASK @private = 0x00FFFFFF;
|
||||||
|
|
||||||
|
<*
 Encode a UTF‑8 string into a single‑byte code page.

 Every decoded Unicode scalar produces exactly one output byte: the mapped
 code-page byte, or `replacement` when the scalar is not in the table.
 A fault from conv::utf8_to_char32 is propagated to the caller.

 @param src : "Input byte array in UTF-8"
 @param dst : "Destination output byte array in the target code page"
 @param code_page : "Code page for this encoding."
 @param replacement : "Byte to emit when Unicode scalar cannot be represented in the target code page."
 @return "Byte array in the given code page."
*>
fn char[]? encode_buffer(char[] src, char[] dst, CodePage code_page, char replacement = REPLACEMENT_CHAR)
{
	// Unpack the packed reverse table once into a local uint[256] view.
	uint[256] from_map;
	CodePageTable* table = code_page.table;
	for (usz i = 0; i < 256; i++)
	{
		// The table is a plain char array, so each little-endian entry is
		// read through a UIntLE view (second argument to mem::load is
		// presumably the alignment).
		UIntLE* val = (UIntLE*)&table.from_codepoint[i * 4];
		from_map[i] = mem::load(val, 1).val;
	}

	usz out = 0;
	usz n = src.len;
	for (usz i = 0; i < n; )
	{
		// Offer the decoder at most 4 bytes; rem is then used to advance i.
		usz rem = n - i;
		if (rem > 4) rem = 4;

		Char32 codepoint = conv::utf8_to_char32(&src[i], &rem)!;
		i += rem;

		// Binary search for codepoint in low 24 bits of each entry.
		// The result is an insertion index and may equal from_map.len when
		// the code point is greater than every entry in the table.
		usz index = sort::binarysearch(from_map[..], (uint)codepoint, fn int(uint lhs, uint rhs) => (int)(lhs & MASK) - (int)(rhs & MASK));

		char b = replacement;
		if (index < from_map.len)
		{
			uint entry = from_map[index];
			// Only an exact code point match is a valid mapping.
			if ((entry & MASK) == (uint)codepoint) b = (char)(entry >> 24);
		}
		dst[out++] = b;
	}

	return dst[:out];
}
|
||||||
|
|
||||||
|
<*
 Compute the number of UTF‑8 bytes produced when decoding src with the given
 code page table.

 @param src : "Input byte array in the given code page."
 @param code_page : "Code page for this encoding."
 @return "Number of UTF-8 bytes needed for the decoded output."
*>
fn usz decode_len(char[] src, CodePage code_page) @inline
{
	CodePageTable* table = code_page.table;
	usz total = 0;
	foreach (b : src)
	{
		// The first byte of each 4-byte entry is the UTF-8 length for that input byte.
		total += table.to_codepoint[(usz)b * 4];
	}
	return total;
}
|
||||||
|
|
||||||
|
<*
 Compute the number of output bytes produced when
 encoding src from UTF‑8 to a single‑byte code page.

 The result is the code point count reported by conv::utf8_codepoints.

 @param src : "Input byte array in UTF-8"
 @return "Number of bytes needed for the encoded output."
*>
fn usz encode_len(char[] src) @inline
{
	String text = (String)src;
	return conv::utf8_codepoints(text);
}
|
||||||
|
|
||||||
2234
lib/std/encoding/codepage_private.c3
Normal file
2234
lib/std/encoding/codepage_private.c3
Normal file
File diff suppressed because it is too large
Load Diff
@@ -17,6 +17,7 @@
|
|||||||
- Add Murmur3 hash.
|
- Add Murmur3 hash.
|
||||||
- Add optional line-length limitations to `io::readline` and `io::readline_to_stream`. #2879
|
- Add optional line-length limitations to `io::readline` and `io::readline_to_stream`. #2879
|
||||||
- Add Xorshiro128++.
|
- Add Xorshiro128++.
|
||||||
|
- Add single-byte code page support (DOS/OEM, Windows/ANSI, and ISO/IEC 8859).
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
- Add error message if directory with output file name already exists
|
- Add error message if directory with output file name already exists
|
||||||
|
|||||||
92
test/unit/stdlib/encoding/codepage.c3
Normal file
92
test/unit/stdlib/encoding/codepage.c3
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
// Copyright (c) 2026 Koni Marti. All rights reserved.
|
||||||
|
// Use of this source code is governed by the MIT license.
|
||||||
|
module std::encoding::codepage_test;
|
||||||
|
import std::encoding::codepage;
|
||||||
|
|
||||||
|
// Decodes a fixed CP437 box (four rows of 40 bytes) to UTF-8 and encodes it
// back, checking both directions against known-good data.
fn void test_cp437() @test
{
	// The padding inside the box must match the 0x20 runs in `bytes` below:
	// two spaces after the left border, then six (row 2) or two (row 3)
	// spaces before the right border.
	String want = "╔══════════════════════════════════════╗"
	              "║  SYSTEM STATUS: OK - 25°C ± 2°C      ║"
	              "║  Café Menu: Crème Brûlée .... £5.00  ║"
	              "╚══════════════════════════════════════╝";

	char[] bytes =
		x"C9CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDBB"
		x"BA202053595354454D205354415455533A204F4B202D203235F84320F12032F843202020202020BA"
		x"BA202043616682204D656E753A2043728A6D65204272966C8265202E2E2E2E209C352E30302020BA"
		x"C8CDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDCDBC";

	@pool()
	{
		String got = codepage::decode(tmem, bytes, codepage::by_name("cp437"))!!;
		assert(got == want, "cp437 decoding failed: got=%s, want=%s", got, want);

		got = (String)codepage::encode(tmem, got[..], codepage::by_name("cp437"))!!;
		assert(got == (String)bytes, "cp437 encoding failed: got=%s, want=%s",
			got, (String)bytes);
	};
}
|
||||||
|
|
||||||
|
// Box-drawing, accented Latin and Greek/math symbols through CP437 and back.
fn void test_cp437_roundtrip() @test
{
	check_roundtrip("╔══ CP437: Café, π≈3.14 ══╗", "cp437");
}
|
||||||
|
|
||||||
|
// Western European accented text through CP850 and back.
fn void test_cp850_roundtrip() @test
{
	check_roundtrip("CP850: Crème Brûlée, Frühstück, £10.50", "cp850");
}
|
||||||
|
|
||||||
|
// Cyrillic text through CP866 and back.
fn void test_cp866_roundtrip() @test
{
	check_roundtrip("CP866: Привет мир!", "cp866");
}
|
||||||
|
|
||||||
|
// French text through CP863 and back.
fn void test_cp863_roundtrip() @test
{
	check_roundtrip("CP863: Québec, érable, Noël", "cp863");
}
|
||||||
|
|
||||||
|
// Latin-1 text through ISO-8859-1 and back.
fn void test_iso8859_1() @test
{
	check_roundtrip("ISO-8859-1: Café, Ångström, Straße, Noël, £10.50", "iso-8859-1");
}
|
||||||
|
|
||||||
|
// Polish text through ISO-8859-2 and back.
fn void test_iso8859_2_polish() @test
{
	check_roundtrip("Polski: Zażółć gęślą jaźń", "iso-8859-2");
}
|
||||||
|
|
||||||
|
// Czech text through ISO-8859-2 and back.
fn void test_iso8859_2_czech() @test
{
	check_roundtrip("Česky: Příliš žluťoučký kůň úpěl ďábelské ódy", "iso-8859-2");
}
|
||||||
|
|
||||||
|
// Hungarian text through ISO-8859-2 and back.
fn void test_iso8859_2_hungarian() @test
{
	check_roundtrip("Magyar: Árvíztűrő tükörfúrógép", "iso-8859-2");
}
|
||||||
|
|
||||||
|
// Shared helper: encodes `utf8` into the code page named by `charset`,
// decodes the result again and asserts that the text is unchanged.
fn void check_roundtrip(String utf8, String charset) => @pool()
{
	CodePage page = codepage::by_name(charset)!!;

	// UTF-8 -> code page bytes.
	char[] encoded = codepage::encode(tmem, utf8[..], page)!!;

	// Code page bytes -> UTF-8 again.
	String decoded = codepage::decode(tmem, encoded, page)!!;

	assert(decoded == utf8,
		"roundtrip failed: got=%s, want=%s", decoded, utf8);
}
|
||||||
|
|
||||||
Reference in New Issue
Block a user