Files
c3c/lib/std/encoding/codepage.c3
konimarti 40e6a2c4a3 codepage: add single-byte code page support (#2891)
* codepage: add single-byte code page support

Add std::encoding::codepage with a shared engine for converting between
single-byte code pages and UTF-8 using table-driven mappings.

Introduce generated tables and wrappers for several code pages[1] each
exposing encode/decode helpers built on a common CodePageTable
structure.

The mapping data is generated by cpgen[2] from the Unicode Consortium’s
published code page mapping files and follows the Unicode standard’s
interpretation of control characters (abstract characters) rather than
historical VGA glyph shapes.

[1] Code page overview/groups:

    DOS/OEM code pages (legacy PC):
    cp437 cp737 cp775 cp850 cp852 cp855 cp857 cp860 cp861 cp862 cp863
    cp864 cp865 cp866 cp869 cp874

    Windows code pages (ANSI/Windows):
    cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258

    ISO/IEC 8859 series (Latin/Regional):
    iso_8859_1 iso_8859_2 iso_8859_3 iso_8859_4 iso_8859_5 iso_8859_6
    iso_8859_7 iso_8859_8 iso_8859_9 iso_8859_10 iso_8859_11 iso_8859_13
    iso_8859_14 iso_8859_15 iso_8859_16

[2] github.com/konimarti/cpgen

Signed-off-by: Koni Marti <koni.marti@gmail.com>

* codepage: change encoding format, streamline api

* Use enum to collect the data.

---------

Signed-off-by: Koni Marti <koni.marti@gmail.com>
Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
2026-02-11 01:10:12 +01:00

244 lines
8.2 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright (c) 2026 Koni Marti. All rights reserved.
// Use of this source code is governed by the MIT license.
<*
Module providing generic single-byte code page to UTF-8 conversion.
This module implements a compact, table-driven approach for single-byte
(8-bit) encodings (e.g. CP437, CP850, CP866, CP125x). It is designed so
that each concrete code page only needs to supply a small, static
mapping table; the conversion logic is shared.
The design has two main goals:
- Fast decode from code page to UTF-8 with a single table lookup per byte.
- Memory-efficient encode from UTF-8 to code page without a large
Unicode-to-byte array (no 64k reverse map per code page).
The design of CodePageTable and the packed reverse mapping is conceptually
similar to golang.org/x/text/encoding/charmap.
*>
module std::encoding::codepage;
import std::sort;
<*
Default replacement byte used when encoding from UTF-8 to a single-byte
code page and a Unicode scalar cannot be represented.
By convention, 0x1A is the ASCII/IBM SUB (substitute) control character.
It is the default value of the `replacement` parameter of encode and
encode_buffer.
*>
const char REPLACEMENT_CHAR = 0x1a;
<*
CodePageTable contains the bidirectional mapping tables for a single-byte code
page in a compact packed form.
to_codepoint is the forward map from code-page byte (0x00-0xFF) to its UTF-8
sequence. The array index is the raw byte value, each entry occupying 4 bytes:
- Byte 0 is the length of the UTF-8 sequence (0-4)
- Bytes 1:len are the UTF-8 bytes for the mapped Unicode scalar
The table therefore uses 256 * 4 bytes and is stored as a flat
char[1024] array, where entry i starts at offset i*4.
NOTE(review): with one length byte per 4-byte entry only 3 bytes remain for
the UTF-8 sequence, so a stored length of 4 would run into the next entry -
confirm the intended range is 0-3.
from_codepoint is the reverse map from Unicode scalar value to code-page byte,
also stored as a packed char[1024] array. It contains 256 entries of 4 bytes
each, where each 4-byte chunk is interpreted as a little-endian uint
with the following packing scheme:
high 8 bits = code-page byte value (0x00-0xFF)
low 24 bits = Unicode scalar value (code point)
In other words:
entry = (byte_value << 24) | codepoint;
Ordering:
The 256 packed uint entries in from_codepoint are sorted by the low 24 bits
(code points). This allows binary search over Unicode scalar values without
a 64k reverse-lookup table. For any given code page, there are at most
256 mappings, so a log2(256) or 8-step search is sufficient.
*>
struct CodePageTable
{
// Forward map: 256 entries of [length, utf8 bytes...], indexed by byte value * 4.
char[1024] to_codepoint;
// Reverse map: 256 little-endian packed uints, sorted by code point.
char[1024] from_codepoint;
}
<*
Enumeration of the supported single-byte code pages.
Each value carries two associated fields:
- name: the canonical charset name, lower-case with '-' as separator
(for example "cp1252" or "iso-8859-15"); by_name matches against it.
- table: pointer to the CodePageTable holding that code page's mappings.
The codepage::XXX tables referenced here are not defined in this part of
the file (presumably they are the generated tables - confirm).
*>
enum CodePage : (String name, CodePageTable* table)
{
CP1250 = { "cp1250", &codepage::CP1250 },
CP1251 = { "cp1251", &codepage::CP1251 },
CP1252 = { "cp1252", &codepage::CP1252 },
CP1253 = { "cp1253", &codepage::CP1253 },
CP1254 = { "cp1254", &codepage::CP1254 },
CP1255 = { "cp1255", &codepage::CP1255 },
CP1256 = { "cp1256", &codepage::CP1256 },
CP1257 = { "cp1257", &codepage::CP1257 },
CP1258 = { "cp1258", &codepage::CP1258 },
CP437 = { "cp437", &codepage::CP437 },
CP737 = { "cp737", &codepage::CP737 },
CP775 = { "cp775", &codepage::CP775 },
CP850 = { "cp850", &codepage::CP850 },
CP852 = { "cp852", &codepage::CP852 },
CP855 = { "cp855", &codepage::CP855 },
CP857 = { "cp857", &codepage::CP857 },
CP860 = { "cp860", &codepage::CP860 },
CP861 = { "cp861", &codepage::CP861 },
CP862 = { "cp862", &codepage::CP862 },
CP863 = { "cp863", &codepage::CP863 },
CP864 = { "cp864", &codepage::CP864 },
CP865 = { "cp865", &codepage::CP865 },
CP866 = { "cp866", &codepage::CP866 },
CP869 = { "cp869", &codepage::CP869 },
CP874 = { "cp874", &codepage::CP874 },
ISO_8859_1 = { "iso-8859-1", &codepage::ISO_8859_1 },
ISO_8859_10 = { "iso-8859-10", &codepage::ISO_8859_10 },
ISO_8859_11 = { "iso-8859-11", &codepage::ISO_8859_11 },
ISO_8859_13 = { "iso-8859-13", &codepage::ISO_8859_13 },
ISO_8859_14 = { "iso-8859-14", &codepage::ISO_8859_14 },
ISO_8859_15 = { "iso-8859-15", &codepage::ISO_8859_15 },
ISO_8859_16 = { "iso-8859-16", &codepage::ISO_8859_16 },
ISO_8859_2 = { "iso-8859-2", &codepage::ISO_8859_2 },
ISO_8859_3 = { "iso-8859-3", &codepage::ISO_8859_3 },
ISO_8859_4 = { "iso-8859-4", &codepage::ISO_8859_4 },
ISO_8859_5 = { "iso-8859-5", &codepage::ISO_8859_5 },
ISO_8859_6 = { "iso-8859-6", &codepage::ISO_8859_6 },
ISO_8859_7 = { "iso-8859-7", &codepage::ISO_8859_7 },
ISO_8859_8 = { "iso-8859-8", &codepage::ISO_8859_8 },
ISO_8859_9 = { "iso-8859-9", &codepage::ISO_8859_9 },
US_ASCII = { "us-ascii", &codepage::US_ASCII },
}
<*
 Returns a CodePage for the given charset name.

 The name is normalized before the lookup: every '_' is replaced by '-' and
 the result is lower-cased, then compared against the canonical `name` of
 each CodePage value. "ISO_8859_1", "iso-8859-1" and "Iso-8859-1" therefore
 all resolve to the same code page.

 @param [in] charset_name : "A name, case insensitive, using _ or - for separator"
 @return "The CodePage for the name"
 @return? NOT_FOUND : "If the charset is unknown or unsupported"
*>
fn CodePage? by_name(String charset_name) => @pool()
{
	// treplace yields a temporary string (scoped by the enclosing @pool), so
	// the in-place lower-casing below does not touch the caller's string.
	String name = charset_name.treplace("_", "-");
	name.convert_to_lower();
	foreach (page : CodePage.values)
	{
		// Compare against the normalized copy; comparing the raw input here
		// would defeat the separator and case normalization done above.
		if (page.name == name) return page;
	}
	return NOT_FOUND~;
}
<*
 Decode a code-page byte buffer into a newly allocated UTF-8 string.

 The output buffer is sized with decode_len and filled by decode_buffer.

 @param allocator : "Allocator used for the returned string."
 @param src : "Input byte array in the given code page."
 @param code_page : "Code page for this encoding."
 @return "String in UTF-8, allocated with the given allocator."
*>
fn String? decode(Allocator allocator, char[] src, CodePage code_page)
{
	usz utf8_size = decode_len(src, code_page);
	char[] buffer = allocator::alloc_array(allocator, char, utf8_size);
	return decode_buffer(src, buffer, code_page);
}
<*
 Decode a code-page byte buffer into a UTF-8 string, writing into a
 caller-supplied buffer.

 Every input byte is looked up in the forward table of the code page and its
 UTF-8 sequence is appended to dst. dst must hold at least
 decode_len(src, code_page) bytes.

 @param src : "Input byte array in the given code page."
 @param dst : "Destination output buffer receiving the UTF-8 bytes."
 @param code_page : "Code page for this encoding."
 @return "String in UTF-8; a view of the written prefix of dst."
*>
fn String? decode_buffer(char[] src, char[] dst, CodePage code_page)
{
	CodePageTable* table = code_page.table;
	usz written;
	for (usz i = 0; i < src.len; i++)
	{
		// Each forward-map entry is 4 bytes wide: [length, utf8 bytes...].
		usz entry = (usz)src[i] * 4;
		char utf8_len = table.to_codepoint[entry];
		dst[written:utf8_len] = table.to_codepoint[entry + 1:utf8_len];
		written += utf8_len;
	}
	return (String)dst[:written];
}
<*
 Encode a UTF-8 byte array into a newly allocated buffer in the given
 single-byte code page.

 The output buffer is sized with encode_len and filled by encode_buffer.
 If encode_buffer fails, the buffer is released again before the error is
 passed on, so a failed call does not leak memory from the allocator.

 @param allocator : "Allocator used for the returned byte array."
 @param src : "Input byte array in UTF-8"
 @param code_page : "Code page for this encoding."
 @param replacement : "Byte to emit when Unicode scalar cannot be represented in the target code page."
 @return "Byte array in the given code page, allocated with the given allocator."
*>
fn char[]? encode(Allocator allocator, char[] src, CodePage code_page, char replacement = REPLACEMENT_CHAR)
{
	char[] dst = allocator::alloc_array(allocator, char, encode_len(src));
	// On the error path the caller never receives dst, so free it here.
	defer catch allocator::free(allocator, dst.ptr);
	return encode_buffer(src, dst, code_page, replacement);
}
// Selects the low 24 bits (the code point) of a packed reverse-map entry.
const uint MASK @private = (1u << 24) - 1;
<*
 Encode a UTF-8 string into a single-byte code page, writing into a
 caller-supplied buffer.

 Each code point decoded from src is looked up in the reverse table of the
 code page by binary search; exactly one byte is written to dst per code
 point, either the mapped byte or `replacement`. dst must therefore hold at
 least encode_len(src) bytes. An error from decoding the UTF-8 input is
 returned to the caller.

 @param src : "Input byte array in UTF-8"
 @param dst : "Destination output byte array in the target code page"
 @param code_page : "Code page for this encoding."
 @param replacement : "Byte to emit when Unicode scalar cannot be represented in the target code page."
 @return "Byte array in the given code page; a view of the written prefix of dst."
*>
fn char[]? encode_buffer(char[] src, char[] dst, CodePage code_page, char replacement = REPLACEMENT_CHAR)
{
	// Unpack the packed reverse table once into a local uint[256] view.
	uint[256] from_map;
	CodePageTable *table = code_page.table;
	for (usz i = 0; i < 256; i++)
	{
		// Entries are stored little-endian at byte granularity, hence the
		// load with alignment 1.
		UIntLE *val = (UIntLE*)&table.from_codepoint[i * 4];
		from_map[i] = mem::load(val, 1).val;
	}
	usz out = 0;
	usz n = src.len;
	for (usz i = 0; i < n; )
	{
		// Offer at most 4 bytes to the decoder; `rem` is then used as the
		// number of bytes to advance (presumably the bytes consumed).
		usz rem = n - i;
		if (rem > 4) rem = 4;
		Char32 codepoint = conv::utf8_to_char32(&src[i], &rem)!;
		i += rem;
		// Binary search for codepoint in low 24 bits of each entry.
		// The result is an insertion position and may equal from_map.len when
		// the code point is greater than every key in the table, so it must
		// be bounds-checked before it is used as an index.
		usz index = sort::binarysearch(from_map[..], (uint)codepoint, fn int(uint lhs, uint rhs) => (int)(lhs & MASK) - (int)(rhs & MASK));
		char b = replacement;
		if (index < from_map.len)
		{
			uint entry = from_map[index];
			// Only an exact key match is a mapping; the high 8 bits hold the byte.
			if ((entry & MASK) == (uint)codepoint) b = (char)(entry >> 24);
		}
		dst[out++] = b;
	}
	return dst[:out];
}
<*
 Compute the number of UTF-8 bytes produced when decoding src with the given
 code page table.

 This sums the length byte (byte 0 of each 4-byte forward-map entry) over
 all input bytes.

 @param src : "Input byte array in the given code page."
 @param code_page : "Code page for this encoding."
 @return "Total number of UTF-8 bytes needed for the decoded output."
*>
fn usz decode_len(char[] src, CodePage code_page) @inline
{
	CodePageTable* table = code_page.table;
	usz total;
	foreach (char b : src)
	{
		total += table.to_codepoint[(usz)b * 4];
	}
	return total;
}
<*
 Compute the number of output bytes produced when
 encoding src from UTF-8 to a single-byte code page.

 The result is the code point count of src, as encoding emits exactly one
 byte per code point.

 @param src : "Input byte array in UTF-8"
 @return "Number of bytes the encoded output will occupy."
*>
fn usz encode_len(char[] src) @inline
{
	String text = (String)src;
	return conv::utf8_codepoints(text);
}