Files
c3c/lib/std/encoding/codepage.c3
Christoffer Lerno acc4a900f5 - New const enum declaration syntax.
- New enum associated value syntax.
2026-02-12 14:43:56 +01:00

244 lines
8.3 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright (c) 2026 Koni Marti. All rights reserved.
// Use of this source code is governed by the MIT license.
<*
Module providing generic singlebyte code page to UTF8 conversion.
This module implements a compact, tabledriven approach for single-byte
(8bit) encodings (e.g. CP437, CP850, CP866, CP125x). It is designed so
that each concrete code page only needs to supply a small, static
mapping table; the conversion logic is shared.
The design has two main goals:
- Fast decode from code page to UTF8 with a single table lookup per byte.
- Memoryefficient encode from UTF8 to code page without a large
Unicodetobyte array (no 64k reverse map per code page).
The design of CodePageTable and the packed reverse mapping is conceptually
similar to golang.org/x/text/encoding/charmap.
*>
module std::encoding::codepage;
import std::sort;
<*
Default replacement byte used when encoding from UTF8 to a singlebyte
code page and a Unicode scalar cannot be represented.
By convention, 0x1A is the ASCII/IBM SUB (substitute) control character.
*>
const char REPLACEMENT_CHAR = 0x1a;
<*
CodePageTable contains the bidirectional mapping tables for a singlebyte code
page in a compact packed form.
to_codepoint is the forward map from codepage byte (0x000xFF) to its UTF8
sequence. The array index is the raw byte value, each entry occupying 4 bytes:
- Byte 0 is the length of the UTF8 sequence (04)
- Bytes 1:len are the UTF8 bytes for the mapped Unicode scalar
The table therefore uses 256 * 4 bytes and is stored as a flat
char[1024] array, where entry i starts at offset i*4.
from_codepoint is the reverse map from Unicode scalar value to codepage byte,
also stored as a packed char[1024] array. It contains 256 entries of 4 bytes
each, where each 4byte chunk is interpreted as a littleendian uint
with the following packing scheme:
high 8 bits = codepage byte value (0x000xFF)
low 24 bits = Unicode scalar value (code point)
In other words:
entry = (byte_value << 24) | codepoint;
Ordering:
The 256 packed uint entries in from_codepoint are sorted by the low 24 bits
(code points). This allows binary search over Unicode scalar values without
a 64k reverselookup table. For any given code page, there are at most
256 mappings, so a log2(256) or 8 step search is sufficient.
*>
struct CodePageTable
{
char[1024] to_codepoint;
char[1024] from_codepoint;
}
enum CodePage : (String name, CodePageTable* table)
{
CP1250 { "cp1250", &codepage::CP1250 },
CP1251 { "cp1251", &codepage::CP1251 },
CP1252 { "cp1252", &codepage::CP1252 },
CP1253 { "cp1253", &codepage::CP1253 },
CP1254 { "cp1254", &codepage::CP1254 },
CP1255 { "cp1255", &codepage::CP1255 },
CP1256 { "cp1256", &codepage::CP1256 },
CP1257 { "cp1257", &codepage::CP1257 },
CP1258 { "cp1258", &codepage::CP1258 },
CP437 { "cp437", &codepage::CP437 },
CP737 { "cp737", &codepage::CP737 },
CP775 { "cp775", &codepage::CP775 },
CP850 { "cp850", &codepage::CP850 },
CP852 { "cp852", &codepage::CP852 },
CP855 { "cp855", &codepage::CP855 },
CP857 { "cp857", &codepage::CP857 },
CP860 { "cp860", &codepage::CP860 },
CP861 { "cp861", &codepage::CP861 },
CP862 { "cp862", &codepage::CP862 },
CP863 { "cp863", &codepage::CP863 },
CP864 { "cp864", &codepage::CP864 },
CP865 { "cp865", &codepage::CP865 },
CP866 { "cp866", &codepage::CP866 },
CP869 { "cp869", &codepage::CP869 },
CP874 { "cp874", &codepage::CP874 },
ISO_8859_1 { "iso-8859-1", &codepage::ISO_8859_1 },
ISO_8859_10 { "iso-8859-10", &codepage::ISO_8859_10 },
ISO_8859_11 { "iso-8859-11", &codepage::ISO_8859_11 },
ISO_8859_13 { "iso-8859-13", &codepage::ISO_8859_13 },
ISO_8859_14 { "iso-8859-14", &codepage::ISO_8859_14 },
ISO_8859_15 { "iso-8859-15", &codepage::ISO_8859_15 },
ISO_8859_16 { "iso-8859-16", &codepage::ISO_8859_16 },
ISO_8859_2 { "iso-8859-2", &codepage::ISO_8859_2 },
ISO_8859_3 { "iso-8859-3", &codepage::ISO_8859_3 },
ISO_8859_4 { "iso-8859-4", &codepage::ISO_8859_4 },
ISO_8859_5 { "iso-8859-5", &codepage::ISO_8859_5 },
ISO_8859_6 { "iso-8859-6", &codepage::ISO_8859_6 },
ISO_8859_7 { "iso-8859-7", &codepage::ISO_8859_7 },
ISO_8859_8 { "iso-8859-8", &codepage::ISO_8859_8 },
ISO_8859_9 { "iso-8859-9", &codepage::ISO_8859_9 },
US_ASCII { "us-ascii", &codepage::US_ASCII },
}
<*
Returns a CodePage for the given charset name.
@param [in] charset_name : "A name, case insensitive, using _ or - for separator"
@return "The CodePage for the name"
@return? NOT_FOUND : "If the charset is unknown or unsupported"
*>
fn CodePage? by_name(String charset_name) => @pool()
{
String name = charset_name.treplace("_","-");
name.convert_to_lower();
foreach (page : CodePage.values)
{
if (page.name == charset_name) return page;
}
return NOT_FOUND~;
}
fn String? decode(Allocator allocator, char[] src, CodePage code_page)
{
char[] dst = allocator::alloc_array(allocator, char, decode_len(src, code_page));
return decode_buffer(src, dst, code_page);
}
<*
Decode a code-page byte buffer into a UTF8 string.
@param src : "Input byte array in the given code page."
@param dst : "Destination output string in UTF-8."
@param code_page : "Code page for this encoding."
@return "String in UTF-8."
*>
fn String? decode_buffer(char[] src, char[] dst, CodePage code_page)
{
usz n = 0;
CodePageTable *table = code_page.table;
foreach (c: src)
{
usz pos = (usz)c * 4;
char len = table.to_codepoint[pos];
dst[n:len] = table.to_codepoint[pos+1:len];
n += len;
}
return (String)dst[:n];
}
fn char[]? encode(Allocator allocator, char[] src, CodePage code_page, char replacement = REPLACEMENT_CHAR)
{
char[] dst = allocator::alloc_array(allocator, char, encode_len(src));
return encode_buffer(src, dst, code_page, replacement);
}
const uint MASK @private = (1u << 24) - 1;
<*
Encode a UTF8 string into a singlebyte code page.
@param src : "Input byte array in UTF-8"
@param dst : "Destination output byte array in the target code page"
@param code_page : "Code page for this encoding."
@param replacement : "Byte to emit when Unicode scalar cannot be represented in the target code page."
@return "Byte array in the given code page."
*>
fn char[]? encode_buffer(char[] src, char[] dst, CodePage code_page, char replacement = REPLACEMENT_CHAR)
{
// Unpack the packed reverse table once into a local uint[256] view.
uint[256] from_map;
CodePageTable *table = code_page.table;
for (usz i = 0; i < 256; i++)
{
UIntLE *val = (UIntLE*)&table.from_codepoint[i * 4];
from_map[i] = mem::load(val, 1).val;
}
usz out = 0;
usz n = src.len;
for (usz i = 0; i < n; )
{
usz rem = n - i;
if (rem > 4) rem = 4;
Char32 codepoint = conv::utf8_to_char32(&src[i], &rem)!;
i += rem;
// Binary search for codepoint in low 24 bits of each entry.
// Returned index is between [0..from_map.len).
usz index = sort::binarysearch(from_map[..], (uint)codepoint, fn int(uint lhs, uint rhs) => (int)(lhs & MASK) - (int)(rhs & MASK));
uint entry = from_map[index];
if ((entry & MASK) == (uint)codepoint)
{
char b = (char)(entry >> 24);
dst[out++] = b;
}
else
{
dst[out++] = replacement;
}
}
return dst[:out];
}
<*
Compute the number of UTF8 bytes produced when decoding src with the given
code page table.
@param src : "Input byte array in the given code page."
@param code_page : "Code page for this encoding."
*>
fn usz decode_len(char[] src, CodePage code_page) @inline
{
usz n;
CodePageTable *table = code_page.table;
foreach (usz c: src) n += table.to_codepoint[c *4 ];
return n;
}
<*
Compute the number of output bytes produced when
encoding src from UTF8 to a singlebyte code page.
@param src : "Input byte array in UTF-8"
*>
fn usz encode_len(char[] src) @inline
{
return conv::utf8_codepoints((String)src);
}