c3c/lib/std/core/string_escape.c3

// Copyright (c) 2024 Christoffer Lerno. All rights reserved.
// Use of this source code is governed by the MIT license
// a copy of which can be found in the LICENSE_STDLIB file.

<*
 This module provides functionality for escaping and unescaping strings
 with standard C-style escape sequences, similar to what's used in JSON
 and other string literals.
*>
module std::core::string;
import std::io;

faultdef INVALID_ESCAPE_SEQUENCE, UNTERMINATED_STRING, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE;

<*
 Escape a string by converting special characters to escape sequences,
 optionally wrapping the result in double quotes.

 Double quote, backslash and the control characters BS, FF, LF, CR, TAB, VT
 and NUL are written as two-character backslash escapes. Every other byte
 outside printable ASCII (32..126) is written as a backslash-x escape with
 two lowercase hex digits.

 @param s : "The string to escape"
 @param allocator : "The allocator to use for the result"
 @param strip_quotes : "Do not include beginning and end quotes, defaults to true (note that tescape defaults to false)"
 @return "The escaped string, surrounded by quotes unless strip_quotes is set, can safely be cast to ZString"
*>
fn String String.escape(String s, Allocator allocator, bool strip_quotes = true)
{
    // Conservative allocation: most strings need minimal escaping
    usz initial_capacity = s.len + s.len / 5 + 2; // ~1.2x + quotes
    DString result = dstring::new_with_capacity(allocator, initial_capacity);
    // The caller receives a copy, so the builder's own buffer must be
    // released; the defer runs after the return value has been produced.
    defer result.free();

    if (!strip_quotes) result.append_char('"');

    foreach (char c : s)
    {
        switch (c)
        {
            case '"':  result.append(`\"`);
            case '\\': result.append(`\\`);
            case '\b': result.append(`\b`);
            case '\f': result.append(`\f`);
            case '\n': result.append(`\n`);
            case '\r': result.append(`\r`);
            case '\t': result.append(`\t`);
            case '\v': result.append(`\v`);
            case '\0': result.append(`\0`);
            default:
                if (c >= 32 && c <= 126)
                {
                    // Printable ASCII is copied through unchanged
                    result.append_char(c);
                }
                else
                {
                    // Remaining control bytes and bytes >= 127: hex escape
                    result.appendf("\\x%02x", (uint)c);
                }
        }
    }

    if (!strip_quotes) result.append_char('"');
    return result.copy_str(allocator);
}

<*
 Convenience form of escape that allocates the result with the temp allocator.

 @param s : "The string to escape"
 @param strip_quotes : "Do not include beginning and end quotes, defaults to false"
 @return "The escaped string, surrounded by quotes unless strip_quotes is set"
*>
fn String String.tescape(String s, bool strip_quotes = false)
{
    // Forward to the allocator-taking variant using tmem.
    return s.escape(tmem, strip_quotes);
}

<*
 Compute how many bytes the escaped form of a string occupies, counting
 the two surrounding quote characters.

 @param s : "The string to measure"
 @return "The byte length of the quoted, escaped version"
*>
fn usz escape_len(String s)
{
    // Start with room for the opening and closing quote.
    usz total = 2;
    for (usz idx = 0; idx < s.len; idx++)
    {
        char ch = s[idx];
        // Characters with a dedicated two-character escape.
        bool short_escape = ch == '"' || ch == '\\'
            || ch == '\b' || ch == '\f' || ch == '\n' || ch == '\r'
            || ch == '\t' || ch == '\v' || ch == '\0';
        if (short_escape)
        {
            total += 2; // backslash + one character
        }
        else if (ch < 32 || ch > 126)
        {
            total += 4; // backslash, 'x' and two hex digits
        }
        else
        {
            total += 1; // printable ASCII is emitted as-is
        }
    }
    return total;
}

<*
 Unescape a string by parsing backslash escape sequences, removing the
 surrounding double quotes when present.

 Recognized escapes are the two-character forms for double quote, backslash,
 forward slash, b, f, n, r, t, v and 0, the one-byte hex form (x followed by
 two hex digits), and the code point forms (u followed by four hex digits,
 U followed by eight hex digits) which are appended with append_char32.

 @param s : "The quoted string to unescape"
 @param allocator : "The allocator to use for the result"
 @param allow_unquoted : "Set to true to unescape strings not surrounded by quotes, defaults to false"
 @return "The unescaped string without quotes, safe to convert to ZString"
 @return? UNTERMINATED_STRING, INVALID_ESCAPE_SEQUENCE, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE
*>
fn String? String.unescape(String s, Allocator allocator, bool allow_unquoted = false)
{
    if (s.len >= 2 && s[0] == '"' && s[^1] == '"')
    {
        // Remove quotes: start at index 1, keep len - 2 bytes.
        s = s[1:^2];
    }
    else if (!allow_unquoted) return UNTERMINATED_STRING?;

    // Handle empty string case
    if (!s.len)
    {
        return "".copy(allocator);
    }

    DString result = dstring::new_with_capacity(allocator, s.len);
    // The caller receives a copy; release the builder on success and on
    // every error return below.
    defer result.free();

    usz len = s.len;
    for (usz i = 0; i < len; i++)
    {
        char c = s[i];
        if (c != '\\')
        {
            result.append_char(c);
            continue;
        }

        // A backslash must be followed by at least one character.
        if (i + 1 >= len) return INVALID_ESCAPE_SEQUENCE?;

        char escape_char = s[++i];
        switch (escape_char)
        {
            case '"':  result.append_char('"');
            case '\\': result.append_char('\\');
            case '/':  result.append_char('/');
            case 'b':  result.append_char('\b');
            case 'f':  result.append_char('\f');
            case 'n':  result.append_char('\n');
            case 'r':  result.append_char('\r');
            case 't':  result.append_char('\t');
            case 'v':  result.append_char('\v');
            case '0':  result.append_char('\0');
            case 'x':
                // Hex escape: exactly two hex digits must follow the 'x'.
                if (i + 2 >= len) return INVALID_HEX_ESCAPE?;
                char h1 = s[++i];
                char h2 = s[++i];
                if (!h1.is_xdigit() || !h2.is_xdigit()) return INVALID_HEX_ESCAPE?;
                uint byte_value = (unescape_hex_value(h1) << 4) + unescape_hex_value(h2);
                result.append_char((char)byte_value);
            case 'u':
            case 'U':
                // Unicode escape: 'u' takes four hex digits, 'U' takes eight.
                usz digits = escape_char == 'u' ? 4 : 8;
                if (i + digits >= len) return INVALID_UNICODE_ESCAPE?;
                uint code_point = 0;
                for (usz j = 0; j < digits; j++)
                {
                    char hex_char = s[++i];
                    if (!hex_char.is_xdigit()) return INVALID_UNICODE_ESCAPE?;
                    code_point = (code_point << 4) + unescape_hex_value(hex_char);
                }
                // Values above U+10FFFF are not Unicode code points and
                // cannot be encoded as UTF-8.
                if (code_point > 0x10FFFF) return INVALID_UNICODE_ESCAPE?;
                result.append_char32(code_point);
            default:
                return INVALID_ESCAPE_SEQUENCE?;
        }
    }

    return result.copy_str(allocator);
}

<*
 Convert a single hexadecimal digit character to its numeric value.

 @param c : "A character for which is_xdigit() is true"
 @return "The digit value in the range 0 to 15"
*>
fn uint unescape_hex_value(char c) @local
{
    // OR-ing with 32 folds 'A'..'F' onto 'a'..'f'.
    return c > '9' ? (uint)((c | 32) - 'a' + 10) : (uint)(c - '0');
}

<*
 Convenience form of unescape that allocates the result with the temp allocator.

 @param s : "The quoted string to unescape"
 @param allow_unquoted : "Set to true to unescape strings not surrounded by quotes, defaults to false"
 @return "The unescaped string without quotes"
 @return? UNTERMINATED_STRING, INVALID_ESCAPE_SEQUENCE, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE
*>
fn String? String.tunescape(String s, bool allow_unquoted = false)
{
    // Forward to the allocator-taking variant using tmem.
    return s.unescape(tmem, allow_unquoted);
}

<*
 Tell whether a character must be written as an escape sequence inside
 a string literal.

 @param c : "The character to check"
 @return "True if the character needs escaping"
*>
fn bool needs_escape(char c)
{
    // Quote and backslash are printable but have meaning inside a literal.
    if (c == '"' || c == '\\') return true;
    // Everything outside printable ASCII needs escaping; this range also
    // covers \b, \f, \n, \r, \t, \v and \0, which are all below 32.
    bool printable = c >= 32 && c <= 126;
    return !printable;
}