Add string escaping and unescaping functionality (#2243)

* Add `String.escape`, `String.unescape` for escaping and unescaping a string. --------- Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
2026-02-27 12:01:16 +00:00 · 2025-06-29 11:11:11 -07:00
parent ce569462f6
commit d46733e11a
3 changed files with 438 additions and 0 deletions
--- a/lib/std/core/string_escape.c3
+++ b/lib/std/core/string_escape.c3
@@ -0,0 +1,233 @@
+// Copyright (c) 2024 Christoffer Lerno. All rights reserved.
+// Use of this source code is governed by the MIT license
+// a copy of which can be found in the LICENSE_STDLIB file.
+
+<*
+ This module provides functionality for escaping and unescaping strings
+ with standard C-style escape sequences, similar to what's used in JSON
+ and other string literals.
+*>
+module std::core::string;
+import std::io;
+
+faultdef INVALID_ESCAPE_SEQUENCE, UNTERMINATED_STRING, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE; // Faults returned by String.unescape
+
+<*
+ Escape a string by converting special characters to escape sequences, optionally adding quotes.
+
+ @param allocator : "The allocator to use for the result"
+ @param s : "The string to escape"
+ @param strip_quotes : "Do not include beginning and end quotes, defaults to true here, whereas tescape defaults to false"
+ @return "The escaped string, quoted unless strip_quotes is true, can safely be cast to ZString"
+*>
+fn String String.escape(String s, Allocator allocator, bool strip_quotes = true)
+{
+    // Conservative allocation: most strings need minimal escaping
+    usz initial_capacity = s.len + s.len / 5 + 2; // ~1.2x + quotes
+    DString result = dstring::new_with_capacity(allocator, initial_capacity);
+    defer result.free(); // copy_str hands back a separate copy, so the builder must not outlive this call
+
+    if (!strip_quotes) result.append_char('"');
+
+    foreach (char c : s)
+    {
+        switch (c)
+        {
+            case '"':  result.append(`\"`);
+            case '\\': result.append(`\\`);
+            case '\b': result.append(`\b`);
+            case '\f': result.append(`\f`);
+            case '\n': result.append(`\n`);
+            case '\r': result.append(`\r`);
+            case '\t': result.append(`\t`);
+            case '\v': result.append(`\v`);
+            case '\0': result.append(`\0`);
+            default:
+                // Printable ASCII is copied verbatim, any other byte becomes a \xHH hex escape.
+                if (c >= 32 && c <= 126)
+                {
+                    result.append_char(c);
+                }
+                else
+                {
+                    result.appendf("\\x%02x", (uint)c);
+                }
+        }
+    }
+
+    if (!strip_quotes) result.append_char('"');
+    return result.copy_str(allocator);
+}
+
+<*
+ Convenience form of String.escape that allocates with the temp allocator.
+
+ @param s : "The string to escape"
+ @param strip_quotes : "When true the beginning and end quotes are left out, defaults to false"
+ @return "The escaped string, quoted unless strip_quotes is true"
+*>
+fn String String.tescape(String s, bool strip_quotes = false) { return s.escape(tmem, strip_quotes); }
+
+<*
+ Compute how many bytes the quoted, escaped form of a string occupies.
+
+ @param s : "The string to measure"
+ @return "The escaped length, with two added for the surrounding quotes"
+*>
+fn usz escape_len(String s)
+{
+    usz total = 2; // Opening and closing quote
+    foreach (char ch : s)
+    {
+        // These characters all have a two-character escape of the form \X.
+        bool short_escape = ch == '"'
+            || ch == '\\'
+            || ch == '\b'
+            || ch == '\f'
+            || ch == '\n'
+            || ch == '\r'
+            || ch == '\t'
+            || ch == '\v'
+            || ch == '\0';
+        if (short_escape)
+        {
+            total += 2;
+            continue;
+        }
+        // Printable ASCII takes one byte, anything else is written as \xHH.
+        if (ch < 32 || ch > 126)
+        {
+            total += 4;
+            continue;
+        }
+        total += 1;
+    }
+    return total;
+}
+
+<*
+ Unescape a quoted string by parsing escape sequences.
+
+ Hex escapes yield one byte each, while the 4 and 8 digit unicode escapes are
+ appended as a code point. Code points above U+10FFFF are rejected with
+ INVALID_UNICODE_ESCAPE.
+
+ @param allocator : "The allocator to use for the result"
+ @param s : "The quoted string to unescape"
+ @param allow_unquoted : "Set to true to unescape strings not surrounded by quotes, defaults to false"
+ @return "The unescaped string without quotes, safe to convert to ZString"
+ @return? UNTERMINATED_STRING, INVALID_ESCAPE_SEQUENCE, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE
+*>
+fn String? String.unescape(String s, Allocator allocator, bool allow_unquoted = false)
+{
+    if (s.len >= 2 && s[0] == '"' && s[^1] == '"')
+    {
+        // Remove quotes.
+        s = s[1:^2];
+    }
+    else if (!allow_unquoted) return UNTERMINATED_STRING?;
+
+    // Handle empty string case
+    if (!s.len)
+    {
+        return "".copy(allocator);
+    }
+
+    DString result = dstring::new_with_capacity(allocator, s.len);
+    // The caller receives a copy, so release the builder on success and on every fault.
+    defer result.free();
+
+    usz len = s.len;
+    for (usz i = 0; i < len; i++)
+    {
+        char c = s[i];
+        if (c != '\\')
+        {
+            result.append_char(c);
+            continue;
+        }
+
+        // Handle escape sequence
+        if (i + 1 >= len) return INVALID_ESCAPE_SEQUENCE?;
+
+        char escape_char = s[++i];
+        switch (escape_char)
+        {
+            // Single character escapes
+            case '"':  result.append_char('"');
+            case '\\': result.append_char('\\');
+            case '/':  result.append_char('/');
+            case 'b':  result.append_char('\b');
+            case 'f':  result.append_char('\f');
+            case 'n':  result.append_char('\n');
+            case 'r':  result.append_char('\r');
+            case 't':  result.append_char('\t');
+            case 'v':  result.append_char('\v');
+            case '0':  result.append_char('\0');
+            case 'x':
+                // Hex escape \xHH
+                if (i + 2 >= len) return INVALID_HEX_ESCAPE?;
+                char h1 = s[++i];
+                char h2 = s[++i];
+                if (!h1.is_xdigit() || !h2.is_xdigit()) return INVALID_HEX_ESCAPE?;
+                uint val = h1 > '9' ? (h1 | 32) - 'a' + 10 : h1 - '0';
+                val = val << 4;
+                val += h2 > '9' ? (h2 | 32) - 'a' + 10 : h2 - '0';
+                result.append_char((char)val);
+            case 'u':
+            case 'U':
+                // Unicode escape: \uHHHH takes 4 hex digits, \UHHHHHHHH takes 8.
+                usz digits = escape_char == 'u' ? 4 : 8;
+                if (i + digits >= len) return INVALID_UNICODE_ESCAPE?;
+                uint val;
+                for (usz j = 0; j < digits; j++)
+                {
+                    char hex_char = s[++i];
+                    if (!hex_char.is_xdigit()) return INVALID_UNICODE_ESCAPE?;
+                    val = (val << 4) + (hex_char > '9' ? (hex_char | 32) - 'a' + 10 : hex_char - '0');
+                }
+                // Values above U+10FFFF are not code points and cannot be encoded.
+                if (val > 0x10FFFF) return INVALID_UNICODE_ESCAPE?;
+                result.append_char32(val);
+            default:
+                return INVALID_ESCAPE_SEQUENCE?;
+        }
+    }
+
+    return result.copy_str(allocator);
+}
+
+<*
+ Convenience form of String.unescape that allocates with the temp allocator.
+
+ @param s : "The quoted string to unescape"
+ @param allow_unquoted : "When true, input not surrounded by quotes is accepted as well, defaults to false"
+ @return "The unescaped string without quotes"
+ @return? UNTERMINATED_STRING, INVALID_ESCAPE_SEQUENCE, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE
+*>
+fn String? String.tunescape(String s, bool allow_unquoted = false) { return s.unescape(tmem, allow_unquoted); }
+
+<*
+ Tell whether a byte has to be written as an escape sequence inside a string literal.
+
+ @param c : "The character to check"
+ @return "True if the character needs escaping"
+*>
+fn bool needs_escape(char c)
+{
+    // Bytes outside printable ASCII are always escaped. This covers
+    // \b, \f, \n, \r, \t, \v and \0 as well, since they are all below 32.
+    bool outside_printable = c < 32 || c > 126;
+    if (outside_printable)
+    {
+        return true;
+    }
+
+    // Inside the printable range only the double quote and the
+    // backslash have to be escaped.
+    if (c == '"' || c == '\\')
+    {
+        return true;
+    }
+    return false;
+}
--- a/releasenotes.md
+++ b/releasenotes.md
@@ -79,6 +79,7 @@
 - `is_array_or_slice_of_char` and `is_arrayptr_or_slice_of_char` are replaced by constant `@` variants.
 - `@pool` now has an optional `reserve` parameter, some minor changes to the temp_allocator API
 - io::struct_to_format now supports bitstructs.
+- Add `String.escape`, `String.unescape` for escaping and unescaping a string.

 ## 0.7.2 Change list

--- a/test/unit/stdlib/core/string_escape.c3
+++ b/test/unit/stdlib/core/string_escape.c3
@@ -0,0 +1,204 @@
+module std::core::test::string::test @test;
+
+struct EscapeTest
+{
+    String input; // Raw string passed to tescape and escape_len
+    String expected_escaped; // Expected result of tescape() with default arguments (quoted)
+    String expected_content_escaped; // Expected result of tescape(strip_quotes: true)
+}
+
+EscapeTest[] escape_tests = { // Each entry: input, expected quoted result, expected unquoted result
+    // Plain text and the empty string need no escaping
+    { "hello", `"hello"`, "hello" },
+    { "", `""`, "" },
+
+    // Characters that have a dedicated backslash escape
+    { "hello\"world", `"hello\"world"`, `hello\"world` },
+    { "path\\to\\file", `"path\\to\\file"`, `path\\to\\file` },
+    { "line1\nline2", `"line1\nline2"`, `line1\nline2` },
+    { "tab\there", `"tab\there"`, `tab\there` },
+    { "carriage\rreturn", `"carriage\rreturn"`, `carriage\rreturn` },
+    { "backspace\bchar", `"backspace\bchar"`, `backspace\bchar` },
+    { "form\ffeed", `"form\ffeed"`, `form\ffeed` },
+    { "vertical\vtab", `"vertical\vtab"`, `vertical\vtab` },
+    { "null\0char", `"null\0char"`, `null\0char` },
+
+    // Non-printable characters without a dedicated escape (should use hex escapes)
+    { "\x01\x1f\x7f", `"\x01\x1f\x7f"`, `\x01\x1f\x7f` },
+
+    // Several kinds of escape within one string
+    { "Hello\nWorld\t!", `"Hello\nWorld\t!"`, `Hello\nWorld\t!` },
+    { "Quote: \"Hello\"", `"Quote: \"Hello\""`, `Quote: \"Hello\"` },
+};
+
+struct UnescapeTest
+{
+    String input; // Escaped text passed to tunescape
+    String expected; // Expected result, only compared when no fault is expected
+    fault expected_error; // Fault the call must return, left empty for success cases
+}
+
+UnescapeTest[] unescape_tests = { // Each entry: input, expected text, expected fault ({} for none)
+    // Plain text and single character escapes that must succeed
+    { `"hello"`, "hello", {} },
+    { `""`, "", {} },
+    { `"hello\"world"`, "hello\"world", {} },
+    { `"path\\to\\file"`, "path\\to\\file", {} },
+    { `"line1\nline2"`, "line1\nline2", {} },
+    { `"tab\there"`, "tab\there", {} },
+    { `"carriage\rreturn"`, "carriage\rreturn", {} },
+    { `"backspace\bchar"`, "backspace\bchar", {} },
+    { `"form\ffeed"`, "form\ffeed", {} },
+    { `"vertical\vtab"`, "vertical\vtab", {} },
+    { `"null\0char"`, "null\0char", {} },
+    { `"slash\/works"`, "slash/works", {} },
+
+    // Hex escapes, two hex digits producing one byte each
+    { `"\x41\x42\x43"`, "ABC", {} },
+    { `"\x00\x1f\x7f"`, "\x00\x1f\x7f", {} },
+
+    // Unicode escapes, four digits after \u and eight after \U
+    { `"\u0041\u0042\u0043"`, "ABC", {} },
+    { `"\u2603"`, "☃", {} }, // Snowman
+    { `"\U0001F600"`, "😀", {} }, // Grinning face emoji
+
+    // Malformed input, each entry names the fault that must be returned
+    { `"unterminated`, "", string::UNTERMINATED_STRING }, // Closing quote missing
+    { `unterminated"`, "", string::UNTERMINATED_STRING }, // Opening quote missing
+    { `"invalid\q"`, "", string::INVALID_ESCAPE_SEQUENCE }, // q is not a known escape
+    { `"incomplete\"`, "", string::INVALID_ESCAPE_SEQUENCE }, // Backslash is the last character inside the quotes
+    { `"bad\x"`, "", string::INVALID_HEX_ESCAPE }, // No hex digits follow
+    { `"bad\xG1"`, "", string::INVALID_HEX_ESCAPE }, // G is not a hex digit
+    { `"bad\u"`, "", string::INVALID_UNICODE_ESCAPE }, // No hex digits follow
+    { `"bad\uGGGG"`, "", string::INVALID_UNICODE_ESCAPE }, // G is not a hex digit
+    { `"bad\U"`, "", string::INVALID_UNICODE_ESCAPE }, // No hex digits follow
+    { `"bad\UGGGGGGGG"`, "", string::INVALID_UNICODE_ESCAPE }, // G is not a hex digit
+};
+
+fn void test_escape()
+{
+    // tescape() with default arguments keeps the surrounding quotes.
+    foreach (entry : escape_tests)
+    {
+        String want = entry.expected_escaped;
+        String got = entry.input.tescape();
+        assert(got == want, "escape(%s) = %s, expected %s", entry.input, got, want);
+    }
+}
+
+fn void test_escape_content()
+{
+    // With strip_quotes set only the escaped content is produced.
+    foreach (entry : escape_tests)
+    {
+        String want = entry.expected_content_escaped;
+        String got = entry.input.tescape(strip_quotes: true);
+        assert(got == want, "escape_content(%s) = %s, expected %s", entry.input, got, want);
+    }
+}
+
+fn void test_unescape()
+{
+    // Each table entry names either the expected text or the expected fault.
+    foreach (entry : unescape_tests)
+    {
+        String? outcome = entry.input.tunescape();
+
+        if (entry.expected_error)
+        {
+            // A fault is expected: the call must fail with exactly that fault.
+            if (catch failure = outcome)
+            {
+                assert(failure == entry.expected_error,
+                       "unescape(%s) failed with %s, expected %s",
+                       entry.input, failure, entry.expected_error);
+            }
+            else
+            {
+                assert(false, "unescape(%s) should have failed with %s",
+                       entry.input, entry.expected_error);
+            }
+            continue;
+        }
+
+        // No fault expected: the call must succeed with the expected text.
+        if (try text = outcome)
+        {
+            assert(text == entry.expected,
+                   "unescape(%s) = %s, expected %s",
+                   entry.input, text, entry.expected);
+        }
+        else
+        {
+            assert(false, "unescape(%s) failed unexpectedly", entry.input);
+        }
+    }
+}
+
+fn void test_roundtrip()
+{
+    // Escaping and then unescaping has to reproduce the original string.
+    String[] samples = {
+        "hello world",
+        "special chars: \n\t\r\"\\",
+        "unicode: ☃ 😀",
+        "mixed: Hello\nWorld\t!",
+        "",
+        "\x00\x01\x1f\x7f",
+    };
+
+    foreach (sample : samples)
+    {
+        String quoted = sample.tescape();
+        String? restored = quoted.tunescape();
+
+        if (try text = restored)
+        {
+            assert(text == sample,
+                   "roundtrip failed for %s: got %s",
+                   sample, text);
+            continue;
+        }
+
+        assert(false, "roundtrip failed for %s: couldn't unescape %s",
+               sample, quoted);
+    }
+}
+
+fn void test_needs_escape()
+{
+    // Characters that need escaping
+    char[] escaped = {
+        '"', '\\', '\0',
+        '\n', '\t', '\r',
+        '\b', '\f', '\v',
+        '\x01', '\x1f', '\x7f',
+    };
+    foreach (c : escaped)
+    {
+        assert(string::needs_escape(c));
+    }
+
+    // Characters that don't need escaping
+    char[] plain = {
+        'a', 'Z',
+        '0', '9',
+        ' ', '!', '~',
+    };
+    foreach (c : plain)
+    {
+        assert(!string::needs_escape(c));
+    }
+}
+
+fn void test_escape_len()
+{
+    // escape_len has to agree with the length of the quoted escape result.
+    foreach (entry : escape_tests)
+    {
+        usz want = entry.expected_escaped.len;
+        usz got = string::escape_len(entry.input);
+        assert(got == want, "escape_len(%s) = %d, but actual escaped length is %d",
+               entry.input, got, want);
+    }
+}