From d46733e11adf80aad04357ccc8d7c5695434ba07 Mon Sep 17 00:00:00 2001 From: Disheng Su Date: Sun, 29 Jun 2025 11:11:11 -0700 Subject: [PATCH] Add string escaping and unescaping functionality (#2243) * Add `String.escape`, `String.unescape` for escaping and unescaping a string. --------- Co-authored-by: Christoffer Lerno --- lib/std/core/string_escape.c3 | 233 +++++++++++++++++++++++++ releasenotes.md | 1 + test/unit/stdlib/core/string_escape.c3 | 204 ++++++++++++++++++++++ 3 files changed, 438 insertions(+) create mode 100644 lib/std/core/string_escape.c3 create mode 100644 test/unit/stdlib/core/string_escape.c3 diff --git a/lib/std/core/string_escape.c3 b/lib/std/core/string_escape.c3 new file mode 100644 index 000000000..a54a36c01 --- /dev/null +++ b/lib/std/core/string_escape.c3 @@ -0,0 +1,233 @@ +// Copyright (c) 2024 Christoffer Lerno. All rights reserved. +// Use of this source code is governed by the MIT license +// a copy of which can be found in the LICENSE_STDLIB file. + +<* + This module provides functionality for escaping and unescaping strings + with standard C-style escape sequences, similar to what's used in JSON + and other string literals. +*> +module std::core::string; +import std::io; + +faultdef INVALID_ESCAPE_SEQUENCE, UNTERMINATED_STRING, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE; + +<* + Escape a string by adding quotes and converting special characters to escape sequences. 
+
+ @param allocator : "The allocator to use for the result"
+ @param s : "The string to escape"
+ @param strip_quotes : "Set to true to omit the beginning and end quotes, defaults to false"
+ @return "The escaped string, quoted unless strip_quotes is set, can safely be cast to ZString"
+*>
+fn String String.escape(String s, Allocator allocator, bool strip_quotes = false)
+{
+	// Conservative allocation: most strings need minimal escaping
+	usz initial_capacity = s.len + s.len / 5 + 2; // ~1.2x + quotes
+	DString result = dstring::new_with_capacity(allocator, initial_capacity);
+	defer result.free(); // Scratch builder only: the caller receives the copy made at return
+	if (!strip_quotes) result.append_char('"');
+
+	foreach (char c : s)
+	{
+		switch (c)
+		{
+			case '"': result.append(`\"`);
+			case '\\': result.append(`\\`);
+			case '\b': result.append(`\b`);
+			case '\f': result.append(`\f`);
+			case '\n': result.append(`\n`);
+			case '\r': result.append(`\r`);
+			case '\t': result.append(`\t`);
+			case '\v': result.append(`\v`);
+			case '\0': result.append(`\0`);
+			default:
+				if (c >= 32 && c <= 126)
+				{
+					// Printable ASCII is copied through unchanged
+					result.append_char(c);
+				}
+				else
+				{
+					// Everything else (control bytes, DEL, bytes >= 0x80) becomes \xHH
+					result.appendf("\\x%02x", (uint)c);
+				}
+		}
+	}
+
+	if (!strip_quotes) result.append_char('"');
+	return result.copy_str(allocator);
+}
+
+<*
+ Escape a string using the temp allocator.
+
+ @param s : "The string to escape"
+ @param strip_quotes : "Set to true to omit the beginning and end quotes, defaults to false"
+ @return "The escaped string, quoted unless strip_quotes is set"
+*>
+fn String String.tescape(String s, bool strip_quotes = false) => s.escape(tmem, strip_quotes);
+
+<*
+ Calculate the length needed for an escaped string (including quotes).
+
+ @param s : "The string to check"
+ @return "The length needed for the escaped version, always counting the two quotes"
+*>
+fn usz escape_len(String s)
+{
+	usz len = 2; // For quotes
+	foreach (char c : s)
+	{
+		switch (c)
+		{
+			case '"':
+			case '\\':
+			case '\b':
+			case '\f':
+			case '\n':
+			case '\r':
+			case '\t':
+			case '\v':
+			case '\0':
+				len += 2; // \X
+			default:
+				if (c >= 32 && c <= 126)
+				{
+					len += 1;
+				}
+				else
+				{
+					len += 4; // \xHH
+				}
+		}
+	}
+	return len;
+}
+
+<*
+ Unescape a quoted string by parsing escape sequences.
+
+ @param allocator : "The allocator to use for the result"
+ @param s : "The quoted string to unescape"
+ @param allow_unquoted : "Set to true to unescape strings not surrounded by quotes, defaults to false"
+ @return "The unescaped string without quotes, safe to convert to ZString"
+ @return? UNTERMINATED_STRING, INVALID_ESCAPE_SEQUENCE, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE
+*>
+fn String? String.unescape(String s, Allocator allocator, bool allow_unquoted = false)
+{
+	if (s.len >= 2 && s[0] == '"' && s[^1] == '"')
+	{
+		// Remove quotes.
+		s = s[1:^2];
+	}
+	else if (!allow_unquoted) return UNTERMINATED_STRING?;
+
+	// Handle empty string case
+	if (!s.len)
+	{
+		return "".copy(allocator);
+	}
+
+	DString result = dstring::new_with_capacity(allocator, s.len);
+	defer result.free(); // Scratch builder: released on success and on every error return below
+	usz len = s.len;
+	for (usz i = 0; i < len; i++)
+	{
+		char c = s[i];
+		if (c != '\\')
+		{
+			result.append_char(c);
+			continue;
+		}
+
+		// A backslash must be followed by at least one more character
+		if (i + 1 >= len) return INVALID_ESCAPE_SEQUENCE?;
+
+		char escape_char = s[++i];
+		switch (escape_char)
+		{
+			case '"': result.append_char('"');
+			case '\\': result.append_char('\\');
+			case '/': result.append_char('/');
+			case 'b': result.append_char('\b');
+			case 'f': result.append_char('\f');
+			case 'n': result.append_char('\n');
+			case 'r': result.append_char('\r');
+			case 't': result.append_char('\t');
+			case 'v': result.append_char('\v');
+			case '0': result.append_char('\0');
+			case 'x':
+				// Hex escape \xHH: i is on the 'x', so two more characters are required
+				if (i + 2 >= len) return INVALID_HEX_ESCAPE?;
+				char h1 = s[++i];
+				char h2 = s[++i];
+				if (!h1.is_xdigit() || !h2.is_xdigit()) return INVALID_HEX_ESCAPE?;
+				uint val = h1 > '9' ? (h1 | 32) - 'a' + 10 : h1 - '0';
+				val = val << 4;
+				val += h2 > '9' ? (h2 | 32) - 'a' + 10 : h2 - '0';
+				result.append_char((char)val);
+			case 'u':
+				// Unicode escape \uHHHH, appended as UTF-8
+				if (i + 4 >= len) return INVALID_UNICODE_ESCAPE?;
+				uint val;
+				for (int j = 0; j < 4; j++)
+				{
+					char hex_char = s[++i];
+					if (!hex_char.is_xdigit()) return INVALID_UNICODE_ESCAPE?;
+					val = val << 4 + (hex_char > '9' ? (hex_char | 32) - 'a' + 10 : hex_char - '0'); // C3: << binds tighter than +
+				}
+				result.append_char32(val);
+			case 'U':
+				// Unicode escape \UHHHHHHHH, appended as UTF-8
+				if (i + 8 >= len) return INVALID_UNICODE_ESCAPE?;
+				uint val;
+				for (int j = 0; j < 8; j++)
+				{
+					char hex_char = s[++i];
+					if (!hex_char.is_xdigit()) return INVALID_UNICODE_ESCAPE?;
+					val = val << 4 + (hex_char > '9' ? (hex_char | 32) - 'a' + 10 : hex_char - '0'); // C3: << binds tighter than +
+				}
+				result.append_char32(val);
+			default:
+				return INVALID_ESCAPE_SEQUENCE?;
+		}
+	}
+
+	return result.copy_str(allocator);
+}
+
+<*
+ Unescape a quoted string using the temp allocator.
+
+ @param s : "The quoted string to unescape"
+ @param allow_unquoted : "Set to true to unescape strings not surrounded by quotes, defaults to false"
+ @return "The unescaped string without quotes"
+ @return? UNTERMINATED_STRING, INVALID_ESCAPE_SEQUENCE, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE
+*>
+fn String? String.tunescape(String s, bool allow_unquoted = false) => s.unescape(tmem, allow_unquoted);
+
+<*
+ Check if a character needs to be escaped in a string literal.
+
+ @param c : "The character to check"
+ @return "True if the character needs escaping"
+*>
+fn bool needs_escape(char c)
+{
+	switch (c)
+	{
+		case '"':
+		case '\\':
+		case '\b':
+		case '\f':
+		case '\n':
+		case '\r':
+		case '\t':
+		case '\v':
+		case '\0':
+			return true;
+		default:
+			return c < 32 || c > 126; // Anything outside printable ASCII
+	}
+}
\ No newline at end of file
diff --git a/releasenotes.md b/releasenotes.md
index ab94dc777..2a1bba949 100644
--- a/releasenotes.md
+++ b/releasenotes.md
@@ -79,6 +79,7 @@
 - `is_array_or_slice_of_char` and `is_arrayptr_or_slice_of_char` are replaced by constant `@` variants.
 - `@pool` now has an optional `reserve` parameter, some minor changes to the temp_allocator API
 - io::struct_to_format now supports bitstructs.
+- Add `String.escape`, `String.unescape` for escaping and unescaping a string.
## 0.7.2 Change list
diff --git a/test/unit/stdlib/core/string_escape.c3 b/test/unit/stdlib/core/string_escape.c3
new file mode 100644
index 000000000..8b685e103
--- /dev/null
+++ b/test/unit/stdlib/core/string_escape.c3
@@ -0,0 +1,204 @@
+module std::core::test::string::test @test;
+
+struct EscapeTest // One table row shared by the escape, escape-content and escape_len tests
+{
+	String input; // Raw string handed to tescape() / escape_len()
+	String expected_escaped; // Expected tescape() result, with surrounding quotes
+	String expected_content_escaped; // Expected tescape(strip_quotes: true) result
+}
+
+EscapeTest[] escape_tests = {
+	// Basic strings
+	{ "hello", `"hello"`, "hello" },
+	{ "", `""`, "" },
+
+	// Special characters that need escaping
+	{ "hello\"world", `"hello\"world"`, `hello\"world` },
+	{ "path\\to\\file", `"path\\to\\file"`, `path\\to\\file` },
+	{ "line1\nline2", `"line1\nline2"`, `line1\nline2` },
+	{ "tab\there", `"tab\there"`, `tab\there` },
+	{ "carriage\rreturn", `"carriage\rreturn"`, `carriage\rreturn` },
+	{ "backspace\bchar", `"backspace\bchar"`, `backspace\bchar` },
+	{ "form\ffeed", `"form\ffeed"`, `form\ffeed` },
+	{ "vertical\vtab", `"vertical\vtab"`, `vertical\vtab` },
+	{ "null\0char", `"null\0char"`, `null\0char` },
+
+	// Non-printable characters (should use hex escapes)
+	{ "\x01\x1f\x7f", `"\x01\x1f\x7f"`, `\x01\x1f\x7f` },
+
+	// Mixed content
+	{ "Hello\nWorld\t!", `"Hello\nWorld\t!"`, `Hello\nWorld\t!` },
+	{ "Quote: \"Hello\"", `"Quote: \"Hello\""`, `Quote: \"Hello\"` },
+};
+
+struct UnescapeTest // One table row for test_unescape
+{
+	String input; // Text handed to tunescape()
+	String expected; // Expected result; only compared when no error is expected
+	fault expected_error; // Empty ({}) for success rows, otherwise the fault tunescape() must return
+}
+
+UnescapeTest[] unescape_tests = {
+	// Valid cases
+	{ `"hello"`, "hello", {} },
+	{ `""`, "", {} },
+	{ `"hello\"world"`, "hello\"world", {} },
+	{ `"path\\to\\file"`, "path\\to\\file", {} },
+	{ `"line1\nline2"`, "line1\nline2", {} },
+	{ `"tab\there"`, "tab\there", {} },
+	{ `"carriage\rreturn"`, "carriage\rreturn", {} },
+	{ `"backspace\bchar"`, "backspace\bchar", {} },
+	{ `"form\ffeed"`, "form\ffeed", {} },
+	{ `"vertical\vtab"`, "vertical\vtab", {} },
+	{ `"null\0char"`, "null\0char", {} },
+	{ `"slash\/works"`, "slash/works", {} },
+
+	// Hex escapes
+	{ `"\x41\x42\x43"`, "ABC", {} },
+	{ `"\x00\x1f\x7f"`, "\x00\x1f\x7f", {} },
+
+	// Unicode escapes
+	{ `"\u0041\u0042\u0043"`, "ABC", {} },
+	{ `"\u2603"`, "☃", {} }, // Snowman
+	{ `"\U0001F600"`, "😀", {} }, // Grinning face emoji
+
+	// Error cases
+	{ `"unterminated`, "", string::UNTERMINATED_STRING },
+	{ `unterminated"`, "", string::UNTERMINATED_STRING },
+	{ `"invalid\q"`, "", string::INVALID_ESCAPE_SEQUENCE },
+	{ `"incomplete\"`, "", string::INVALID_ESCAPE_SEQUENCE },
+	{ `"bad\x"`, "", string::INVALID_HEX_ESCAPE },
+	{ `"bad\xG1"`, "", string::INVALID_HEX_ESCAPE },
+	{ `"bad\u"`, "", string::INVALID_UNICODE_ESCAPE },
+	{ `"bad\uGGGG"`, "", string::INVALID_UNICODE_ESCAPE },
+	{ `"bad\U"`, "", string::INVALID_UNICODE_ESCAPE },
+	{ `"bad\UGGGGGGGG"`, "", string::INVALID_UNICODE_ESCAPE },
+};
+
+fn void test_escape() // tescape() with default arguments must produce the quoted form of every row
+{
+	foreach (test : escape_tests)
+	{
+		String result = test.input.tescape();
+		assert(result == test.expected_escaped,
+			"escape(%s) = %s, expected %s",
+			test.input, result, test.expected_escaped);
+	}
+}
+
+fn void test_escape_content() // tescape(strip_quotes: true) must produce the unquoted form of every row
+{
+	foreach (test : escape_tests)
+	{
+		String result = test.input.tescape(strip_quotes: true);
+		assert(result == test.expected_content_escaped,
+			"escape_content(%s) = %s, expected %s",
+			test.input, result, test.expected_content_escaped);
+	}
+}
+
+fn void test_unescape() // Runs every unescape_tests row through tunescape() and checks value or fault
+{
+	foreach (test : unescape_tests)
+	{
+		String? result = test.input.tunescape();
+
+		if (test.expected_error)
+		{
+			// Expecting an error: the call must fail, and with exactly this fault
+			if (catch err = result)
+			{
+				assert(err == test.expected_error,
+					"unescape(%s) failed with %s, expected %s",
+					test.input, err, test.expected_error);
+			}
+			else
+			{
+				assert(false, "unescape(%s) should have failed with %s",
+					test.input, test.expected_error);
+			}
+		}
+		else
+		{
+			// Expecting success: the call must return the expected text
+			if (try actual = result)
+			{
+				assert(actual == test.expected,
+					"unescape(%s) = %s, expected %s",
+					test.input, actual, test.expected);
+			}
+			else
+			{
+				assert(false, "unescape(%s) failed unexpectedly", test.input);
+			}
+		}
+	}
+}
+
+fn void test_roundtrip() // tescape() followed by tunescape() must give back the original string
+{
+	String[] test_strings = {
+		"hello world",
+		"special chars: \n\t\r\"\\",
+		"unicode: ☃ 😀",
+		"mixed: Hello\nWorld\t!",
+		"",
+		"\x00\x01\x1f\x7f",
+	};
+
+	foreach (original : test_strings)
+	{
+		String escaped = original.tescape();
+		String? unescaped = escaped.tunescape();
+
+		if (try actual = unescaped)
+		{
+			assert(actual == original,
+				"roundtrip failed for %s: got %s",
+				original, actual);
+		}
+		else
+		{
+			assert(false, "roundtrip failed for %s: couldn't unescape %s",
+				original, escaped);
+		}
+	}
+}
+
+fn void test_needs_escape() // Spot-checks needs_escape() on both sides of the printable ASCII range
+{
+	// Characters that need escaping
+	assert(string::needs_escape('"'));
+	assert(string::needs_escape('\\'));
+	assert(string::needs_escape('\n'));
+	assert(string::needs_escape('\t'));
+	assert(string::needs_escape('\r'));
+	assert(string::needs_escape('\b'));
+	assert(string::needs_escape('\f'));
+	assert(string::needs_escape('\v'));
+	assert(string::needs_escape('\0'));
+	assert(string::needs_escape('\x01'));
+	assert(string::needs_escape('\x1f'));
+	assert(string::needs_escape('\x7f'));
+
+	// Characters that don't need escaping
+	assert(!string::needs_escape('a'));
+	assert(!string::needs_escape('Z'));
+	assert(!string::needs_escape('0'));
+	assert(!string::needs_escape('9'));
+	assert(!string::needs_escape(' '));
+	assert(!string::needs_escape('!'));
+	assert(!string::needs_escape('~'));
+}
+
+fn void test_escape_len() // escape_len() must equal the length of the quoted expectation for every row
+{
+	foreach (test : escape_tests)
+	{
+		usz calculated_len = string::escape_len(test.input);
+		usz actual_len = test.expected_escaped.len;
+		assert(calculated_len == actual_len,
+			"escape_len(%s) = %d, but actual escaped length is %d",
+			test.input, calculated_len, actual_len);
+	}
+}