mirror of
https://github.com/c3lang/c3c.git
synced 2026-02-27 12:01:16 +00:00
Add string escaping and unescaping functionality (#2243)
* Add `String.escape`, `String.unescape` for escaping and unescaping a string. --------- Co-authored-by: Christoffer Lerno <christoffer@aegik.com>
This commit is contained in:
233
lib/std/core/string_escape.c3
Normal file
233
lib/std/core/string_escape.c3
Normal file
@@ -0,0 +1,233 @@
|
||||
// Copyright (c) 2024 Christoffer Lerno. All rights reserved.
|
||||
// Use of this source code is governed by the MIT license
|
||||
// a copy of which can be found in the LICENSE_STDLIB file.
|
||||
|
||||
<*
|
||||
This module provides functionality for escaping and unescaping strings
|
||||
with standard C-style escape sequences, similar to what's used in JSON
|
||||
and other string literals.
|
||||
*>
|
||||
module std::core::string;
|
||||
import std::io;
|
||||
|
||||
faultdef INVALID_ESCAPE_SEQUENCE, UNTERMINATED_STRING, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE;
|
||||
|
||||
<*
 Escape a string by converting special characters to C-style escape sequences,
 optionally wrapping the result in double quotes.

 Quote, backslash and the control characters \b \f \n \r \t \v \0 use their
 two-character escapes, printable ASCII (32..126) is copied verbatim and every
 other byte is written as \xHH.

 The intermediate builder is released before returning, so the returned string
 is the only allocation that outlives the call.

 @param s : "The string to escape"
 @param allocator : "The allocator to use for the result"
 @param strip_quotes : "Do not include beginning and end quotes, defaults to true (note: tescape defaults to false)"
 @return "The escaped string, surrounded by quotes unless strip_quotes is set, can safely be cast to ZString"
*>
fn String String.escape(String s, Allocator allocator, bool strip_quotes = true)
{
	// Conservative allocation: most strings need minimal escaping
	usz initial_capacity = s.len + s.len / 5 + 2; // ~1.2x + quotes
	DString result = dstring::new_with_capacity(allocator, initial_capacity);
	// copy_str() below returns an independent copy, so the builder itself must be freed.
	defer result.free();

	if (!strip_quotes) result.append_char('"');

	foreach (char c : s)
	{
		switch (c)
		{
			case '"': result.append(`\"`);
			case '\\': result.append(`\\`);
			case '\b': result.append(`\b`);
			case '\f': result.append(`\f`);
			case '\n': result.append(`\n`);
			case '\r': result.append(`\r`);
			case '\t': result.append(`\t`);
			case '\v': result.append(`\v`);
			case '\0': result.append(`\0`);
			default:
				if (c >= 32 && c <= 126)
				{
					// Printable ASCII
					result.append_char(c);
				}
				else
				{
					// Non-printable, use hex escape
					result.appendf("\\x%02x", (uint)c);
				}
		}
	}

	if (!strip_quotes) result.append_char('"');
	return result.copy_str(allocator);
}
|
||||
|
||||
<*
|
||||
Escape a string using the temp allocator.
|
||||
|
||||
@param s : "The string to escape"
|
||||
@param strip_quotes : "Do not include beginning and end quotes, defaults to false"
|
||||
@return "The escaped string with surrounding quotes"
|
||||
*>
|
||||
fn String String.tescape(String s, bool strip_quotes = false)
{
	// Same as escape(), with the result placed in the temp allocator.
	return s.escape(tmem, strip_quotes);
}
|
||||
|
||||
<*
 Calculate the length needed for an escaped string.

 By default the two bytes for the surrounding quotes are included; pass
 strip_quotes to get the length of the escaped content only.

 @param s : "The string to check"
 @param strip_quotes : "Set to true to leave out the two surrounding quotes, defaults to false"
 @return "The length needed for the escaped version"
*>
fn usz escape_len(String s, bool strip_quotes = false)
{
	usz len = strip_quotes ? 0 : 2; // For quotes
	foreach (char c : s)
	{
		switch (c)
		{
			// Empty cases fall through to the shared two-byte count.
			case '"':
			case '\\':
			case '\b':
			case '\f':
			case '\n':
			case '\r':
			case '\t':
			case '\v':
			case '\0':
				len += 2; // \X
			default:
				if (c >= 32 && c <= 126)
				{
					len += 1; // Printable ASCII is copied as is
				}
				else
				{
					len += 4; // \xHH
				}
		}
	}
	return len;
}
|
||||
|
||||
<*
 Unescape a quoted string by parsing escape sequences.

 Supported escapes are \" \\ \/ \b \f \n \r \t \v \0, \xHH (one byte),
 \uHHHH and \UHHHHHHHH (one code point, appended as UTF-8). Code points above
 U+10FFFF are rejected. The intermediate builder is released on every exit,
 including the fault returns.

 @param s : "The quoted string to unescape"
 @param allocator : "The allocator to use for the result"
 @param allow_unquoted : "Set to true to unescape strings not surrounded by quotes, defaults to false"
 @return "The unescaped string without quotes, safe to convert to ZString"
 @return? UNTERMINATED_STRING, INVALID_ESCAPE_SEQUENCE, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE
*>
fn String? String.unescape(String s, Allocator allocator, bool allow_unquoted = false)
{
	if (s.len >= 2 && s[0] == '"' && s[^1] == '"')
	{
		// Remove quotes.
		s = s[1:^2];
	}
	else if (!allow_unquoted) return UNTERMINATED_STRING?;

	// Handle empty string case
	if (!s.len)
	{
		return "".copy(allocator);
	}

	DString result = dstring::new_with_capacity(allocator, s.len);
	// copy_str() returns an independent copy; free the builder on success and on every fault.
	defer result.free();

	usz len = s.len;
	for (usz i = 0; i < len; i++)
	{
		char c = s[i];
		if (c != '\\')
		{
			result.append_char(c);
			continue;
		}

		// Handle escape sequence
		if (i + 1 >= len) return INVALID_ESCAPE_SEQUENCE?;

		char escape_char = s[++i];
		switch (escape_char)
		{
			case '"': result.append_char('"');
			case '\\': result.append_char('\\');
			case '/': result.append_char('/');
			case 'b': result.append_char('\b');
			case 'f': result.append_char('\f');
			case 'n': result.append_char('\n');
			case 'r': result.append_char('\r');
			case 't': result.append_char('\t');
			case 'v': result.append_char('\v');
			case '0': result.append_char('\0');
			case 'x':
				// Hex escape \xHH
				if (i + 2 >= len) return INVALID_HEX_ESCAPE?;
				char h1 = s[++i];
				char h2 = s[++i];
				if (!h1.is_xdigit() || !h2.is_xdigit()) return INVALID_HEX_ESCAPE?;
				// `| 32` folds 'A'..'F' onto 'a'..'f' before converting to a nibble.
				uint val = h1 > '9' ? (h1 | 32) - 'a' + 10 : h1 - '0';
				val = val << 4;
				val += h2 > '9' ? (h2 | 32) - 'a' + 10 : h2 - '0';
				result.append_char((char)val);
			case 'u':
			case 'U':
				// Unicode escape: \uHHHH (4 digits) or \UHHHHHHHH (8 digits)
				usz digits = escape_char == 'u' ? 4 : 8;
				if (i + digits >= len) return INVALID_UNICODE_ESCAPE?;
				uint val;
				for (usz j = 0; j < digits; j++)
				{
					char hex_char = s[++i];
					if (!hex_char.is_xdigit()) return INVALID_UNICODE_ESCAPE?;
					// Parenthesized explicitly: shift first, then add the new nibble.
					val = (val << 4) + (hex_char > '9' ? (hex_char | 32) - 'a' + 10 : hex_char - '0');
				}
				// Values above U+10FFFF are not Unicode code points.
				if (val > 0x10FFFF) return INVALID_UNICODE_ESCAPE?;
				result.append_char32(val);
			default:
				return INVALID_ESCAPE_SEQUENCE?;
		}
	}

	return result.copy_str(allocator);
}
|
||||
|
||||
<*
|
||||
Unescape a quoted string using the temp allocator.
|
||||
|
||||
@param s : "The quoted string to unescape"
|
||||
@param allow_unquoted : "Set to true to unescape strings not surrounded by quotes, defaults to false"
|
||||
@return "The unescaped string without quotes"
|
||||
@return? UNTERMINATED_STRING, INVALID_ESCAPE_SEQUENCE, INVALID_HEX_ESCAPE, INVALID_UNICODE_ESCAPE
|
||||
*>
|
||||
fn String? String.tunescape(String s, bool allow_unquoted = false)
{
	// Same as unescape(), with the result placed in the temp allocator.
	return s.unescape(tmem, allow_unquoted);
}
|
||||
|
||||
<*
|
||||
Check if a character needs to be escaped in a string literal.
|
||||
|
||||
@param c : "The character to check"
|
||||
@return "True if the character needs escaping"
|
||||
*>
|
||||
fn bool needs_escape(char c)
{
	// Everything outside printable ASCII (32..126) needs escaping; this range
	// check also covers \b, \f, \n, \r, \t, \v and \0, which are all below 32.
	if (c < 32 || c > 126) return true;
	// Among printable characters only the quote and the backslash are escaped.
	return c == '"' || c == '\\';
}
|
||||
@@ -79,6 +79,7 @@
|
||||
- `is_array_or_slice_of_char` and `is_arrayptr_or_slice_of_char` are replaced by constant `@` variants.
|
||||
- `@pool` now has an optional `reserve` parameter, some minor changes to the temp_allocator API
|
||||
- io::struct_to_format now supports bitstructs.
|
||||
- Add `String.escape`, `String.unescape` for escaping and unescaping a string.
|
||||
|
||||
## 0.7.2 Change list
|
||||
|
||||
|
||||
204
test/unit/stdlib/core/string_escape.c3
Normal file
204
test/unit/stdlib/core/string_escape.c3
Normal file
@@ -0,0 +1,204 @@
|
||||
module std::core::test::string::test @test;
|
||||
|
||||
// One escaping fixture: a raw input and the escaped text expected from tescape().
struct EscapeTest
|
||||
{
|
||||
// Raw, unescaped text passed to tescape() and escape_len().
String input;
|
||||
// Expected tescape() result with the surrounding quotes (default arguments).
String expected_escaped;
|
||||
// Expected tescape(strip_quotes: true) result, i.e. without the quotes.
String expected_content_escaped;
|
||||
}
|
||||
|
||||
// Fixtures shared by test_escape, test_escape_content and test_escape_len.
EscapeTest[] escape_tests = {
	// Basic strings
	{ .input = "hello", .expected_escaped = `"hello"`, .expected_content_escaped = "hello" },
	{ .input = "", .expected_escaped = `""`, .expected_content_escaped = "" },

	// Special characters that need escaping
	{ .input = "hello\"world", .expected_escaped = `"hello\"world"`, .expected_content_escaped = `hello\"world` },
	{ .input = "path\\to\\file", .expected_escaped = `"path\\to\\file"`, .expected_content_escaped = `path\\to\\file` },
	{ .input = "line1\nline2", .expected_escaped = `"line1\nline2"`, .expected_content_escaped = `line1\nline2` },
	{ .input = "tab\there", .expected_escaped = `"tab\there"`, .expected_content_escaped = `tab\there` },
	{ .input = "carriage\rreturn", .expected_escaped = `"carriage\rreturn"`, .expected_content_escaped = `carriage\rreturn` },
	{ .input = "backspace\bchar", .expected_escaped = `"backspace\bchar"`, .expected_content_escaped = `backspace\bchar` },
	{ .input = "form\ffeed", .expected_escaped = `"form\ffeed"`, .expected_content_escaped = `form\ffeed` },
	{ .input = "vertical\vtab", .expected_escaped = `"vertical\vtab"`, .expected_content_escaped = `vertical\vtab` },
	{ .input = "null\0char", .expected_escaped = `"null\0char"`, .expected_content_escaped = `null\0char` },

	// Non-printable characters (should use hex escapes)
	{ .input = "\x01\x1f\x7f", .expected_escaped = `"\x01\x1f\x7f"`, .expected_content_escaped = `\x01\x1f\x7f` },

	// Mixed content
	{ .input = "Hello\nWorld\t!", .expected_escaped = `"Hello\nWorld\t!"`, .expected_content_escaped = `Hello\nWorld\t!` },
	{ .input = "Quote: \"Hello\"", .expected_escaped = `"Quote: \"Hello\""`, .expected_content_escaped = `Quote: \"Hello\"` },
};
|
||||
|
||||
// One unescaping fixture: an escaped input and either the expected text or the expected fault.
struct UnescapeTest
|
||||
{
|
||||
// Escaped text passed to tunescape().
String input;
|
||||
// Expected unescaped result; only compared when expected_error is unset.
String expected;
|
||||
// Fault tunescape() is expected to return; left empty when the call should succeed.
fault expected_error;
|
||||
}
|
||||
|
||||
// Fixtures for test_unescape. Entries without .expected_error must succeed.
UnescapeTest[] unescape_tests = {
	// Valid cases
	{ .input = `"hello"`, .expected = "hello" },
	{ .input = `""`, .expected = "" },
	{ .input = `"hello\"world"`, .expected = "hello\"world" },
	{ .input = `"path\\to\\file"`, .expected = "path\\to\\file" },
	{ .input = `"line1\nline2"`, .expected = "line1\nline2" },
	{ .input = `"tab\there"`, .expected = "tab\there" },
	{ .input = `"carriage\rreturn"`, .expected = "carriage\rreturn" },
	{ .input = `"backspace\bchar"`, .expected = "backspace\bchar" },
	{ .input = `"form\ffeed"`, .expected = "form\ffeed" },
	{ .input = `"vertical\vtab"`, .expected = "vertical\vtab" },
	{ .input = `"null\0char"`, .expected = "null\0char" },
	{ .input = `"slash\/works"`, .expected = "slash/works" },

	// Hex escapes
	{ .input = `"\x41\x42\x43"`, .expected = "ABC" },
	{ .input = `"\x00\x1f\x7f"`, .expected = "\x00\x1f\x7f" },

	// Unicode escapes
	{ .input = `"\u0041\u0042\u0043"`, .expected = "ABC" },
	{ .input = `"\u2603"`, .expected = "☃" }, // Snowman
	{ .input = `"\U0001F600"`, .expected = "😀" }, // Grinning face emoji

	// Error cases
	{ .input = `"unterminated`, .expected = "", .expected_error = string::UNTERMINATED_STRING },
	{ .input = `unterminated"`, .expected = "", .expected_error = string::UNTERMINATED_STRING },
	{ .input = `"invalid\q"`, .expected = "", .expected_error = string::INVALID_ESCAPE_SEQUENCE },
	{ .input = `"incomplete\"`, .expected = "", .expected_error = string::INVALID_ESCAPE_SEQUENCE },
	{ .input = `"bad\x"`, .expected = "", .expected_error = string::INVALID_HEX_ESCAPE },
	{ .input = `"bad\xG1"`, .expected = "", .expected_error = string::INVALID_HEX_ESCAPE },
	{ .input = `"bad\u"`, .expected = "", .expected_error = string::INVALID_UNICODE_ESCAPE },
	{ .input = `"bad\uGGGG"`, .expected = "", .expected_error = string::INVALID_UNICODE_ESCAPE },
	{ .input = `"bad\U"`, .expected = "", .expected_error = string::INVALID_UNICODE_ESCAPE },
	{ .input = `"bad\UGGGGGGGG"`, .expected = "", .expected_error = string::INVALID_UNICODE_ESCAPE },
};
|
||||
|
||||
// Checks that tescape() with default arguments yields the quoted form for every fixture.
fn void test_escape()
{
	for (usz i = 0; i < escape_tests.len; i++)
	{
		EscapeTest fixture = escape_tests[i];
		String quoted = fixture.input.tescape();
		assert(quoted == fixture.expected_escaped,
			"escape(%s) = %s, expected %s",
			fixture.input, quoted, fixture.expected_escaped);
	}
}
|
||||
|
||||
// Checks that tescape(strip_quotes: true) yields the unquoted form for every fixture.
fn void test_escape_content()
{
	for (usz i = 0; i < escape_tests.len; i++)
	{
		EscapeTest fixture = escape_tests[i];
		String content = fixture.input.tescape(strip_quotes: true);
		assert(content == fixture.expected_content_escaped,
			"escape_content(%s) = %s, expected %s",
			fixture.input, content, fixture.expected_content_escaped);
	}
}
|
||||
|
||||
// Runs tunescape() on every fixture and checks the expected text or the expected fault.
fn void test_unescape()
{
	foreach (fixture : unescape_tests)
	{
		String? outcome = fixture.input.tunescape();

		if (!fixture.expected_error)
		{
			// Expecting success
			if (try actual = outcome)
			{
				assert(actual == fixture.expected,
					"unescape(%s) = %s, expected %s",
					fixture.input, actual, fixture.expected);
			}
			else
			{
				assert(false, "unescape(%s) failed unexpectedly", fixture.input);
			}
			continue;
		}

		// Expecting an error
		if (catch err = outcome)
		{
			assert(err == fixture.expected_error,
				"unescape(%s) failed with %s, expected %s",
				fixture.input, err, fixture.expected_error);
		}
		else
		{
			assert(false, "unescape(%s) should have failed with %s",
				fixture.input, fixture.expected_error);
		}
	}
}
|
||||
|
||||
// Checks that tunescape(tescape(x)) gives back x for a handful of sample strings.
fn void test_roundtrip()
{
	String[] samples = {
		"hello world",
		"special chars: \n\t\r\"\\",
		"unicode: ☃ 😀",
		"mixed: Hello\nWorld\t!",
		"",
		"\x00\x01\x1f\x7f",
	};

	for (usz i = 0; i < samples.len; i++)
	{
		String original = samples[i];
		String escaped = original.tescape();
		String? restored = escaped.tunescape();

		if (try actual = restored)
		{
			assert(actual == original,
				"roundtrip failed for %s: got %s",
				original, actual);
		}
		else
		{
			assert(false, "roundtrip failed for %s: couldn't unescape %s",
				original, escaped);
		}
	}
}
|
||||
|
||||
// Checks needs_escape() against characters that must and must not be escaped.
fn void test_needs_escape()
{
	// Characters that need escaping
	char[] must_escape = { '"', '\\', '\n', '\t', '\r', '\b', '\f', '\v', '\0', '\x01', '\x1f', '\x7f' };
	foreach (c : must_escape)
	{
		assert(string::needs_escape(c));
	}

	// Characters that don't need escaping
	char[] plain = { 'a', 'Z', '0', '9', ' ', '!', '~' };
	foreach (c : plain)
	{
		assert(!string::needs_escape(c));
	}
}
|
||||
|
||||
// Checks that escape_len() matches the length of the expected quoted output of every fixture.
fn void test_escape_len()
{
	for (usz i = 0; i < escape_tests.len; i++)
	{
		EscapeTest fixture = escape_tests[i];
		usz predicted = string::escape_len(fixture.input);
		usz expected_len = fixture.expected_escaped.len;
		assert(predicted == expected_len,
			"escape_len(%s) = %d, but actual escaped length is %d",
			fixture.input, predicted, expected_len);
	}
}
|
||||
Reference in New Issue
Block a user