c3c/lib/std/core/string.c3

module std::core::string;
import std::ascii;
import std::io;

// String is a distinct inline type over char[]; it is only declared here
// when no definition of String already exists.
typedef String @if(!$defined(String)) = inline char[];
// ZString is a distinct inline type over char*. The ZString methods in this
// module scan for a terminating zero byte to find its length.
typedef ZString = inline char*;
// WString is a distinct inline type over a pointer to 16 bit units.
typedef WString = inline Char16*;

// Code unit aliases: 32 bit and 16 bit unsigned integers.
alias Char32 = uint;
alias Char16 = ushort;

// Faults declared by this module. String.to_integer in this file returns
// EMPTY_STRING, NEGATIVE_VALUE, MALFORMED_INTEGER and INTEGER_OVERFLOW.
faultdef INVALID_UTF8, INVALID_UTF16, CONVERSION_FAILED,
         EMPTY_STRING, NEGATIVE_VALUE, MALFORMED_INTEGER,
         INTEGER_OVERFLOW, MALFORMED_FLOAT, FLOAT_OUT_OF_RANGE;

// Module-private constants named after UTF-16 surrogate handling.
// NOTE(review): none of them are referenced in the visible part of this
// file; presumably they are used by the conversion code elsewhere.
const uint SURROGATE_OFFSET @private = 0x10000;
const uint SURROGATE_GENERIC_MASK @private = 0xF800;
const uint SURROGATE_MASK @private = 0xFC00;
const uint SURROGATE_CODEPOINT_MASK @private = 0x03FF;
const uint SURROGATE_BITS @private = 10;
const uint SURROGATE_LOW_VALUE @private = 0xDC00;
const uint SURROGATE_HIGH_VALUE @private = 0xD800;

<*
 Return a Char32 pointer to the data produced by the `$$wstr32` compiler
 builtin for a compile time string.
*>
macro Char32* @wstring32(String $string) @builtin
{
	// '&&' takes the address of the temporary created by the builtin.
	return (Char32*)&&$$wstr32($string);
}

<*
 Return the data produced by the `$$wstr32` compiler builtin as a Char32
 slice, without its final element.
*>
macro Char32[] @char32(String $string) @builtin
{
	// [..^2] ends at the second to last element, so the last element is dropped
	// (presumably the zero terminator - confirm against the builtin).
	return $$wstr32($string)[..^2];
}

<*
 Return a WString pointing to the data produced by the `$$wstr16` compiler
 builtin for a compile time string.
*>
macro WString @wstring(String $string) @builtin
{
	// '&&' takes the address of the temporary created by the builtin.
	return (WString)&&$$wstr16($string);
}

<*
 Return the data produced by the `$$wstr16` compiler builtin as a Char16
 slice, without its final element.
*>
macro Char16[] @char16(String $string) @builtin
{
	// [..^2] ends at the second to last element, so the last element is dropped
	// (presumably the zero terminator - confirm against the builtin).
	return $$wstr16($string)[..^2];
}

<*
 Format the arguments into a temporary DString and return it as a ZString view.

 @param [in] fmt : `The formatting string`
*>
fn ZString tformat_zstr(String fmt, args...)
{
	// Initial capacity guess: the format itself plus 8 bytes per argument.
	usz estimated_size = fmt.len + args.len * 8;
	DString buffer = dstring::temp_with_capacity(estimated_size);
	buffer.appendf(fmt, ...args);
	return buffer.zstr_view();
}

<*
 Format the arguments and return the result as a new String copied with the
 given allocator. The intermediate buffer is a temporary DString created
 inside a `@pool()` scope.

 @param [inout] allocator : `The allocator to use`
 @param [in] fmt : `The formatting string`
*>
fn String format(Allocator allocator, String fmt, args...) => @pool()
{
	// Initial capacity guess: the format itself plus 8 bytes per argument.
	usz estimated_size = fmt.len + args.len * 8;
	DString scratch = dstring::temp_with_capacity(estimated_size);
	scratch.appendf(fmt, ...args);
	return scratch.copy_str(allocator);
}

<*
 Format the arguments into a temporary DString and return a String view of it.

 @param [in] fmt : `The formatting string`
*>
fn String tformat(String fmt, args...)
{
	// Initial capacity guess: the format itself plus 8 bytes per argument.
	usz estimated_size = fmt.len + args.len * 8;
	DString buffer = dstring::temp_with_capacity(estimated_size);
	buffer.appendf(fmt, ...args);
	return buffer.str_view();
}

<*
 Test whether a character occurs in a set of characters.

 @param c : `the character to check`
 @param [in] set : `The characters that make up the set`
 @pure
 @return `True if a character is in the set`
*>
macro bool char_in_set(char c, String set)
{
	foreach (candidate : set)
	{
		if (candidate == c) return true;
	}
	return false;
}

<*
 Concatenate all strings in `s`, inserting `joiner` between consecutive
 elements. The result is copied using the given allocator.

 @param allocator : `The allocator used for the returned String`
 @param s : `The strings to join`
 @param joiner : `The text placed between two elements`
 @return `The joined string, an empty string if 's' has no elements`
*>
fn String join(Allocator allocator, String[] s, String joiner)
{
	if (!s)
	{
		return (String)allocator::new_array(allocator, char, 2)[:0];
	}

	// Capacity estimate: all parts plus one joiner per part.
	usz capacity = joiner.len * s.len;
	foreach (part : s) capacity += part.len;

	@pool()
	{
		DString builder = dstring::temp_with_capacity(capacity);
		foreach (index, part : s)
		{
			if (index > 0) builder.append(joiner);
			builder.append(part);
		}
		return builder.copy_str(allocator);
	};
}

<*
 Trim the given characters from both ends of a string.

 @param [in] string : `The string to trim`
 @param [in] to_trim : `The set of characters to trim, defaults to whitespace`
 @pure
 @return `a substring of the string passed in`
*>
fn String String.trim(string, String to_trim = "\t\n\r ")
{
	String without_prefix = string.trim_left(to_trim);
	return without_prefix.trim_right(to_trim);
}

<*
 Trim the given characters from the start of a string.

 @param [in] string : `The string to trim`
 @param [in] to_trim : `The set of characters to trim, defaults to whitespace`
 @pure
 @return `a substring of the string passed in`
*>
fn String String.trim_left(string, String to_trim = "\t\n\r ")
{
	usz total = string.len;
	usz skipped = 0;
	for (; skipped < total; skipped++)
	{
		if (!char_in_set(string[skipped], to_trim)) break;
	}
	// Everything was trimmed: return an empty slice at the start of the string.
	return skipped == total ? string[:0] : string[skipped..];
}

<*
 Trim the given characters from the end of a string.

 @param [in] string : `The string to trim`
 @param [in] to_trim : `The set of characters to trim, defaults to whitespace`
 @pure
 @return `a substring of the string passed in`
*>
fn String String.trim_right(string, String to_trim = "\t\n\r ")
{
	usz keep = string.len;
	for (; keep > 0; keep--)
	{
		if (!char_in_set(string[keep - 1], to_trim)) break;
	}
	return string[:keep];
}

<*
 Test whether the string begins with the needle. An empty needle always matches.

 @param [in] string
 @param [in] needle
 @pure
 @return `'true' if the string starts with the needle`
*>
fn bool String.starts_with(string, String needle)
{
	usz needle_len = needle.len;
	if (needle_len == 0) return true;
	if (needle_len > string.len) return false;
	String head = string[:needle_len];
	return head == needle;
}

<*
 Test whether the string finishes with the needle. An empty needle always matches.

 @param [in] string
 @param [in] needle
 @pure
 @return `'true' if the string ends with the needle`
*>
fn bool String.ends_with(string, String needle)
{
	usz needle_len = needle.len;
	if (needle_len == 0) return true;
	if (needle_len > string.len) return false;
	String tail = string[string.len - needle_len..];
	return tail == needle;
}

<*
 Remove the prefix from the string when the string starts with it,
 otherwise return the string unchanged.

 @param [in] string
 @param [in] needle
 @pure
 @return `the substring with the prefix removed`
*>
fn String String.strip(string, String needle)
{
	if (needle.len > 0 && string.starts_with(needle))
	{
		return string[needle.len..];
	}
	return string;
}

<*
 Remove the suffix from the string when the string ends with it,
 otherwise return the string unchanged.

 @param [in] string
 @param [in] needle
 @pure
 @return `the substring with the suffix removed`
*>
fn String String.strip_end(string, String needle)
{
	if (needle.len > 0 && string.ends_with(needle))
	{
		// An explicit end length also works when the result is empty.
		usz kept = string.len - needle.len;
		return string[:kept];
	}
	return string;
}

<*
 Split a string into parts, e.g "a|b|c" split with "|" yields { "a", "b", "c" }

 The returned slice is allocated with the given allocator; its elements are
 substrings of `s` and are not copied.

 @param [in] s
 @param [in] needle
 @param max : "Max number of elements, 0 means no limit, defaults to 0"
 @param skip_empty : "True to skip empty elements"
 @param [&inout] allocator : "The allocator to use for the String[]"

 @require needle.len > 0 : "The needle must be at least 1 character long"
 @ensure return.len > 0 || skip_empty : "Only skipping empty elements may yield an empty result"
*>
fn String[] String.split(s, Allocator allocator, String needle, usz max = 0, bool skip_empty = false)
{
	usz capacity = 16;
	usz i = 0;
	String* holder = allocator::alloc_array(allocator, String, capacity);
	bool no_more = false;
	while (!no_more)
	{
		// With max == 0, 'max - 1' wraps to the largest usz, so the limit never
		// triggers. Once the limit is reached the remainder becomes the last element.
		usz? index = i == max - 1 ? NOT_FOUND? : s.index_of(needle);
		String res @noinit;
		if (try index)
		{
			res = s[:index];
			s = s[index + needle.len..];
		}
		else
		{
			res = s;
			no_more = true;
		}
		// Skipped elements are not stored, so the result may end up empty.
		if (!res.len && skip_empty)
		{
			continue;
		}

		// Grow the storage by doubling when it is full.
		if (i == capacity)
		{
			capacity *= 2;
			holder = allocator::realloc(allocator, holder, String.sizeof * capacity);
		}
		holder[i++] = res;
	}
	return holder[:i];
}


<*
 Same as String.split, but the result is allocated with the temporary
 allocator.

 @param [in] s
 @param [in] needle
 @param max : "Max number of elements, 0 means no limit, defaults to 0"
 @param skip_empty : "True to skip empty elements"
*>
fn String[] String.tsplit(s, String needle, usz max = 0, bool skip_empty = false)
{
	return s.split(tmem(), needle, max, skip_empty) @inline;
}

// Returned by String.split_to_buffer when the buffer cannot hold every element.
faultdef BUFFER_EXCEEDED;

<*
 Split a string into parts, e.g "a|b|c" split with "|" yields { "a", "b", "c" }

 The parts are written into the supplied buffer and are substrings of `s`;
 nothing is allocated.

 @param [in] s
 @param [in] needle
 @param [inout] buffer
 @param max : "Max number of elements, 0 means no limit, defaults to 0"
 @param skip_empty : "True to skip empty elements"
 @require needle.len > 0 : "The needle must be at least 1 character long"
 @ensure return.len > 0 || skip_empty : "Only skipping empty elements may yield an empty result"
 @return? BUFFER_EXCEEDED : `If there are more elements than would fit the buffer`
*>
fn String[]? String.split_to_buffer(s, String needle, String[] buffer, usz max = 0, bool skip_empty = false)
{
	usz max_capacity = buffer.len;
	usz i = 0;
	bool no_more = false;
	while (!no_more)
	{
		// With max == 0, 'max - 1' wraps to the largest usz, so the limit never
		// triggers. Once the limit is reached the remainder becomes the last element.
		usz? index = i == max - 1 ? NOT_FOUND? : s.index_of(needle);
		String res @noinit;
		if (try index)
		{
			res = s[:index];
			s = s[index + needle.len..];
		}
		else
		{
			res = s;
			no_more = true;
		}
		// Skipped elements are not stored, so the result may end up empty.
		if (!res.len && skip_empty)
		{
			continue;
		}
		if (i == max_capacity)
		{
			return BUFFER_EXCEEDED?;
		}
		buffer[i++] = res;
	}
	return buffer[:i];
}

<*
 Test whether the needle occurs anywhere in the string. The search is
 delegated to String.index_of.

 @param [in] s
 @param [in] needle : "The string to look for."
 @pure
 @return "true if the string contains the substring, false otherwise"
*>
fn bool String.contains(s, String needle)
{
	usz? position = s.index_of(needle);
	return @ok(position);
}

<*
 Locate the first occurrence of a character, scanning from the start.

 @param [in] s
 @param needle : "The character to look for"
 @pure
 @ensure return < s.len
 @return "the index of the needle"
 @return? NOT_FOUND : "if the needle cannot be found"
*>
fn usz? String.index_of_char(s, char needle)
{
	for (usz i = 0; i < s.len; i++)
	{
		if (s[i] == needle) return i;
	}
	return NOT_FOUND?;
}

<*
 Locate the first position in the string that holds any one of the given
 characters.

 @param [in] s
 @param [in] needle : "The characters to look for"
 @pure
 @ensure return < s.len
 @return "the index of the needle"
 @return? NOT_FOUND : "if the needle cannot be found"
*>
fn usz? String.index_of_chars(String s, char[] needle)
{
	for (usz i = 0; i < s.len; i++)
	{
		char current = s[i];
		for (usz j = 0; j < needle.len; j++)
		{
			if (needle[j] == current) return i;
		}
	}
	return NOT_FOUND?;
}

<*
 Locate the first occurrence of a character at or after a start index.

 @param [in] s
 @param needle : "The character to look for"
 @param start_index : "The index to start with, may exceed max index."
 @pure
 @ensure return < s.len
 @return "the index of the needle"
 @return? NOT_FOUND : "if the needle cannot be found starting from the start_index"
*>
fn usz? String.index_of_char_from(s, char needle, usz start_index)
{
	// A start index past the end simply finds nothing.
	if (start_index >= s.len) return NOT_FOUND?;
	foreach (offset, c : s[start_index..])
	{
		if (c == needle) return start_index + offset;
	}
	return NOT_FOUND?;
}

<*
 Locate the last occurrence of a character by scanning from the end.

 @param [in] s
 @param needle : "the character to find"
 @pure
 @ensure return < s.len
 @return "the index of the needle"
 @return? NOT_FOUND : "if the needle cannot be found"
*>
fn usz? String.rindex_of_char(s, char needle)
{
	for (usz remaining = s.len; remaining > 0; remaining--)
	{
		usz i = remaining - 1;
		if (s[i] == needle) return i;
	}
	return NOT_FOUND?;
}

<*
 Locate the first occurrence of a substring.

 @param [in] s
 @param [in] needle
 @pure
 @ensure return < s.len
 @require needle.len > 0 : "The needle must be len 1 or more"
 @return "the index of the needle"
 @return? NOT_FOUND : "if the needle cannot be found"
*>
fn usz? String.index_of(s, String needle)
{
	usz needle_len = needle.len;
	if (needle_len == 0 || s.len < needle_len) return NOT_FOUND?;
	char head = needle[0];
	// Last index at which the whole needle still fits.
	usz last_start = s.len - needle_len;
	for (usz pos = 0; pos <= last_start; pos++)
	{
		// Compare the first character before doing the full slice comparison.
		if (s[pos] == head && s[pos:needle_len] == needle) return pos;
	}
	return NOT_FOUND?;
}

<*
 Locate the last occurrence of a substring.

 @param [in] s
 @param [in] needle
 @pure
 @ensure return < s.len
 @require needle.len > 0 : "The needle must be len 1 or more"
 @return "the index of the needle"
 @return? NOT_FOUND : "if the needle cannot be found"
*>
fn usz? String.rindex_of(s, String needle)
{
	usz needle_len = needle.len;
	if (needle_len == 0 || s.len < needle_len) return NOT_FOUND?;
	char head = needle[0];
	// Start at the last index where the whole needle fits and walk backwards.
	for (usz remaining = s.len - needle_len + 1; remaining > 0; remaining--)
	{
		usz pos = remaining - 1;
		// Compare the first character before doing the full slice comparison.
		if (s[pos] == head && s[pos:needle_len] == needle) return pos;
	}
	return NOT_FOUND?;
}

<*
 Return a String view over the bytes of the ZString, up to but not
 including the terminating zero byte. No data is copied.
*>
fn String ZString.str_view(str)
{
	usz byte_count = str.len();
	return (String)(str[:byte_count]);
}

<*
 Count the bytes before the terminating zero byte that are not of the form
 10xxxxxx. For UTF-8 data those are the continuation bytes, so the count is
 the number of code points.
*>
fn usz ZString.char_len(str)
{
	char* bytes = (char*)str;
	usz count = 0;
	for (usz i = 0; bytes[i] != 0; i++)
	{
		if (bytes[i] & 0xC0 != 0x80) count++;
	}
	return count;
}

<*
 Count the bytes that precede the terminating zero byte.
*>
fn usz ZString.len(str)
{
	char* bytes = (char*)str;
	usz count = 0;
	while (bytes[count] != 0) count++;
	return count;
}


<*
 Copy the string into newly allocated memory with a zero byte appended, and
 return it as a ZString.
*>
fn ZString String.zstr_copy(s, Allocator allocator)
{
	usz byte_count = s.len;
	// One extra byte for the zero terminator.
	char* buffer = allocator::malloc(allocator, byte_count + 1);
	mem::copy(buffer, s.ptr, byte_count);
	buffer[byte_count] = 0;
	return (ZString)buffer;
}

<*
 Return a newly allocated string holding `s1` followed by `s2`. A zero byte
 is written after the data but is not part of the returned length.
*>
fn String String.concat(s1, Allocator allocator, String s2)
{
	usz first_len = s1.len;
	usz second_len = s2.len;
	usz total = first_len + second_len;
	// One extra byte for the zero terminator.
	char* buffer = allocator::malloc(allocator, total + 1);
	mem::copy(buffer, s1.ptr, first_len);
	mem::copy(buffer + first_len, s2.ptr, second_len);
	buffer[total] = 0;
	return (String)buffer[:total];
}

<*
 Same as String.concat, using the temporary allocator.
*>
fn String String.tconcat(s1, String s2)
{
	return s1.concat(tmem(), s2);
}


<*
 Same as String.zstr_copy, using the temporary allocator.
*>
fn ZString String.zstr_tcopy(s)
{
	return s.zstr_copy(tmem()) @inline;
}

<*
 Duplicate the string using the given allocator. A zero byte sentinel is
 always written after the data, so the result can safely be converted to a
 ZString by a cast. The sentinel is not part of the returned length.
*>
fn String String.copy(s, Allocator allocator)
{
	usz byte_count = s.len;
	// One extra byte for the zero sentinel.
	char* buffer = allocator::malloc(allocator, byte_count + 1);
	mem::copy(buffer, s.ptr, byte_count);
	buffer[byte_count] = 0;
	return (String)buffer[:byte_count];
}

<*
 Free the memory of the string using the given allocator and reset the
 string to "". Does nothing when the data pointer is null.
*>
fn void String.free(&s, Allocator allocator)
{
	if (s.ptr)
	{
		allocator::free(allocator, s.ptr);
		*s = "";
	}
}

<*
 Same as String.copy, using the temporary allocator.
*>
fn String String.tcopy(s)
{
	return s.copy(tmem()) @inline;
}

<*
 Copy the contents of the ZString into a new String using the given allocator.
*>
fn String ZString.copy(z, Allocator allocator)
{
	String view = z.str_view();
	return view.copy(allocator) @inline;
}

<*
 Copy the contents of the ZString into a new String using the temporary allocator.
*>
fn String ZString.tcopy(z)
{
	String view = z.str_view();
	return view.copy(tmem()) @inline;
}

<*
 Convert an UTF-8 string to UTF-16. The result is followed by a zero unit
 that is not part of the returned slice.

 @return "The UTF-16 string as a slice, allocated using the given allocator"
 @return? INVALID_UTF8 : "If the string contained an invalid UTF-8 sequence"
*>
fn Char16[]? String.to_utf16(s, Allocator allocator)
{
	usz len16 = conv::utf16len_for_utf8(s);
	// One extra unit for the zero terminator.
	Char16* data = allocator::alloc_array_try(allocator, Char16, len16 + 1)!;
	// Release the buffer if the conversion below fails, so nothing leaks.
	defer catch allocator::free(allocator, data);
	conv::utf8to16_unsafe(s, data)!;
	data[len16] = 0;
	return data[:len16];
}

<*
 Same as String.to_utf16, using the temporary allocator.
*>
fn Char16[]? String.to_temp_utf16(s)
{
	return s.to_utf16(tmem());
}

<*
 Convert the string with String.to_utf16 and return the pointer to the
 converted data as a WString. Faults from the conversion are passed on.
*>
fn WString? String.to_wstring(s, Allocator allocator)
{
	Char16[] utf16 = s.to_utf16(allocator)!;
	return (WString)utf16.ptr;
}

<*
 Same as String.to_wstring, using the temporary allocator.
*>
fn WString? String.to_temp_wstring(s)
{
	return s.to_wstring(tmem());
}

<*
 Convert an UTF-8 string to UTF-32. The result is followed by a zero unit
 that is not part of the returned slice. Faults from the allocation or
 the conversion are passed on to the caller.

 @return "The UTF-32 string as a slice, allocated using the given allocator"
*>
fn Char32[]? String.to_utf32(s, Allocator allocator)
{
	usz codepoints = conv::utf8_codepoints(s);
	// One extra unit for the zero terminator.
	Char32* data = allocator::alloc_array_try(allocator, Char32, codepoints + 1)!;
	// Release the buffer if the conversion below fails, so nothing leaks.
	defer catch allocator::free(allocator, data);
	conv::utf8to32_unsafe(s, data)!;
	data[codepoints] = 0;
	return data[:codepoints];
}

<*
 Same as String.to_utf32, using the temporary allocator.
*>
fn Char32[]? String.to_temp_utf32(s)
{
	return s.to_utf32(tmem());
}

<*
 Replace every ASCII upper case character with its lower case counterpart.
 The string data is modified in place.

 @param [inout] s
 @pure
*>
fn void String.convert_to_lower(s)
{
	foreach (&c : s)
	{
		if (c.is_upper() @pure)
		{
			*c += 'a' - 'A';
		}
	}
}

<*
 Return a copy of the string converted to ASCII lower case, allocated with
 the given allocator.
*>
fn String String.to_lower_copy(s, Allocator allocator)
{
	String lowered = s.copy(allocator);
	lowered.convert_to_lower();
	return lowered;
}

<*
 Same as String.to_lower_copy, using the temporary allocator.
*>
fn String String.to_lower_tcopy(s)
{
	String lowered = s.to_lower_copy(tmem());
	return lowered;
}

<*
 Replace every ASCII lower case character with its upper case counterpart.
 The string data is modified in place.

 @param [inout] s
 @pure
*>
fn void String.convert_to_upper(s)
{
	foreach (&c : s)
	{
		if (c.is_lower() @pure)
		{
			*c -= 'a' - 'A';
		}
	}
}

<*
 Copy the string with the given allocator and convert the copy to ASCII
 upper case.

 @param [in] s
 @param [inout] allocator

 @return `a new String converted to ASCII upper case.`
*>
fn String String.to_upper_copy(s, Allocator allocator)
{
	String raised = s.copy(allocator);
	raised.convert_to_upper();
	return raised;
}

<*
 Create a StringIterator initialized with this string and the value 0.
*>
fn StringIterator String.iterator(s)
{
	StringIterator iter = { s, 0 };
	return iter;
}

<*
 Same as String.to_upper_copy, using the temporary allocator.

 @param [in] s
 @return `a temporary String converted to ASCII upper case.`
*>
fn String String.to_upper_tcopy(s)
{
	String raised = s.to_upper_copy(tmem());
	return raised;
}

<*
 Convert UTF-32 data to a newly allocated String. A zero byte is written
 after the data but is not part of the returned length.
*>
fn String? from_utf32(Allocator allocator, Char32[] utf32)
{
	usz utf8_len = conv::utf8len_for_utf32(utf32);
	// One extra byte for the zero terminator.
	char* buffer = allocator::malloc_try(allocator, utf8_len + 1)!;
	defer catch allocator::free(allocator, buffer);
	conv::utf32to8_unsafe(utf32, buffer);
	buffer[utf8_len] = 0;
	return (String)buffer[:utf8_len];
}

<*
 Convert UTF-16 data to a newly allocated String. A zero byte is written
 after the data but is not part of the returned length. The buffer is freed
 again if the conversion fails.
*>
fn String? from_utf16(Allocator allocator, Char16[] utf16)
{
	usz utf8_len = conv::utf8len_for_utf16(utf16);
	// One extra byte for the zero terminator.
	char* buffer = allocator::malloc_try(allocator, utf8_len + 1)!;
	defer catch allocator::free(allocator, buffer);
	conv::utf16to8_unsafe(utf16, buffer)!;
	buffer[utf8_len] = 0;
	return (String)buffer[:utf8_len];
}

<*
 Convert a zero terminated WString to a newly allocated String. The length
 is found by scanning for the first zero unit; the conversion itself is
 done by from_utf16.
*>
fn String? from_wstring(Allocator allocator, WString wstring)
{
	usz units = 0;
	while (wstring[units] != 0) units++;
	return from_utf16(allocator, wstring[:units]);
}

<*
 Same as from_wstring, using the temporary allocator.
*>
fn String? tfrom_wstring(WString wstring)
{
	return from_wstring(tmem(), wstring) @inline;
}
<*
 Same as from_utf16, using the temporary allocator.
*>
fn String? tfrom_utf16(Char16[] utf16)
{
	return from_utf16(tmem(), utf16) @inline;
}

<*
 Count the bytes of the string that are not of the form 10xxxxxx. For UTF-8
 data those are the continuation bytes, so the count is the number of code
 points.
*>
fn usz String.utf8_codepoints(s)
{
	usz count = 0;
	for (usz i = 0; i < s.len; i++)
	{
		if (s[i] & 0xC0 != 0x80) count++;
	}
	return count;
}


<*
 Parse the string as an integer of the given type.

 Leading blanks (as defined by ascii::is_blank_m) are skipped and a single
 leading '+' or '-' is accepted. When `base` is 10, a leading "0x"/"0X",
 "0b"/"0B" or "0o"/"0O" switches to base 16, 2 or 8 respectively. Every
 remaining character must be a valid digit of the base in use.

 Returns EMPTY_STRING if nothing follows the leading blanks, NEGATIVE_VALUE
 for a '-' sign on an unsigned type, MALFORMED_INTEGER if digits are missing
 or invalid, and INTEGER_OVERFLOW if the value does not fit in the type.

 @require (base <= 10 && base > 1) || base == 16 : "Unsupported base"
*>
macro String.to_integer(string, $Type, int base = 10)
{
	usz len = string.len;
	usz index = 0;
	char* ptr = string.ptr;
	while (index < len && ascii::is_blank_m(ptr[index])) index++;
	if (len == index) return EMPTY_STRING?;
	bool is_negative;
	switch (string[index])
	{
		case '-':
			if ($Type.min == 0) return NEGATIVE_VALUE?;
			is_negative = true;
			index++;
		case '+':
			index++;
		default:
			break;
	}
	if (len == index) return MALFORMED_INTEGER?;
	$Type base_used = ($Type)base;
	if (string[index] == '0' && base == 10)
	{
		index++;
		if (index == len) return ($Type)0;
		switch (string[index])
		{
			case 'x':
			case 'X':
				base_used = 16;
				index++;
			case 'b':
			case 'B':
				base_used = 2;
				index++;
			case 'o':
			case 'O':
				base_used = 8;
				index++;
			default:
				break;
		}
		if (len == index) return MALFORMED_INTEGER?;
	}
	$Type value = 0;
	while (index != len)
	{
		char c = string[index++];
		// Map the character to its digit value. Only exact digit ranges are
		// accepted, so characters between '9' and 'A' are rejected in base 16.
		switch
		{
			case c >= '0' && c <= '9':                     c -= '0';
			case base_used == 16 && c >= 'A' && c <= 'F':  c -= 'A' - 10;
			case base_used == 16 && c >= 'a' && c <= 'f':  c -= 'a' - 10;
			default:                                       return MALFORMED_INTEGER?;
		}
		if (c >= base_used) return MALFORMED_INTEGER?;
		$Type digit = ($Type)c;
		// Check the range before multiplying, so a wrapped result can never
		// be mistaken for a valid value.
		if (is_negative)
		{
			// Negative numbers are accumulated downwards so that $Type.min is reachable.
			if (value < ($Type.min + digit) / base_used) return INTEGER_OVERFLOW?;
			value = value * base_used - digit;
		}
		else
		{
			if (value > ($Type.max - digit) / base_used) return INTEGER_OVERFLOW?;
			value = value * base_used + digit;
		}
	}
	return value;
}

// Signed integer parsers; each one forwards to String.to_integer with its own result type.
fn int128? String.to_int128(s, int base = 10)
{
	return s.to_integer(int128, base);
}

fn long? String.to_long(s, int base = 10)
{
	return s.to_integer(long, base);
}

fn int? String.to_int(s, int base = 10)
{
	return s.to_integer(int, base);
}

fn short? String.to_short(s, int base = 10)
{
	return s.to_integer(short, base);
}

fn ichar? String.to_ichar(s, int base = 10)
{
	return s.to_integer(ichar, base);
}

// Unsigned integer parsers; each one forwards to String.to_integer with its own result type.
fn uint128? String.to_uint128(s, int base = 10)
{
	return s.to_integer(uint128, base);
}

fn ulong? String.to_ulong(s, int base = 10)
{
	return s.to_integer(ulong, base);
}

fn uint? String.to_uint(s, int base = 10)
{
	return s.to_integer(uint, base);
}

fn ushort? String.to_ushort(s, int base = 10)
{
	return s.to_integer(ushort, base);
}

fn char? String.to_uchar(s, int base = 10)
{
	return s.to_integer(char, base);
}

// Floating point parsers; both forward to String.to_real with their own result type.
fn double? String.to_double(s)
{
	return s.to_real(double);
}

fn float? String.to_float(s)
{
	return s.to_real(float);
}

<*
 Create a Splitter over this string that uses `split` as the separator.
 Empty elements between separators are returned by Splitter.next.
*>
fn Splitter String.splitter(self, String split)
{
	Splitter result = { .string = self, .split = split };
	return result;
}

<*
 Create a Splitter over this string that uses `split` as the separator and
 has `tokenize` set, which makes Splitter.next skip empty elements that are
 followed by a separator.
*>
fn Splitter String.tokenize(self, String split)
{
	Splitter result = { .string = self, .split = split, .tokenize = true };
	return result;
}

// Iteration state for walking through the parts of a string, see Splitter.next.
struct Splitter
{
	// The string being split.
	String string;
	// The separator searched for between elements.
	String split;
	// Offset into 'string' where the next search starts.
	usz current;
	// When true, Splitter.next skips empty elements that are followed by a separator.
	bool tokenize;
	// NOTE(review): not read or written anywhere in the visible part of this file.
	int last_index;
}

<*
 Restart the iteration by moving the current offset back to the start of
 the string. No other field is changed.
*>
fn void Splitter.reset(&self)
{
	self.current = 0;
}

<*
 Return the next element of the string being split and advance past it.
 Returns NO_MORE_ELEMENT once the current offset has reached the end of the
 string. When `tokenize` is set, empty elements that are followed by a
 separator are skipped.
*>
fn String? Splitter.next(&self)
{
	while (true)
	{
		usz total = self.string.len;
		usz start = self.current;
		if (start >= total) return NO_MORE_ELEMENT?;
		String rest = self.string[start..];
		usz? found = rest.index_of(self.split);
		if (try found)
		{
			// Continue after the separator on the next call.
			self.current = start + found + self.split.len;
			bool is_empty = found == 0;
			if (is_empty && self.tokenize) continue;
			return rest[:found];
		}
		// No separator left: the remainder is the final element.
		self.current = total;
		return rest;
	}
}

<*
 Print `x` with io::fprint into a DString backed by a 512 byte
 `@stack_mem` allocator, then return a copy of the text made with the given
 allocator. A fault from printing is force-unwrapped with '!!'.
*>
macro String from_struct(Allocator allocator, x)
{
	DString buffer;
	@stack_mem(512; Allocator scratch)
	{
		buffer.init(allocator: scratch);
		io::fprint(&buffer, x)!!;
		return buffer.copy_str(allocator);
	};
}

<*
 Same as from_struct, using the temporary allocator.
*>
macro String tfrom_struct(x)
{
	return from_struct(tmem(), x);
}