Allow [in] to be used on subarray types. Added more to "conv" module.

2026-02-27 12:01:16 +00:00 · 2022-07-09 19:32:39 +02:00
parent 9fdd66af42
commit ca21b1daac
4 changed files with 435 additions and 143 deletions
--- a/lib/std/core/conv.c3
+++ b/lib/std/core/conv.c3
@@ -1,2 +1,406 @@
 module std::core::string::conv;

+private const uint UTF16_SURROGATE_OFFSET = 0x10000; // first code point that no longer fits in a single UTF-16 unit
+private const uint UTF16_SURROGATE_GENERIC_MASK = 0xF800; // keeps the top five bits of a unit
+private const uint UTF16_SURROGATE_GENERIC_VALUE = 0xD800; // value of those bits for any surrogate, high or low
+private const uint UTF16_SURROGATE_MASK = 0xFC00; // keeps the top six bits, which tell a high surrogate from a low one
+private const uint UTF16_SURROGATE_CODEPOINT_MASK = 0x03FF; // the payload bits carried by each surrogate
+private const uint UTF16_SURROGATE_BITS = 10; // number of payload bits per surrogate
+private const uint UTF16_SURROGATE_LOW_VALUE = 0xDC00; // tag bits of a low (second) surrogate
+private const uint UTF16_SURROGATE_HIGH_VALUE = 0xD800; // tag bits of a high (first) surrogate
+
+/**
+ * Encode one code point as UTF-8, never writing more bytes than the caller made available.
+ * @param c `The utf32 codepoint to convert`
+ * @param [out] output `the resulting buffer`
+ * @param [inout] size `in: the bytes available in output, out: the bytes written (1-4), untouched on failure`
+ **/
+fn void! char32_to_utf8(Char32 c, char* output, usize *size)
+{
+	if (*size == 0) return UnicodeResult.CONVERSION_FAILED!;
+	switch (true)
+	{
+		case c <= 0x7f: // up to 7 bits, 0x7f included: a single byte
+			output[0] = (char)c;
+			*size = 1;
+		case c <= 0x7ff: // up to 11 bits, 0x7ff included: 110xxxxx 10xxxxxx
+			if (*size < 2) return UnicodeResult.CONVERSION_FAILED!;
+			output[0] = (char)(0xC0 | (c >> 6));
+			output[1] = (char)(0x80 | (c & 0x3F));
+			*size = 2;
+		case c <= 0xffff: // up to 16 bits, 0xffff included: 1110xxxx 10xxxxxx 10xxxxxx
+			if (*size < 3) return UnicodeResult.CONVERSION_FAILED!;
+			output[0] = (char)(0xE0 | (c >> 12));
+			output[1] = (char)(0x80 | ((c >> 6) & 0x3F));
+			output[2] = (char)(0x80 | (c & 0x3F));
+			*size = 3;
+		default: // everything above: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+			if (*size < 4) return UnicodeResult.CONVERSION_FAILED!;
+			output[0] = (char)(0xF0 | (c >> 18));
+			output[1] = (char)(0x80 | ((c >> 12) & 0x3F));
+			output[2] = (char)(0x80 | ((c >> 6) & 0x3F));
+			output[3] = (char)(0x80 | (c & 0x3F));
+			*size = 4;
+	}
+}
+
+/**
+ * Encode one code point as UTF-16: a single unit below 0x10000, otherwise a high and a low surrogate.
+ * No bounds checking is done on the output buffer.
+ *
+ * @param c `The code point to convert.`
+ * @param [inout] output `the write position in the UTF16 buffer, advanced by the 1-2 units written.`
+ **/
+fn void char32_to_utf16_unsafe(Char32 c, Char16** output)
+{
+	if (c >= UTF16_SURROGATE_OFFSET)
+	{
+		// The value above the offset is split into two 10 bit halves; the upper half is written first.
+		Char32 rest = c - UTF16_SURROGATE_OFFSET;
+		(*output)++[0] = (Char16)(UTF16_SURROGATE_HIGH_VALUE | ((rest >> UTF16_SURROGATE_BITS) & UTF16_SURROGATE_CODEPOINT_MASK));
+		(*output)++[0] = (Char16)(UTF16_SURROGATE_LOW_VALUE | (rest & UTF16_SURROGATE_CODEPOINT_MASK));
+		return;
+	}
+	// Anything smaller is stored as is.
+	(*output)++[0] = (Char16)c;
+}
+
+/**
+ * Decode one code point from UTF-16 (a single unit or a surrogate pair) and append it as UTF-8.
+ *
+ * @param [in] ptr `the first UTF16 unit to read.`
+ * @param [inout] available `in: the number of UTF16 units readable at ptr, out: the number consumed (1 or 2).`
+ * @param [inout] output `the write position in the utf8 buffer, advanced past the bytes written.`
+ **/
+fn void! char16_to_utf8_unsafe(Char16 *ptr, usize *available, char** output)
+{
+	Char16 first = *ptr;
+	if ((first & UTF16_SURROGATE_GENERIC_MASK) != UTF16_SURROGATE_GENERIC_VALUE)
+	{
+		// A unit outside the surrogate range is a complete code point on its own.
+		char32_to_utf8_unsafe(first, output);
+		*available = 1;
+		return;
+	}
+	// A pair has to start with a high surrogate, so a leading low surrogate is rejected.
+	if ((first & UTF16_SURROGATE_MASK) != UTF16_SURROGATE_HIGH_VALUE) return UnicodeResult.INVALID_UTF16!;
+
+	// A high surrogate in the last available unit has nothing to pair with.
+	if (*available == 1) return UnicodeResult.INVALID_UTF16!;
+
+	// The unit following a high surrogate must be a low surrogate.
+	Char16 second = ptr[1];
+	if ((second & UTF16_SURROGATE_MASK) != UTF16_SURROGATE_LOW_VALUE) return UnicodeResult.INVALID_UTF16!;
+
+	// The high surrogate carries the upper payload bits and the low surrogate the lower ones;
+	// together they hold the code point minus UTF16_SURROGATE_OFFSET.
+	Char32 upper = (first & UTF16_SURROGATE_CODEPOINT_MASK) << UTF16_SURROGATE_BITS;
+	Char32 lower = second & UTF16_SURROGATE_CODEPOINT_MASK;
+	char32_to_utf8_unsafe((upper | lower) + UTF16_SURROGATE_OFFSET, output);
+	*available = 2;
+}
+/**
+ * Append one code point to a UTF-8 buffer as 1-4 bytes, without any bounds checking.
+ * The byte count per range has to agree with utf8len_for_utf32 and utf8len_for_utf16.
+ * @param c `The utf32 codepoint to convert`
+ * @param [inout] output `the write position, advanced past the bytes written`
+ **/
+fn void char32_to_utf8_unsafe(Char32 c, char** output)
+{
+	switch (true)
+	{
+		case c <= 0x7f:
+			(*output)++[0] = (char)c;
+		case c <= 0x7ff:
+			(*output)++[0] = (char)(0xC0 | (c >> 6));
+			(*output)++[0] = (char)(0x80 | (c & 0x3F));
+		case c <= 0xffff:
+			(*output)++[0] = (char)(0xE0 | (c >> 12));
+			(*output)++[0] = (char)(0x80 | ((c >> 6) & 0x3F));
+			(*output)++[0] = (char)(0x80 | (c & 0x3F));
+		default:
+			(*output)++[0] = (char)(0xF0 | (c >> 18));
+			(*output)++[0] = (char)(0x80 | ((c >> 12) & 0x3F));
+			(*output)++[0] = (char)(0x80 | ((c >> 6) & 0x3F));
+			(*output)++[0] = (char)(0x80 | (c & 0x3F));
+	}
+}
+
+/**
+ * Decode one code point from the start of a UTF-8 sequence.
+ *
+ * @param [in] ptr `pointer to the first character to parse`
+ * @param [inout] size `Set to max characters to read, set to characters read`
+ * @return `the parsed 32 bit codepoint, INVALID_UTF8 on a bad lead byte, a bad continuation byte or truncated input`
+ **/
+fn Char32! utf8_to_char32(char* ptr, usize* size)
+{
+	usize max_size = *size;
+	if (max_size < 1) return UnicodeResult.INVALID_UTF8!;
+	char c = (ptr++)[0];
+
+	// 0xxxxxxx: a single byte.
+	if ((c & 0x80) == 0)
+	{
+		*size = 1;
+		return c;
+	}
+	// 110xxxxx: one continuation byte follows.
+	if ((c & 0xE0) == 0xC0)
+	{
+		if (max_size < 2) return UnicodeResult.INVALID_UTF8!;
+		*size = 2;
+		Char32 uc = (c & 0x1F) << 6;
+		c = *ptr;
+		if ((c & 0xC0) != 0x80) return UnicodeResult.INVALID_UTF8!;
+		return uc + (c & 0x3F);
+	}
+	// 1110xxxx: two continuation bytes follow.
+	if ((c & 0xF0) == 0xE0)
+	{
+		if (max_size < 3) return UnicodeResult.INVALID_UTF8!;
+		*size = 3;
+		Char32 uc = (c & 0x0F) << 12;
+		c = ptr++[0];
+		if ((c & 0xC0) != 0x80) return UnicodeResult.INVALID_UTF8!;
+		uc += (c & 0x3F) << 6;
+		c = ptr++[0];
+		if ((c & 0xC0) != 0x80) return UnicodeResult.INVALID_UTF8!;
+		return uc + (c & 0x3F);
+	}
+	// Only 11110xxx may start a four byte sequence. Without this check a stray continuation
+	// byte (10xxxxxx) or 0xF8-0xFF would be decoded as if it were a valid lead byte.
+	if ((c & 0xF8) != 0xF0) return UnicodeResult.INVALID_UTF8!;
+	if (max_size < 4) return UnicodeResult.INVALID_UTF8!;
+	*size = 4;
+	Char32 uc = (c & 0x07) << 18;
+	c = ptr++[0];
+	if ((c & 0xC0) != 0x80) return UnicodeResult.INVALID_UTF8!;
+	uc += (c & 0x3F) << 12;
+	c = ptr++[0];
+	if ((c & 0xC0) != 0x80) return UnicodeResult.INVALID_UTF8!;
+	uc += (c & 0x3F) << 6;
+	c = ptr++[0];
+	if ((c & 0xC0) != 0x80) return UnicodeResult.INVALID_UTF8!;
+	return uc + (c & 0x3F);
+}
+
+/**
+ * Count the code points in UTF-8 data: every byte that is not a continuation byte (10xxxxxx) counts as one.
+ * @param utf8 `An UTF-8 encoded slice of bytes`
+ * @return `the number of encoded code points`
+ **/
+fn usize utf8_codepoints(char[] utf8)
+{
+	usize len = 0;
+	foreach (char c : utf8)
+	{
+		if ((c & 0xC0) != 0x80) len++;
+	}
+	return len;
+}
+
+/**
+ * Calculate the UTF8 length required to encode an UTF32 array.
+ * The ranges mirror char32_to_utf8_unsafe, so the result is the number of bytes that function writes.
+ * @param [in] utf32 `the utf32 data to calculate from`
+ * @return `the length of the resulting UTF8 array`
+ **/
+fn usize utf8len_for_utf32(Char32[] utf32)
+{
+	usize len = 0;
+	foreach (Char32 uc : utf32)
+	{
+		switch (true)
+		{
+			case uc <= 0x7f: len++;
+			case uc <= 0x7ff: len += 2;
+			case uc <= 0xffff: len += 3;
+			default: len += 4;
+		}
+	}
+	return len;
+}
+
+/**
+ * Calculate the UTF8 length required to encode an UTF16 array.
+ * A unit in the surrogate range counts as 2 bytes, so a complete pair adds up to the
+ * 4 bytes its code point needs; any other unit takes 1-3 bytes depending on its value.
+ * @param [in] utf16 `the utf16 data to calculate from`
+ * @return `the length of the resulting UTF8 array`
+ **/
+fn usize utf8len_for_utf16(Char16[] utf16)
+{
+	usize len = 0;
+	foreach (Char16 c : utf16)
+	{
+		switch (true)
+		{
+			// Both halves of a pair pass through here, hence 2 each rather than 4.
+			case (c & UTF16_SURROGATE_GENERIC_MASK) == UTF16_SURROGATE_GENERIC_VALUE: len += 2;
+			case c <= 0x7f: len++;
+			case c <= 0x7ff: len += 2;
+			default: len += 3;
+		}
+	}
+	return len;
+}
+
+/**
+ * Calculate the UTF16 length required to encode a UTF8 array.
+ * Only the first byte of each sequence is inspected; the bytes after it are skipped unchecked.
+ * A four byte sequence counts as two units (a surrogate pair), every other sequence as one.
+ * @param utf8 `the utf8 data to calculate from`
+ * @return `the length of the resulting UTF16 array`
+ **/
+fn usize utf16len_for_utf8(char[] utf8)
+{
+	usize units = 0;
+	for (usize pos = 0; pos < utf8.len; units++)
+	{
+		char lead = utf8[pos];
+		switch (true)
+		{
+			case (lead & 0x80) == 0: pos += 1;
+			case (lead & 0xE0) == 0xC0: pos += 2;
+			case (lead & 0xF0) == 0xE0: pos += 3;
+			default: pos += 4; units++;
+		}
+	}
+	return units;
+}
+
+/**
+ * @param [in] utf32 `the UTF32 array to check the length for`
+ * @return `the required length of an UTF16 array to hold the UTF32 data: one unit per code point, two from 0x10000 up.`
+ **/
+fn usize utf16len_for_utf32(Char32[] utf32)
+{
+	usize units = 0;
+	foreach (Char32 uc : utf32)
+	{
+		units += (uc < UTF16_SURROGATE_OFFSET) ? 1 : 2;
+	}
+	return units;
+}
+
+/**
+ * Convert an UTF32 array to an UTF8 array.
+ * Fails with the error from char32_to_utf8 (CONVERSION_FAILED) when the buffer runs out of room.
+ *
+ * @param [in] utf32 `the code points to encode`
+ * @param [out] utf8_buffer `the buffer receiving the UTF8 bytes`
+ * @return `the number of bytes written.`
+ **/
+fn usize! utf32to8(Char32[] utf32, char[] utf8_buffer)
+{
+	usize written = 0;
+	foreach (Char32 uc : utf32)
+	{
+		// In: the space that is left. Out: the number of bytes this code point used.
+		usize used = utf8_buffer.len - written;
+		char32_to_utf8(uc, utf8_buffer.ptr + written, &used) @inline?;
+		written += used;
+	}
+	return written;
+}
+
+/**
+ * Convert an UTF8 array to an UTF32 array.
+ * Fails with CONVERSION_FAILED if the buffer is full while input remains, or with the error from utf8_to_char32.
+ *
+ * @param [in] utf8 `the UTF8 bytes to decode`
+ * @param [out] utf32_buffer `the buffer receiving the code points`
+ * @return `the number of Char32s written.`
+ **/
+fn usize! utf8to32(char[] utf8, Char32[] utf32_buffer)
+{
+	usize count = 0;
+	usize pos = 0;
+	while (pos < utf8.len)
+	{
+		// Room is checked before decoding, so a full buffer is reported ahead of any decoding error.
+		if (count == utf32_buffer.len) return UnicodeResult.CONVERSION_FAILED!;
+		usize width = utf8.len - pos;
+		Char32 uc = utf8_to_char32(&utf8[pos], &width) @inline?;
+		utf32_buffer.ptr[count++] = uc;
+		pos += width;
+	}
+	return count;
+}
+
+/**
+ * Copy an array of UTF16 data into an UTF8 buffer without bounds
+ * checking. This will assume the buffer is sufficiently large to hold
+ * the converted data. Errors from char16_to_utf8_unsafe are passed on to the caller.
+ * @param [in] utf16 `The UTF16 array containing the data to convert.`
+ * @param [out] utf8_buffer `the (sufficiently large) buffer to hold the converted UTF8 data.`
+ **/
+fn void! utf16to8_unsafe(Char16[] utf16, char* utf8_buffer)
+{
+	usize pos = 0;
+	while (pos < utf16.len)
+	{
+		// In: the units left to read. Out: the units used by this code point (1 or 2).
+		usize step = utf16.len - pos;
+		char16_to_utf8_unsafe(&utf16[pos], &step, &utf8_buffer) @inline?;
+		pos += step;
+	}
+}
+
+/**
+ * Copy an array of UTF8 data into an UTF32 buffer without bounds
+ * checking. This will assume the buffer is sufficiently large to hold
+ * the converted data. Errors from utf8_to_char32 are passed on to the caller.
+ * @param [in] utf8 `The UTF8 buffer containing the data to convert.`
+ * @param [out] utf32_buffer `the (sufficiently large) buffer to hold the converted UTF32 data.`
+ **/
+fn void! utf8to32_unsafe(char[] utf8, Char32* utf32_buffer)
+{
+	usize pos = 0;
+	while (pos < utf8.len)
+	{
+		// In: the bytes left to read. Out: the bytes used by the decoded code point.
+		usize used = utf8.len - pos;
+		Char32 uc = utf8_to_char32(&utf8[pos], &used) @inline?;
+		(utf32_buffer++)[0] = uc;
+		pos += used;
+	}
+}
+
+/**
+ * Copy an array of UTF8 data into an UTF16 buffer without bounds
+ * checking. This will assume the buffer is sufficiently large to hold
+ * the converted data. Errors from utf8_to_char32 are passed on to the caller.
+ * @param [in] utf8 `The UTF8 buffer containing the data to convert.`
+ * @param [out] utf16_buffer `the (sufficiently large) buffer to hold the converted UTF16 data.`
+ **/
+fn void! utf8to16_unsafe(char[] utf8, Char16* utf16_buffer)
+{
+	usize pos = 0;
+	while (pos < utf8.len)
+	{
+		// In: the bytes left to read. Out: the bytes used by the decoded code point.
+		usize used = utf8.len - pos;
+		Char32 uc = utf8_to_char32(&utf8[pos], &used) @inline?;
+		char32_to_utf16_unsafe(uc, &utf16_buffer) @inline;
+		pos += used;
+	}
+}
+
+/**
+ * Copy an array of UTF32 code points into an UTF8 buffer without bounds
+ * checking. This will assume the buffer is sufficiently large to hold
+ * the converted data.
+ * The number of bytes written is not reported back to the caller.
+ *
+ * @param [in] utf32 `The UTF32 buffer containing the data to convert.`
+ * @param [out] utf8_buffer `the (sufficiently large) buffer to hold the converted UTF8 data.`
+ **/
+fn void utf32to8_unsafe(Char32[] utf32, char* utf8_buffer)
+{
+	foreach (Char32 uc : utf32)
+	{
+		char32_to_utf8_unsafe(uc, &utf8_buffer) @inline;
+	}
+}
--- a/lib/std/core/str.c3
+++ b/lib/std/core/str.c3
@@ -1,5 +1,4 @@
 module std::core::str;
-
 define ZString = distinct char*;
 define Char32 = uint;
 define Char16 = ushort;
@@ -53,86 +52,10 @@ fault UnicodeResult
 {
 	INVALID_UTF8,
 	INVALID_UTF16,
+	CONVERSION_FAILED,
 }

-/**
- * @param c `The utf32 codepoint to convert`
- * @param [out] output `the resulting buffer`
- *
- * @return `the number of characters written 1-4`
- **/
-fn char char32_to_utf8(Char32 c, char* output)
-{
-	if (c < 0x7f)
-	{
-		output[0] = (char)c;
-		return 1;
-    }
-    if (c < 0x7ff)
-    {
-		output[0] = (char)(0xC0 | c >> 6);
-        output[1] = (char)(0x80 | (c & 0x3F));
-        return 2;
-    }
-    if (c < 0xffff)
-    {
-        output[0] = (char)(0xE0 | c >> 12);
-        output[1] = (char)(0x80 | (c >> 6 & 0x3F));
-        output[2] = (char)(0x80 | (c & 0x3F));
-        return 3;
-    }
-    output[0] = (char)(0xF0 | c >> 18);
-    output[1] = (char)(0x80 | (c >> 12 & 0x3F));
-    output[2] = (char)(0x80 | (c >> 6 & 0x3F));
-    output[3] = (char)(0x80 | (c & 0x3F));
-    return 4;
-}

-fn Char32! utf8CharTo32(char* ptr, int* size)
-{
-	int max_size = *size;
-	if (max_size < 1) return UnicodeResult.INVALID_UTF8!;
-	char c = (ptr++)[0];
-
-    if ((c & 0x80) == 0)
-    {
-        *size = 1;
-        return c;
-    }
-    if ((c & 0xE0) == 0xC0)
-    {
-		if (max_size < 2) return UnicodeResult.INVALID_UTF8!;
-        *size = 2;
-        Char32 uc = (c & 0x1F) << 6;
-		c = *ptr;
-		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-		return uc + c & 0x3F;
-    }
-    if ((c & 0xF0) == 0xE0)
-    {
-		if (max_size < 3) return UnicodeResult.INVALID_UTF8!;
-        *size = 3;
-        Char32 uc = (c & 0x0F) << 12;
-		c = ptr++[0];
-		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-		uc += (c & 0x3F) << 6;
-		c = ptr++[0];
-		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-		return uc + c & 0x3F;
-    }
-    if (max_size < 4) return UnicodeResult.INVALID_UTF8!;
-    *size = 4;
-    Char32 uc = (c & 0x07) << 18;
-	c = ptr++[0];
-	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-	uc += (c & 0x3F) << 12;
-	c = ptr++[0];
-	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-	uc += (c & 0x3F) << 6;
-	c = ptr++[0];
-	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-	return uc + c & 0x3F;
-}

 fn usize utf8_codepoints(char[] utf8)
 {
@@ -147,78 +70,42 @@ fn usize utf8_codepoints(char[] utf8)
 fn Char32[]! utf8to32(char[] utf8, Allocator allocator = { null, null })
 {
 	if (!allocator.function) allocator = mem::current_allocator();
-	usize len = utf8.len;
-	Char32* data = allocator.alloc((len + 1) * Char32.sizeof)?;
-	usize len32 = 0;
-	for (usize i = 0; i < len;)
-	{
-		int width = (int)min(len - i, 4);
-		Char32 uc = utf8CharTo32(&utf8[i], &width) @inline?;
-		i += width;
-		data[len32++] = uc;
-	}
-	return data[0 .. len32 - 1];
+	usize codepoints = conv::utf8_codepoints(utf8); // count first, so the buffer can be sized up front
+	Char32* data = allocator.alloc(Char32.sizeof * (codepoints + 1))?; // one extra element for the terminator
+	conv::utf8to32_unsafe(utf8, data)?; // NOTE(review): data does not appear to be released when this fails
+	data[codepoints] = 0;
+	return data[0..codepoints - 1]; // NOTE(review): codepoints - 1 looks like it wraps for empty input - confirm
 }
 
+fn char[] utf32to8(Char32[] utf32, Allocator allocator = { null, null })
+{
+	usize len = conv::utf8len_for_utf32(utf32); // byte count the unchecked encoder below will write
+	if (!allocator.function) allocator = mem::current_allocator();
+	char* data = allocator.alloc(len + 1)!!; // one extra byte for the terminator
+	conv::utf32to8_unsafe(utf32, data);
+	data[len] = 0;
+	return data[0..len - 1]; // NOTE(review): len - 1 looks like it wraps for empty input - confirm
+}
 
 fn Char16[]! utf8to16(char[] utf8, Allocator allocator = { null, null })
 {
 	if (!allocator.function) allocator = mem::current_allocator();
-	usize len = utf8.len;
-	Char16* data = allocator.alloc((len + 1) * Char16.sizeof)?;
-	usize len16 = 0;
-	for (usize i = 0; i < len;)
-	{
-		int width = (int)min(len - i, 4);
-		Char32 uc = utf8CharTo32(&utf8[i], &width) @inline?;
-		i += width;
-		if (uc <= 0xFFFF)
-		{
-			data[len16++] = (Char16)uc;
-			continue;
-		}
-		uc -= SURROGATE_OFFSET;
-		Char16 low = (Char16)(SURROGATE_LOW_VALUE | (uc & SURROGATE_CODEPOINT_MASK));
-		uc >>= SURROGATE_BITS;
-		Char16 high = (Char16)(SURROGATE_HIGH_VALUE | (uc & SURROGATE_CODEPOINT_MASK));
-		data[len16++] = high;
-		data[len16++] = low;
-	}
-	return data[0 .. len16 - 1];
+	usize len16 = conv::utf16len_for_utf8(utf8); // unit count, computed from the lead bytes only
+	Char16* data = allocator.alloc((len16 + 1) * Char16.sizeof)?; // one extra unit for the terminator
+	conv::utf8to16_unsafe(utf8, data)?; // NOTE(review): data does not appear to be released when this fails
+	data[len16] = 0;
+	return data[0..len16 - 1]; // NOTE(review): len16 - 1 looks like it wraps for empty input - confirm
 }
 
 
 fn char[]! utf16to8(Char16[] utf16, Allocator allocator = { null, null })
 {
 	if (!allocator.function) allocator = mem::current_allocator();
-	String str = string::new_with_capacity(utf16.len * 2 + 1, allocator);
-	usize len = utf16.len;
-	for (usize i = 0; i < len; i++)
-	{
-		Char16 high = utf16[i];
-		if (high & SURROGATE_GENERIC_MASK != SURROGATE_HIGH_VALUE)
-		{
-			str.append_char32(high);
-			continue;
-		}
-		// Low surrogate first is an error
-		if (high & SURROGATE_MASK != SURROGATE_HIGH_VALUE) return UnicodeResult.INVALID_UTF16!;
-		// Unmatched high surrogate is an error
-		if (i == len - 1) return UnicodeResult.INVALID_UTF16!;
-
-		Char16 low = utf16[++i];
-		// Unmatched high surrogate, invalid
-		if (low & SURROGATE_MASK != SURROGATE_LOW_VALUE) return UnicodeResult.INVALID_UTF16!;
-
-        // The high bits of the codepoint are the value bits of the high surrogate
-        // The low bits of the codepoint are the value bits of the low surrogate
-        Char32 uc = (high & SURROGATE_CODEPOINT_MASK) << SURROGATE_BITS | (low & SURROGATE_CODEPOINT_MASK) + SURROGATE_OFFSET;
-		str.append_char32(uc);
-	}
-	usize new_len = str.len();
-	ZString zstr = str.copy_zstr();
-	str.destroy();
-	return zstr[0 .. new_len - 1];
+	usize len = conv::utf8len_for_utf16(utf16); // byte count for the converted data
+	char* data = allocator.alloc(len + 1)?; // one extra byte for the terminator
+	conv::utf16to8_unsafe(utf16, data)?; // NOTE(review): data does not appear to be released when this fails
+	data[len] = 0; // write the terminator the extra byte was allocated for, as the three converters above do
+	return data[0 .. len - 1]; // NOTE(review): len - 1 looks like it wraps for empty input - confirm
 }

 fn char[] copy(char[] s)
--- a/lib/std/core/string_iterator.c3
+++ b/lib/std/core/string_iterator.c3
@@ -18,8 +18,8 @@ fn Char32! StringIterator.next(StringIterator* this)
 	usize len = this.utf8.len;
 	usize current = this.current;
 	if (current >= len) return IteratorResult.NO_MORE_ELEMENT!;
-	int read = (int)(len - current < 4 ? len - current : 4);
-	Char32 res = str::utf8CharTo32(&this.utf8[current], &read)?;
+	usize read = (len - current < 4 ? len - current : 4);
+	Char32 res = conv::utf8_to_char32(&this.utf8[current], &read)?;
 	this.current += read;
 	return res;
 }