diff --git a/lib/std/core/conv.c3 b/lib/std/core/conv.c3
index 2e2b8fc9d..f799cdb12 100644
--- a/lib/std/core/conv.c3
+++ b/lib/std/core/conv.c3
@@ -1,2 +1,416 @@
 module std::core::string::conv;
+private const uint UTF16_SURROGATE_OFFSET = 0x10000; // first code point that needs a surrogate pair
+private const uint UTF16_SURROGATE_GENERIC_MASK = 0xF800; // (unit & GENERIC_MASK) == GENERIC_VALUE matches any surrogate unit
+private const uint UTF16_SURROGATE_GENERIC_VALUE = 0xD800;
+private const uint UTF16_SURROGATE_MASK = 0xFC00; // (unit & MASK) tells a high surrogate from a low one
+private const uint UTF16_SURROGATE_CODEPOINT_MASK = 0x03FF; // the 10 payload bits carried by each surrogate
+private const uint UTF16_SURROGATE_BITS = 10;
+private const uint UTF16_SURROGATE_LOW_VALUE = 0xDC00;
+private const uint UTF16_SURROGATE_HIGH_VALUE = 0xD800;
+
+/**
+ * Encode one code point as UTF-8 into a bounded buffer, failing with CONVERSION_FAILED if it does not fit.
+ *
+ * @param c `The utf32 codepoint to convert`
+ * @param [out] output `the resulting buffer`
+ * @param [inout] size `in: the size available, out: the number of bytes written (1-4)`
+ **/
+fn void! char32_to_utf8(Char32 c, char* output, usize *size)
+{
+	usize available = *size;
+	if (!available) return UnicodeResult.CONVERSION_FAILED!;
+	switch (true)
+	{
+		case c <= 0x7f: // bounds are inclusive: 0x7f is still a single byte
+			output[0] = (char)c;
+			*size = 1;
+		case c <= 0x7ff: // 11 payload bits fit in two bytes
+			if (available < 2) return UnicodeResult.CONVERSION_FAILED!;
+			output[0] = (char)(0xC0 | c >> 6);
+			output[1] = (char)(0x80 | (c & 0x3F));
+			*size = 2;
+		case c <= 0xffff: // 16 payload bits fit in three bytes
+			if (available < 3) return UnicodeResult.CONVERSION_FAILED!;
+			output[0] = (char)(0xE0 | c >> 12);
+			output[1] = (char)(0x80 | (c >> 6 & 0x3F));
+			output[2] = (char)(0x80 | (c & 0x3F));
+			*size = 3;
+		default:
+			if (available < 4) return UnicodeResult.CONVERSION_FAILED!;
+			output[0] = (char)(0xF0 | c >> 18);
+			output[1] = (char)(0x80 | (c >> 12 & 0x3F));
+			output[2] = (char)(0x80 | (c >> 6 & 0x3F));
+			output[3] = (char)(0x80 | (c & 0x3F));
+			*size = 4;
+	}
+}
+
+/**
+ * Convert a code point into 1-2 UTF16 code units, advancing the output pointer past them.
+ *
+ * @param c `The character to convert.`
+ * @param [inout] output `the resulting UTF16 buffer to write to.`
+ **/
+fn void char32_to_utf16_unsafe(Char32 c, Char16** output)
+{
+	if (c < UTF16_SURROGATE_OFFSET)
+	{
+		(*output)++[0] = (Char16)c;
+		return;
+	}
+	c -= UTF16_SURROGATE_OFFSET;
+	Char16 low = (Char16)(UTF16_SURROGATE_LOW_VALUE | (c & UTF16_SURROGATE_CODEPOINT_MASK));
+	c >>= UTF16_SURROGATE_BITS;
+	Char16 high = (Char16)(UTF16_SURROGATE_HIGH_VALUE | (c & UTF16_SURROGATE_CODEPOINT_MASK));
+	(*output)++[0] = (Char16)high;
+	(*output)++[0] = (Char16)low;
+}
+
+/**
+ * Convert 1-2 UTF16 code units into UTF8, advancing the output pointer past the bytes written.
+ *
+ * @param [in] ptr `The UTF16 data to convert.`
+ * @param [inout] available `in: amount of UTF16 data available, out: amount consumed (1 or 2).`
+ * @param [inout] output `the resulting utf8 buffer to write to.`
+ **/
+fn void! char16_to_utf8_unsafe(Char16 *ptr, usize *available, char** output)
+{
+	Char16 high = *ptr;
+	if (high & UTF16_SURROGATE_GENERIC_MASK != UTF16_SURROGATE_GENERIC_VALUE)
+	{
+		char32_to_utf8_unsafe(high, output);
+		*available = 1;
+		return;
+	}
+	// Low surrogate first is an error
+	if (high & UTF16_SURROGATE_MASK != UTF16_SURROGATE_HIGH_VALUE) return UnicodeResult.INVALID_UTF16!;
+
+	// A high surrogate as the last available unit is an error
+	if (*available == 1) return UnicodeResult.INVALID_UTF16!;
+
+	Char16 low = ptr[1];
+
+	// A high surrogate that is not followed by a low surrogate is invalid
+	if (low & UTF16_SURROGATE_MASK != UTF16_SURROGATE_LOW_VALUE) return UnicodeResult.INVALID_UTF16!;
+
+	// The high bits of the codepoint are the value bits of the high surrogate
+	// The low bits of the codepoint are the value bits of the low surrogate
+	Char32 uc = (high & UTF16_SURROGATE_CODEPOINT_MASK) << UTF16_SURROGATE_BITS
+		| (low & UTF16_SURROGATE_CODEPOINT_MASK) + UTF16_SURROGATE_OFFSET;
+	char32_to_utf8_unsafe(uc, output);
+	*available = 2;
+}
+/**
+ * Encode one code point as UTF-8 without bounds checking, advancing the output pointer past the bytes written.
+ *
+ * @param c `The utf32 codepoint to convert`
+ * @param [inout] output `the resulting buffer`
+ **/
+fn void char32_to_utf8_unsafe(Char32 c, char** output)
+{
+	switch (true)
+	{
+		case c <= 0x7f: // same inclusive bounds as utf8len_for_utf32, which sizes the buffer
+			(*output)++[0] = (char)c;
+		case c <= 0x7ff:
+			(*output)++[0] = (char)(0xC0 | c >> 6);
+			(*output)++[0] = (char)(0x80 | (c & 0x3F));
+		case c <= 0xffff:
+			(*output)++[0] = (char)(0xE0 | c >> 12);
+			(*output)++[0] = (char)(0x80 | (c >> 6 & 0x3F));
+			(*output)++[0] = (char)(0x80 | (c & 0x3F));
+		default:
+			(*output)++[0] = (char)(0xF0 | c >> 18);
+			(*output)++[0] = (char)(0x80 | (c >> 12 & 0x3F));
+			(*output)++[0] = (char)(0x80 | (c >> 6 & 0x3F));
+			(*output)++[0] = (char)(0x80 | (c & 0x3F));
+	}
+}
+
+/**
+ * Decode one UTF-8 sequence into a code point. Fails with INVALID_UTF8 on truncated input,
+ * an invalid lead byte or an invalid continuation byte. Overlong forms are not rejected.
+ *
+ * @param [in] ptr `pointer to the first character to parse`
+ * @param [inout] size `Set to max characters to read, set to characters read`
+ * @return `the parsed 32 bit codepoint`
+ **/
+fn Char32! utf8_to_char32(char* ptr, usize* size)
+{
+	usize max_size = *size;
+	if (max_size < 1) return UnicodeResult.INVALID_UTF8!;
+	char c = (ptr++)[0];
+
+	if ((c & 0x80) == 0)
+	{
+		*size = 1;
+		return c;
+	}
+	if ((c & 0xE0) == 0xC0)
+	{
+		if (max_size < 2) return UnicodeResult.INVALID_UTF8!;
+		*size = 2;
+		Char32 uc = (c & 0x1F) << 6;
+		c = *ptr;
+		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+		return uc + c & 0x3F;
+	}
+	if ((c & 0xF0) == 0xE0)
+	{
+		if (max_size < 3) return UnicodeResult.INVALID_UTF8!;
+		*size = 3;
+		Char32 uc = (c & 0x0F) << 12;
+		c = ptr++[0];
+		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+		uc += (c & 0x3F) << 6;
+		c = ptr++[0];
+		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+		return uc + c & 0x3F;
+	}
+	// Anything that is not 11110xxx here is a stray continuation byte or an invalid lead byte (0xF8-0xFF).
+	if ((c & 0xF8) != 0xF0 || max_size < 4) return UnicodeResult.INVALID_UTF8!;
+	*size = 4;
+	Char32 uc = (c & 0x07) << 18;
+	c = ptr++[0];
+	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+	uc += (c & 0x3F) << 12;
+	c = ptr++[0];
+	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+	uc += (c & 0x3F) << 6;
+	c = ptr++[0];
+	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
+	return uc + c & 0x3F;
+}
+
+/**
+ * Count code points by counting every byte that is not a continuation byte (10xxxxxx).
+ *
+ * @param utf8 `An UTF-8 encoded slice of bytes`
+ * @return `the number of encoded code points`
+ **/
+fn usize utf8_codepoints(char[] utf8)
+{
+	usize len = 0;
+	foreach (char c : utf8)
+	{
+		if (c & 0xC0 != 0x80) len++;
+	}
+	return len;
+}
+
+/**
+ * Calculate the UTF8 length required to encode an UTF32 array.
+ * @param [in] utf32 `the utf32 data to calculate from`
+ * @return `the length of the resulting UTF8 array`
+ **/
+fn usize utf8len_for_utf32(Char32[] utf32)
+{
+	usize len = 0;
+	foreach (Char32 uc : utf32)
+	{
+		switch (true)
+		{
+			case uc <= 0x7f: // must mirror the widths chosen by char32_to_utf8_unsafe
+				len++;
+			case uc <= 0x7ff:
+				len += 2;
+			case uc <= 0xffff:
+				len += 3;
+			default:
+				len += 4;
+		}
+	}
+	return len;
+}
+
+/**
+ * Calculate the UTF8 length required to encode an UTF16 array.
+ * @param [in] utf16 `the utf16 data to calculate from`
+ * @return `the length of the resulting UTF8 array`
+ **/
+fn usize utf8len_for_utf16(Char16[] utf16)
+{
+	usize len = 0;
+	usize len16 = utf16.len;
+	for (usize i = 0; i < len16; i++)
+	{
+		Char16 c = utf16[i];
+		if (c & UTF16_SURROGATE_GENERIC_MASK != UTF16_SURROGATE_GENERIC_VALUE)
+		{
+			if (c <= 0x7f)
+			{
+				len++;
+				continue;
+			}
+			if (c <= 0x7ff)
+			{
+				len += 2;
+				continue;
+			}
+			len += 3;
+			continue;
+		}
+		len += 2; // each half of a surrogate pair accounts for 2 of the pair's 4 UTF8 bytes
+	}
+	return len;
+}
+
+/**
+ * Calculate the UTF16 length required to encode a UTF8 array.
+ * @param utf8 `the utf8 data to calculate from`
+ * @return `the length of the resulting UTF16 array`
+ **/
+fn usize utf16len_for_utf8(char[] utf8)
+{
+	usize len = utf8.len;
+	usize len16 = 0;
+	for (usize i = 0; i < len; i++)
+	{
+		len16++; // every sequence produces at least one UTF16 unit
+		char c = utf8[i];
+		if (c & 0x80 == 0) continue; // single byte; the loop increment steps past it
+		i++; // skip one trailing byte
+		if (c & 0xE0 == 0xC0) continue; // 2-byte sequence
+		i++;
+		if (c & 0xF0 == 0xE0) continue; // 3-byte sequence
+		i++;
+		len16++; // anything else is counted as a 4-byte sequence, i.e. two UTF16 units
+	}
+	return len16;
+}
+
+/**
+ * @param [in] utf32 `the UTF32 array to check the length for`
+ * @return `the required length of an UTF16 array to hold the UTF32 data.`
+ **/
+fn usize utf16len_for_utf32(Char32[] utf32)
+{
+	usize len = utf32.len;
+	foreach (Char32 uc : utf32)
+	{
+		if (uc >= UTF16_SURROGATE_OFFSET) len++; // one extra unit for each surrogate pair
+	}
+	return len;
+}
+
+/**
+ * Convert an UTF32 array to an UTF8 array, failing with CONVERSION_FAILED if the buffer is too small.
+ *
+ * @param [in] utf32 `the UTF32 code points to convert`
+ * @param [out] utf8_buffer `the buffer receiving the UTF8 bytes`
+ * @return `the number of bytes written.`
+ **/
+fn usize! utf32to8(Char32[] utf32, char[] utf8_buffer)
+{
+	usize len = utf8_buffer.len;
+	char* ptr = utf8_buffer.ptr;
+	foreach (Char32 uc : utf32)
+	{
+		usize size = len; // in: space left, out: bytes written by char32_to_utf8
+		char32_to_utf8(uc, ptr, &size) @inline?;
+		len -= size;
+		ptr += size;
+	}
+	return utf8_buffer.len - len;
+}
+
+/**
+ * Convert an UTF8 array to an UTF32 array, failing with CONVERSION_FAILED if the buffer is too small.
+ *
+ * @param [in] utf8 `the UTF8 bytes to convert`
+ * @param [out] utf32_buffer `the buffer receiving the code points`
+ * @return `the number of Char32s written.`
+ **/
+fn usize! utf8to32(char[] utf8, Char32[] utf32_buffer)
+{
+	usize len = utf8.len;
+	Char32* ptr = utf32_buffer.ptr;
+	usize len32 = 0;
+	usize buf_len = utf32_buffer.len;
+	for (usize i = 0; i < len;)
+	{
+		if (len32 == buf_len) return UnicodeResult.CONVERSION_FAILED!;
+		usize width = len - i; // in: bytes left, out: bytes consumed by utf8_to_char32
+		Char32 uc = utf8_to_char32(&utf8[i], &width) @inline?;
+		i += width;
+		ptr[len32++] = uc;
+	}
+	return len32;
+}
+
+/**
+ * Copy an array of UTF16 data into an UTF8 buffer without bounds
+ * checking. This will assume the buffer is sufficiently large to hold
+ * the converted data.
+ *
+ * @param [in] utf16 `The UTF16 array containing the data to convert.`
+ * @param [out] utf8_buffer `the (sufficiently large) buffer to hold the converted UTF8 data.`
+ **/
+fn void! utf16to8_unsafe(Char16[] utf16, char* utf8_buffer)
+{
+	usize len16 = utf16.len;
+	for (usize i = 0; i < len16;)
+	{
+		usize available = len16 - i; // in: units left, out: units consumed (1 or 2)
+		char16_to_utf8_unsafe(&utf16[i], &available, &utf8_buffer) @inline?;
+		i += available;
+	}
+}
+
+/**
+ * Copy an array of UTF8 data into an UTF32 buffer without bounds
+ * checking. This will assume the buffer is sufficiently large to hold
+ * the converted data.
+ *
+ * @param [in] utf8 `The UTF8 buffer containing the data to convert.`
+ * @param [out] utf32_buffer `the (sufficiently large) buffer to hold the converted UTF32 data.`
+ **/
+fn void! utf8to32_unsafe(char[] utf8, Char32* utf32_buffer)
+{
+	usize len = utf8.len;
+	for (usize i = 0; i < len;)
+	{
+		usize width = len - i;
+		Char32 uc = utf8_to_char32(&utf8[i], &width) @inline?;
+		i += width;
+		(utf32_buffer++)[0] = uc;
+	}
+}
+
+/**
+ * Copy an array of UTF8 data into an UTF16 buffer without bounds
+ * checking. This will assume the buffer is sufficiently large to hold
+ * the converted data.
+ *
+ * @param [in] utf8 `The UTF8 buffer containing the data to convert.`
+ * @param [out] utf16_buffer `the (sufficiently large) buffer to hold the converted UTF16 data.`
+ **/
+fn void! utf8to16_unsafe(char[] utf8, Char16* utf16_buffer)
+{
+	usize len = utf8.len;
+	for (usize i = 0; i < len;)
+	{
+		usize width = len - i;
+		Char32 uc = utf8_to_char32(&utf8[i], &width) @inline?;
+		char32_to_utf16_unsafe(uc, &utf16_buffer) @inline; // advances utf16_buffer by 1 or 2 units
+		i += width;
+	}
+}
+
+/**
+ * Copy an array of UTF32 code points into an UTF8 buffer without bounds
+ * checking. This will assume the buffer is sufficiently large to hold
+ * the converted data.
+ *
+ * @param [in] utf32 `The UTF32 buffer containing the data to convert.`
+ * @param [out] utf8_buffer `the (sufficiently large) buffer to hold the UTF8 data.`
+ **/
+fn void utf32to8_unsafe(Char32[] utf32, char* utf8_buffer)
+{
+	char* start = utf8_buffer; // NOTE(review): never read below; candidate for removal
+	foreach (Char32 uc : utf32)
+	{
+		char32_to_utf8_unsafe(uc, &utf8_buffer) @inline;
+	}
+}
diff --git a/lib/std/core/str.c3 b/lib/std/core/str.c3
index beb1c5afa..0191986e6 100644
--- a/lib/std/core/str.c3
+++ b/lib/std/core/str.c3
@@ -1,5 +1,4 @@
 module std::core::str;
-
 define ZString = distinct char*;
 define Char32 = uint;
 define Char16 = ushort;
@@ -53,86 +52,10 @@ fault UnicodeResult
 {
 	INVALID_UTF8,
 	INVALID_UTF16,
+	CONVERSION_FAILED,
 }
-/**
- * @param c `The utf32 codepoint to convert`
- * @param [out] output `the resulting buffer`
- *
- * @return `the number of characters written 1-4`
- **/
-fn char char32_to_utf8(Char32 c, char* output)
-{
-	if (c < 0x7f)
-	{
-		output[0] = (char)c;
-		return 1;
-	}
-	if (c < 0x7ff)
-	{
-		output[0] = (char)(0xC0 | c >> 6);
-		output[1] = (char)(0x80 | (c & 0x3F));
-		return 2;
-	}
-	if (c < 0xffff)
-	{
-		output[0] = (char)(0xE0 | c >> 12);
-		output[1] = (char)(0x80 | (c >> 6 & 0x3F));
-		output[2] = (char)(0x80 | (c & 0x3F));
-		return 3;
-	}
-	output[0] = (char)(0xF0 | c >> 18);
-	output[1] = (char)(0x80 | (c >> 12 & 0x3F));
-	output[2] = (char)(0x80 | (c >> 6 & 0x3F));
-	output[3] = (char)(0x80 | (c & 0x3F));
-	return 4;
-}
-fn Char32! utf8CharTo32(char* ptr, int* size)
-{
-	int max_size = *size;
-	if (max_size < 1) return UnicodeResult.INVALID_UTF8!;
-	char c = (ptr++)[0];
-
-	if ((c & 0x80) == 0)
-	{
-		*size = 1;
-		return c;
-	}
-	if ((c & 0xE0) == 0xC0)
-	{
-		if (max_size < 2) return UnicodeResult.INVALID_UTF8!;
-		*size = 2;
-		Char32 uc = (c & 0x1F) << 6;
-		c = *ptr;
-		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-		return uc + c & 0x3F;
-	}
-	if ((c & 0xF0) == 0xE0)
-	{
-		if (max_size < 3) return UnicodeResult.INVALID_UTF8!;
-		*size = 3;
-		Char32 uc = (c & 0x0F) << 12;
-		c = ptr++[0];
-		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-		uc += (c & 0x3F) << 6;
-		c = ptr++[0];
-		if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-		return uc + c & 0x3F;
-	}
-	if (max_size < 4) return UnicodeResult.INVALID_UTF8!;
-	*size = 4;
-	Char32 uc = (c & 0x07) << 18;
-	c = ptr++[0];
-	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-	uc += (c & 0x3F) << 12;
-	c = ptr++[0];
-	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-	uc += (c & 0x3F) << 6;
-	c = ptr++[0];
-	if (c & 0xC0 != 0x80) return UnicodeResult.INVALID_UTF8!;
-	return uc + c & 0x3F;
-}
 fn usize utf8_codepoints(char[] utf8)
 {
@@ -147,78 +70,42 @@ fn usize utf8_codepoints(char[] utf8)
 fn Char32[]! utf8to32(char[] utf8, Allocator allocator = { null, null })
 {
 	if (!allocator.function) allocator = mem::current_allocator();
-	usize len = utf8.len;
-	Char32* data = allocator.alloc((len + 1) * Char32.sizeof)?;
-	usize len32 = 0;
-	for (usize i = 0; i < len;)
-	{
-		int width = (int)min(len - i, 4);
-		Char32 uc = utf8CharTo32(&utf8[i], &width) @inline?;
-		i += width;
-		data[len32++] = uc;
-	}
-	return data[0 .. len32 - 1];
+	usize codepoints = conv::utf8_codepoints(utf8);
+	Char32* data = allocator.alloc(Char32.sizeof * (codepoints + 1))?; // +1 for the terminating zero
+	conv::utf8to32_unsafe(utf8, data)?;
+	data[codepoints] = 0;
+	return data[0..codepoints - 1];
 }
+fn char[] utf32to8(Char32[] utf32, Allocator allocator = { null, null })
+{
+	usize len = conv::utf8len_for_utf32(utf32);
+	if (!allocator.function) allocator = mem::current_allocator();
+	char* data = allocator.alloc(len + 1)!!; // +1 for the terminating zero
+	conv::utf32to8_unsafe(utf32, data);
+	data[len] = 0;
+	return data[0..len - 1];
+}
 fn Char16[]! utf8to16(char[] utf8, Allocator allocator = { null, null })
 {
 	if (!allocator.function) allocator = mem::current_allocator();
-	usize len = utf8.len;
-	Char16* data = allocator.alloc((len + 1) * Char16.sizeof)?;
-	usize len16 = 0;
-	for (usize i = 0; i < len;)
-	{
-		int width = (int)min(len - i, 4);
-		Char32 uc = utf8CharTo32(&utf8[i], &width) @inline?;
-		i += width;
-		if (uc <= 0xFFFF)
-		{
-			data[len16++] = (Char16)uc;
-			continue;
-		}
-		uc -= SURROGATE_OFFSET;
-		Char16 low = (Char16)(SURROGATE_LOW_VALUE | (uc & SURROGATE_CODEPOINT_MASK));
-		uc >>= SURROGATE_BITS;
-		Char16 high = (Char16)(SURROGATE_HIGH_VALUE | (uc & SURROGATE_CODEPOINT_MASK));
-		data[len16++] = high;
-		data[len16++] = low;
-	}
-	return data[0 .. len16 - 1];
+	usize len16 = conv::utf16len_for_utf8(utf8);
+	Char16* data = allocator.alloc((len16 + 1) * Char16.sizeof)?; // +1 for the terminating zero
+	conv::utf8to16_unsafe(utf8, data)?;
+	data[len16] = 0;
+	return data[0..len16 - 1];
 }
 fn char[]! utf16to8(Char16[] utf16, Allocator allocator = { null, null })
 {
 	if (!allocator.function) allocator = mem::current_allocator();
-	String str = string::new_with_capacity(utf16.len * 2 + 1, allocator);
-	usize len = utf16.len;
-	for (usize i = 0; i < len; i++)
-	{
-		Char16 high = utf16[i];
-		if (high & SURROGATE_GENERIC_MASK != SURROGATE_HIGH_VALUE)
-		{
-			str.append_char32(high);
-			continue;
-		}
-		// Low surrogate first is an error
-		if (high & SURROGATE_MASK != SURROGATE_HIGH_VALUE) return UnicodeResult.INVALID_UTF16!;
-		// Unmatched high surrogate is an error
-		if (i == len - 1) return UnicodeResult.INVALID_UTF16!;
-
-		Char16 low = utf16[++i];
-		// Unmatched high surrogate, invalid
-		if (low & SURROGATE_MASK != SURROGATE_LOW_VALUE) return UnicodeResult.INVALID_UTF16!;
-
-		// The high bits of the codepoint are the value bits of the high surrogate
-		// The low bits of the codepoint are the value bits of the low surrogate
-		Char32 uc = (high & SURROGATE_CODEPOINT_MASK) << SURROGATE_BITS | (low & SURROGATE_CODEPOINT_MASK) + SURROGATE_OFFSET;
-		str.append_char32(uc);
-	}
-	usize new_len = str.len();
-	ZString zstr = str.copy_zstr();
-	str.destroy();
-	return zstr[0 .. new_len - 1];
+	usize len = conv::utf8len_for_utf16(utf16);
+	char* data = allocator.alloc(len + 1)?; // +1 for the terminating zero
+	conv::utf16to8_unsafe(utf16, data)?;
+	data[len] = 0; // terminate like utf8to32, utf32to8 and utf8to16 do; the extra byte was allocated but never written
+	return data[0 .. len - 1]; // NOTE(review): for empty input `len - 1` presumably wraps as usize - confirm
 }
 fn char[] copy(char[] s)
diff --git a/lib/std/core/string_iterator.c3 b/lib/std/core/string_iterator.c3
index d3d5a1cd6..a4e5fc663 100644
--- a/lib/std/core/string_iterator.c3
+++ b/lib/std/core/string_iterator.c3
@@ -18,8 +18,8 @@ fn Char32! StringIterator.next(StringIterator* this)
 	usize len = this.utf8.len;
 	usize current = this.current;
 	if (current >= len) return IteratorResult.NO_MORE_ELEMENT!;
-	int read = (int)(len - current < 4 ? len - current : 4);
-	Char32 res = str::utf8CharTo32(&this.utf8[current], &read)?;
+	usize read = (len - current < 4 ? len - current : 4);
+	Char32 res = conv::utf8_to_char32(&this.utf8[current], &read)?;
 	this.current += read;
 	return res;
 }
\ No newline at end of file
diff --git a/src/compiler/sema_decls.c b/src/compiler/sema_decls.c
index 13e012c2a..bc358750a 100644
--- a/src/compiler/sema_decls.c
+++ b/src/compiler/sema_decls.c
@@ -1508,7 +1508,9 @@ static inline bool sema_analyse_doc_header(AstId doc, Decl **params, Decl **extr
 			SEMA_ERROR(&directive->doc_stmt.param, "There is no parameter '%s', did you misspell it?", param_name);
 			return false;
 	NEXT:;
-		bool may_be_pointer = !param->type || type_is_pointer(type_flatten(param->type));
+		Type *type = param->type;
+		if (type) type = type_flatten(type);
+		bool may_be_pointer = !type || type_is_pointer(type);
 		if (directive->doc_stmt.param.by_ref)
 		{
 			if (!may_be_pointer)
@@ -1531,9 +1533,9 @@ static inline bool sema_analyse_doc_header(AstId doc, Decl **params, Decl **extr
 				case PARAM_INOUT:
 					break;
 			}
-			if (!may_be_pointer)
+			if (!may_be_pointer && type->type_kind != TYPE_SUBARRAY)
 			{
-				SEMA_ERROR(directive, "'in', 'out' and 'inout' may only be added to pointers.");
+				SEMA_ERROR(directive, "'in', 'out' and 'inout' may only be added to pointers and subarrays.");
 				return false;
 			}
 			ADDED:;