From 7bd76c973cb1ae59e41e47359c8544e6b58e4375 Mon Sep 17 00:00:00 2001
From: Christoffer Lerno <christoffer@aegik.com>
Date: Fri, 12 Nov 2021 10:06:30 +0100
Subject: [PATCH] Placeholders for builtins. Updated character literal parsing,
 supporting 1-16 characters. More test cases.

---
 src/compiler/compiler_internal.h          |   7 +-
 src/compiler/copying.c                    |   1 +
 src/compiler/enums.h                      |   2 +
 src/compiler/lexer.c                      | 248 ++++++++++++----------
 src/compiler/llvm_codegen_expr.c          |   2 +
 src/compiler/llvm_codegen_stmt.c          |   2 +
 src/compiler/parse_expr.c                 |  32 ++-
 src/compiler/parse_stmt.c                 |   1 +
 src/compiler/sema_casts.c                 |   2 +
 src/compiler/sema_expr.c                  |   4 +
 src/compiler/tokens.c                     |   2 +
 test/test_suite/strings/literal_errors.c3 |  12 +-
 12 files changed, 187 insertions(+), 128 deletions(-)

diff --git a/src/compiler/compiler_internal.h b/src/compiler/compiler_internal.h
index 98367fe9a..3af171fba 100644
--- a/src/compiler/compiler_internal.h
+++ b/src/compiler/compiler_internal.h
@@ -908,6 +908,10 @@ typedef struct
 } ExprLen;
 
 
+typedef struct // Payload of an EXPR_BUILTIN expression, stored in Expr_ as `builtin_expr`.
+{
+	Token identifier; // The token that follows the `$$` (TOKEN_BUILTIN) marker, as recorded by parse_builtin.
+} ExprBuiltin;
 struct Expr_
 {
 	ExprKind expr_kind : 8;
@@ -952,6 +956,7 @@ struct Expr_
 		ExprFuncBlock expr_block;
 		ExprMacroBlock macro_block;
 		Expr** cond_expr;
+		ExprBuiltin builtin_expr;
 	};
 };
 
@@ -1313,7 +1318,7 @@ typedef union
 	};
 	struct
 	{
-		uint64_t char_value;
+		Int128 char_value;
 		char width;
 	};
 } TokenData;
diff --git a/src/compiler/copying.c b/src/compiler/copying.c
index c330d07b3..9d6ff1fa7 100644
--- a/src/compiler/copying.c
+++ b/src/compiler/copying.c
@@ -74,6 +74,7 @@ Expr *copy_expr(Expr *source_expr)
 		case EXPR_FLATPATH:
 		case EXPR_UNDEF:
 		case EXPR_NOP:
+		case EXPR_BUILTIN:
 			return expr;
 		case EXPR_DECL:
 			MACRO_COPY_DECL(expr->decl_expr);
diff --git a/src/compiler/enums.h b/src/compiler/enums.h
index 1c0eaec77..a932396b3 100644
--- a/src/compiler/enums.h
+++ b/src/compiler/enums.h
@@ -173,6 +173,7 @@ typedef enum
 	EXPR_BITACCESS,
 	EXPR_BITASSIGN,
 	EXPR_BINARY,
+	EXPR_BUILTIN,
 	EXPR_MACRO_BODY_EXPANSION,
 	EXPR_CALL,
 	EXPR_CAST,
@@ -332,6 +333,7 @@ typedef enum
 	TOKEN_BIT_XOR_ASSIGN,   // ^=
 	TOKEN_DIV_ASSIGN,       // /=
 	TOKEN_DOTDOT,           // ..
+	TOKEN_BUILTIN,          // $$
 	TOKEN_ELVIS,            // ?:
 	TOKEN_EQEQ,             // ==
 	TOKEN_GREATER_EQ,       // >=
diff --git a/src/compiler/lexer.c b/src/compiler/lexer.c
index 2d043e7ff..7a5614a62 100644
--- a/src/compiler/lexer.c
+++ b/src/compiler/lexer.c
@@ -670,135 +670,140 @@ ERROR:
 	add_error_token(lexer, "Invalid UTF-8 sequence.");
 	return -1;
 }
+
+/**
+ * Rules:
+ * 1. If ASCII or \xAB, accept up to 8 characters (16 with Int128 support), size is char, ushort, uint, ulong, UInt128
+ * 2. If UTF8, accept 1 UTF character, size is ushort normally, uint on wide UTF characters, no additional characters accepted.
+ * 3. If \uABCD, convert to 16 bits, size is ushort, no additional characters accepted.
+ * 4. If \U01234567, convert to 32 bits, size is uint, no additional characters accepted.
+ *
+ * @param lexer the lexer, positioned just after the opening quote of the character literal.
+ * @return true if a TOKEN_CHAR_LITERAL token was added, otherwise false or the result of add_error_token.
+ */
 static inline bool scan_char(Lexer *lexer)
 {
-	int width = 0;
-	uint8_t c;
-	uint64_t b = 0;
-	char has_unicode_escape = 0;
-	bool has_unicode = false;
-	while ((c = next(lexer)) != '\'')
-	{
-		if (has_unicode_escape)
-		{
-			return add_error_token(lexer, "Character literals with '\\%c' can only contain one character.", has_unicode_escape);
-		}
-		if (has_unicode)
-		{
-			return add_error_token(lexer, "A character literal may not contain multiple unicode characters.");
-		}
-		if (c == '\0')
-		{
-			return add_error_token(lexer, "The character literal did not terminate.");
-		}
-		if (width > 7)
-		{
-			width++;
-			continue;
-		}
-		if (c >= 0x80)
-		{
-			if (width != 0)
-			{
-				return add_error_token(lexer, "A multi-character character literal may not contain unicode characters.");
-			}
-			int64_t utf8 = scan_utf8(lexer, c);
-			if (utf8 < 0) return false;
-			has_unicode = true;
-			b += utf8;
-			width = -1;
-			continue;
-		}
-		if (c != '\\')
-		{
-			width++;
-			b <<= 8U;
-			b += (uint8_t)c;
-			continue;
-		}
-		if (c == '\\')
-		{
-			c = next(lexer);
-			const char *start = lexer->current;
-			signed char escape = is_valid_escape(c);
-			if (escape == -1)
-			{
-				lexer->lexing_start = start;
-				return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c);
-			}
-			switch (escape)
-			{
-				case 'x':
-				{
-					int64_t hex = scan_hex_literal(lexer, 2);
-					if (hex < 0)
-					{
-						lexer->lexing_start = start;
-						// Fix underlining if this is an unfinished escape.
-						return add_error_token(lexer, "Expected a two character hex value after \\x.");
-					}
-					width++;
-					b <<= 8U;
-					b += hex;
-					break;
-				}
-				case 'u':
-				{
-					if (width)
-					{
-						return add_error_token(lexer, "Unicode escapes are not allowed in a multi-character literal.");
-					}
-					int64_t hex = scan_hex_literal(lexer, 4);
-					if (hex < 0)
-					{
-						lexer->lexing_start = start;
-						return add_error_token(lexer, "Expected a four character hex value after \\u.");
-					}
-					b <<= 16U;
-					b += hex;
-					width = -1;
-					has_unicode_escape = 'u';
-					break;
-				}
-				case 'U':
-				{
-					if (width)
-					{
-						return add_error_token(lexer, "Unicode escapes are not allowed in a multi-character literal.");
-					}
-					int64_t hex = scan_hex_literal(lexer, 8);
-					if (hex < 0)
-					{
-						lexer->lexing_start = start;
-						return add_error_token(lexer, "Expected an eight character hex value after \\U.");
-					}
-					width = -1;
-					b <<= 32U;
-					b += hex;
-					has_unicode_escape = 'U';
-					break;
-				}
-				default:
-					width++;
-					b <<= 8U;
-					b += (uint8_t)escape;
-			}
-		}
-	}
 
-	if (width == 0)
+	// Handle the problem with zero size character literal first.
+	if (match(lexer, '\''))
 	{
 		return add_error_token(lexer, "The character literal was empty.");
 	}
-	if (width > 2 && width != 4 && width != 8)
+
+	int width = 0;
+	char c;
+	Int128 b = { 0 };
+
+	while ((c = next(lexer)) != '\'')
 	{
-		add_error_token(lexer, "Character literals may only be 1, 2, 4 or 8 characters wide.");
+		// End of file may occur:
+		if (c == '\0') return add_error_token(lexer, "The character literal did not terminate.");
+		// We might exceed the width that we allow.
+		if (width > 15) return add_error_token(lexer, "The character literal exceeds 16 characters.");
+		// Handle (expected) utf-8 characters.
+		if ((unsigned)c >= (unsigned)0x80)
+		{
+			if (width != 0) goto UNICODE_IN_MULTI;
+			const char *start = lexer->current;
+			int64_t utf8 = scan_utf8(lexer, c);
+			if (utf8 < 0) return false;
+			if (!match(lexer, '\''))
+			{
+				if (peek(lexer) == '\0') continue;
+				lexer->lexing_start = start;
+				return add_error_token(lexer, "Unicode character literals may only contain one character, "
+											  "please remove the additional ones or use all ASCII.");
+			}
+			b.low = utf8;
+			width = utf8 > 0xffff ? 4 : 2; // Width 2 unless the decoded value needs more than 16 bits.
+			goto DONE;
+		}
+		// Parse the escape code
+		signed char escape = ' '; // ' ' is the sentinel for "no escape sequence" (see case ' ' below).
+		const char *start = lexer->current;
+		if (c == '\\')
+		{
+			assert(c == '\\');
+			c = next(lexer);
+			escape = is_valid_escape(c);
+			if (escape == -1)
+			{
+				lexer->lexing_start = start - 1;
+				return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c);
+			}
+		}
+		switch (escape)
+		{
+			case 'x':
+			{
+				int64_t hex = scan_hex_literal(lexer, 2);
+				if (hex < 0)
+				{
+					lexer->lexing_start = start - 1;
+					// Fix underlining if this is an unfinished escape.
+					return add_error_token(lexer, "Expected a two character hex value after \\x.");
+				}
+				// We can now reassign c and use the default code.
+				c = hex;
+				break;
+			}
+			case 'u':
+			case 'U':
+			{
+				// First check that we don't have any characters previous to this one.
+				if (width != 0) goto UNICODE_IN_MULTI;
+				int bytes = escape == 'U' ? 4 : 2;
+				int64_t hex = scan_hex_literal(lexer, bytes * 2);
+				// The hex parsing may have failed, lacking more hex chars.
+				if (hex < 0)
+				{
+					lexer->lexing_start = start - 1;
+					return add_error_token(lexer, "Expected %s character hex value after \\%c.",
+										   escape == 'u' ? "a four" : "an eight", escape);
+				}
+				// If we don't see the end here, then something is wrong.
+				if (!match(lexer, '\''))
+				{
+					// It may be the end of the file, if so use the default handling by invoking "continue"
+					if (peek(lexer) == '\0') continue;
+					// Otherwise step forward and mark it as an error.
+					next(lexer);
+					lexer->lexing_start = lexer->current - 1;
+					return add_error_token(lexer,
+					                       "Character literals with '\\%c' can only contain one character, please remove this one.",
+					                       escape);
+				}
+				// Assign the value and go to DONE.
+				b.low = hex;
+				width = bytes;
+				goto DONE;
+			}
+			case ' ':
+				// No escape, a regular character.
+				break;
+			default:
+				c = (unsigned char)escape;
+				break;
+		}
+		// Default handling here:
+		width++;
+		b = i128_shl64(b, 8);
+		b = i128_add64(b, (unsigned char)c);
 	}
 
+	assert(width > 0 && width <= 16);
+	if (width > 8 && !platform_target.int128)
+	{
+		return add_error_token(lexer, "Character literal exceeded 8 characters.");
+	}
+DONE:
 	add_generic_token(lexer, TOKEN_CHAR_LITERAL);
 	lexer->latest_token_data->char_value = b;
-	lexer->latest_token_data->width = width < 0 ? 0 : (char)width;
+	lexer->latest_token_data->width = (char)width;
 	return true;
+
+UNICODE_IN_MULTI:
+	return add_error_token(lexer, "A multi-character literal may not contain unicode characters.");
 }
 
 static inline void skip_first_line_if_empty(Lexer *lexer)
@@ -1616,6 +1621,15 @@ static bool lexer_scan_token_inner(Lexer *lexer, LexMode mode)
 			return scan_ident(lexer, TOKEN_HASH_IDENT, TOKEN_HASH_CONST_IDENT, TOKEN_HASH_TYPE_IDENT, '$');
 		case '$':
 			if (match(lexer, '{')) return add_token(lexer, TOKEN_PLACEHOLDER, "${");
+			if (match(lexer, '$'))
+			{
+				if (is_letter(peek(lexer)))
+				{
+					add_token(lexer, TOKEN_BUILTIN, "$$");
+					return scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST_IDENT, TOKEN_TYPE_IDENT, 0);
+				}
+				return add_error_token(lexer, "Expected a letter after $$.");
+			}
 			return scan_ident(lexer, TOKEN_CT_IDENT, TOKEN_CT_CONST_IDENT, TOKEN_CT_TYPE_IDENT, '$');
 		case ',':
 			return add_token(lexer, TOKEN_COMMA, ",");
diff --git a/src/compiler/llvm_codegen_expr.c b/src/compiler/llvm_codegen_expr.c
index 3e2f03b5b..dfdb64edf 100644
--- a/src/compiler/llvm_codegen_expr.c
+++ b/src/compiler/llvm_codegen_expr.c
@@ -4898,6 +4898,8 @@ void llvm_emit_expr(GenContext *c, BEValue *value, Expr *expr)
 		case EXPR_UNDEF:
 			// Should never reach this.
 			UNREACHABLE
+		case EXPR_BUILTIN:
+			TODO
 		case EXPR_DECL:
 			llvm_emit_local_decl(c, expr->decl_expr);
 			return;
diff --git a/src/compiler/llvm_codegen_stmt.c b/src/compiler/llvm_codegen_stmt.c
index 86b178bf3..eeab68536 100644
--- a/src/compiler/llvm_codegen_stmt.c
+++ b/src/compiler/llvm_codegen_stmt.c
@@ -970,6 +970,8 @@ static bool expr_is_pure(Expr *expr)
 	if (!expr) return true;
 	switch (expr->expr_kind)
 	{
+		case EXPR_BUILTIN:
+			TODO
 		case EXPR_CONST:
 		case EXPR_CONST_IDENTIFIER:
 		case EXPR_IDENTIFIER:
diff --git a/src/compiler/parse_expr.c b/src/compiler/parse_expr.c
index ef87b4a73..998b90ad2 100644
--- a/src/compiler/parse_expr.c
+++ b/src/compiler/parse_expr.c
@@ -924,6 +924,16 @@ static Expr *parse_or_error_expr(Context *context, Expr *left)
 	return else_expr;
 }
 
+static Expr *parse_builtin(Context *context, Expr *left) // Parses `$$` followed by an identifier into an EXPR_BUILTIN; registered in `rules` for TOKEN_BUILTIN.
+{
+	assert(!left && "Had left hand side");
+	Expr *expr = EXPR_NEW_TOKEN(EXPR_BUILTIN, context->tok);
+	advance_and_verify(context, TOKEN_BUILTIN); // Step past the `$$` token itself.
+	expr->builtin_expr.identifier = context->tok; // Record the token after `$$` before it is consumed below.
+	CONSUME_OR(TOKEN_IDENT, poisoned_expr); // NOTE(review): presumably yields poisoned_expr if the current token is not TOKEN_IDENT - confirm against the macro.
+	RANGE_EXTEND_PREV(expr); // NOTE(review): presumably extends expr's source span to the just-consumed token - confirm against the macro.
+	return expr;
+}
 static Expr *parse_placeholder(Context *context, Expr *left)
 {
 	assert(!left && "Had left hand side");
@@ -1253,31 +1263,36 @@ static Expr *parse_char_lit(Context *context, Expr *left)
 	Expr *expr_int = EXPR_NEW_TOKEN(EXPR_CONST, context->tok);
 	expr_int->const_expr.is_character = true;
 	TokenData *data = tokendata_from_id(context->tok.id);
+	expr_int->const_expr.ixx.i = data->char_value;
+	expr_int->const_expr.narrowable = true;
+	expr_int->const_expr.const_kind = CONST_INTEGER;
 	switch (data->width)
 	{
 		case 1:
-			expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U8);
 			expr_int->type = type_char;
+			expr_int->const_expr.ixx.type = TYPE_U8;
 			break;
 		case 2:
-			expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U16);
 			expr_int->type = type_ushort;
+			expr_int->const_expr.ixx.type = TYPE_U16;
 			break;
+		case 3:
 		case 4:
-			expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U32);
 			expr_int->type = type_uint;
+			expr_int->const_expr.ixx.type = TYPE_U32;
 			break;
+		case 5:
+		case 6:
+		case 7:
 		case 8:
-			expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U64);
 			expr_int->type = type_ulong;
+			expr_int->const_expr.ixx.type = TYPE_U64;
 			break;
 		default:
-			expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U64);
-			expr_int->type = type_long;
-			expr_int->const_expr.narrowable = true;
+			expr_int->type = type_u128;
+			expr_int->const_expr.ixx.type = TYPE_U128;
 			break;
 	}
-
 	advance(context);
 	return expr_int;
 }
@@ -1571,6 +1586,7 @@ ParseRule rules[TOKEN_EOF + 1] = {
 		[TOKEN_NULL] = { parse_null, NULL, PREC_NONE },
 		[TOKEN_INTEGER] = { parse_integer, NULL, PREC_NONE },
 		[TOKEN_PLACEHOLDER] = { parse_placeholder, NULL, PREC_NONE },
+		[TOKEN_BUILTIN] = { parse_builtin, NULL, PREC_NONE },
 		[TOKEN_CHAR_LITERAL] = { parse_char_lit, NULL, PREC_NONE },
 		[TOKEN_AT] = { parse_macro_expansion, NULL, PREC_NONE },
 		[TOKEN_STRING] = { parse_string_literal, NULL, PREC_NONE },
diff --git a/src/compiler/parse_stmt.c b/src/compiler/parse_stmt.c
index 5a08dc421..ddb1f821e 100644
--- a/src/compiler/parse_stmt.c
+++ b/src/compiler/parse_stmt.c
@@ -893,6 +893,7 @@ Ast *parse_stmt(Context *context)
 		case TOKEN_TRY:
 		case TOKEN_CATCH:
 		case TOKEN_BYTES:
+		case TOKEN_BUILTIN:
 			return parse_expr_stmt(context);
 		case TOKEN_ASSERT:
 			return parse_assert_stmt(context);
diff --git a/src/compiler/sema_casts.c b/src/compiler/sema_casts.c
index 560255835..9c1f38c43 100644
--- a/src/compiler/sema_casts.c
+++ b/src/compiler/sema_casts.c
@@ -903,6 +903,7 @@ Expr *recursive_may_narrow_float(Expr *expr, Type *type)
 		case EXPR_NOP:
 		case EXPR_LEN:
 		case EXPR_CATCH:
+		case EXPR_BUILTIN:
 			UNREACHABLE
 		case EXPR_POST_UNARY:
 			return recursive_may_narrow_float(expr->unary_expr.expr, type);
@@ -1054,6 +1055,7 @@ Expr *recursive_may_narrow_int(Expr *expr, Type *type)
 		case EXPR_UNDEF:
 		case EXPR_CT_CALL:
 		case EXPR_NOP:
+		case EXPR_BUILTIN:
 			UNREACHABLE
 		case EXPR_POST_UNARY:
 			return recursive_may_narrow_int(expr->unary_expr.expr, type);
diff --git a/src/compiler/sema_expr.c b/src/compiler/sema_expr.c
index 789681350..8a029bbd5 100644
--- a/src/compiler/sema_expr.c
+++ b/src/compiler/sema_expr.c
@@ -273,6 +273,8 @@ bool expr_is_constant_eval(Expr *expr, ConstantEvalKind eval_kind)
 	RETRY:
 	switch (expr->expr_kind)
 	{
+		case EXPR_BUILTIN:
+			return false;
 		case EXPR_BITACCESS:
 		case EXPR_ACCESS:
 			expr = expr->access_expr.parent;
@@ -6374,6 +6376,8 @@ static inline bool sema_analyse_expr_dispatch(Context *context, Expr *expr)
 			if (!sema_analyse_var_decl(context, expr->decl_expr, true)) return false;
 			expr->type = expr->decl_expr->type;
 			return true;
+		case EXPR_BUILTIN:
+			TODO
 		case EXPR_CT_CALL:
 			return sema_expr_analyse_ct_call(context, expr);
 		case EXPR_HASH_IDENT:
diff --git a/src/compiler/tokens.c b/src/compiler/tokens.c
index 46135161a..825751821 100644
--- a/src/compiler/tokens.c
+++ b/src/compiler/tokens.c
@@ -80,6 +80,8 @@ const char *token_type_to_string(TokenType type)
 			return "|=";
 		case TOKEN_BIT_XOR_ASSIGN:
 			return "^=";
+		case TOKEN_BUILTIN:
+			return "$$";
 		case TOKEN_DIV_ASSIGN:
 			return "/=";
 		case TOKEN_DOTDOT:
diff --git a/test/test_suite/strings/literal_errors.c3 b/test/test_suite/strings/literal_errors.c3
index f7534246a..876b15195 100644
--- a/test/test_suite/strings/literal_errors.c3
+++ b/test/test_suite/strings/literal_errors.c3
@@ -1,3 +1,5 @@
+// #target: x64-darwin
+
 char bar = '\xaf';
 char bar = '\x0F';
 
@@ -8,7 +10,13 @@ char eofk = '\u233'; // #error: Expected a four char
 
 char zab = '\Uaokdokok'; // #error: Expected an eight
 char zab = '\Uaokdooekfoekfekfkeofkekok'; // #error: Expected an eight
-char eofk = '\UaUfko'; // #error: Expected an eight
+char eofkq = '\UaUfko'; // #error: Expected an eight
 
-char foo = ' // #error: The character literal did not terminate
+char x1 = '\u0023a'; // #error: Character literals with '\u' can only contain one character
+char x2 = '\U00000023a'; // #error: Character literals with '\U' can only contain one character
 
+char x = 'äö'; // #error: may only contain one character
+char feokf = '\9'; // #error: Invalid escape sequence '\9'
+char fje = '123456789012345678'; // #error: The character literal exceeds 16 characters.
+
+char foekfe = ''; // #error: The character literal was empty
\ No newline at end of file