From 7bd76c973cb1ae59e41e47359c8544e6b58e4375 Mon Sep 17 00:00:00 2001 From: Christoffer Lerno Date: Fri, 12 Nov 2021 10:06:30 +0100 Subject: [PATCH] Placeholders for builtins. Updated character literal parsing, supporting 1-16 characters. More test cases. --- src/compiler/compiler_internal.h | 7 +- src/compiler/copying.c | 1 + src/compiler/enums.h | 2 + src/compiler/lexer.c | 248 ++++++++++++---------- src/compiler/llvm_codegen_expr.c | 2 + src/compiler/llvm_codegen_stmt.c | 2 + src/compiler/parse_expr.c | 32 ++- src/compiler/parse_stmt.c | 1 + src/compiler/sema_casts.c | 2 + src/compiler/sema_expr.c | 4 + src/compiler/tokens.c | 2 + test/test_suite/strings/literal_errors.c3 | 12 +- 12 files changed, 187 insertions(+), 128 deletions(-) diff --git a/src/compiler/compiler_internal.h b/src/compiler/compiler_internal.h index 98367fe9a..3af171fba 100644 --- a/src/compiler/compiler_internal.h +++ b/src/compiler/compiler_internal.h @@ -908,6 +908,10 @@ typedef struct } ExprLen; +typedef struct +{ + Token identifier; +} ExprBuiltin; struct Expr_ { ExprKind expr_kind : 8; @@ -952,6 +956,7 @@ struct Expr_ ExprFuncBlock expr_block; ExprMacroBlock macro_block; Expr** cond_expr; + ExprBuiltin builtin_expr; }; }; @@ -1313,7 +1318,7 @@ typedef union }; struct { - uint64_t char_value; + Int128 char_value; char width; }; } TokenData; diff --git a/src/compiler/copying.c b/src/compiler/copying.c index c330d07b3..9d6ff1fa7 100644 --- a/src/compiler/copying.c +++ b/src/compiler/copying.c @@ -74,6 +74,7 @@ Expr *copy_expr(Expr *source_expr) case EXPR_FLATPATH: case EXPR_UNDEF: case EXPR_NOP: + case EXPR_BUILTIN: return expr; case EXPR_DECL: MACRO_COPY_DECL(expr->decl_expr); diff --git a/src/compiler/enums.h b/src/compiler/enums.h index 1c0eaec77..a932396b3 100644 --- a/src/compiler/enums.h +++ b/src/compiler/enums.h @@ -173,6 +173,7 @@ typedef enum EXPR_BITACCESS, EXPR_BITASSIGN, EXPR_BINARY, + EXPR_BUILTIN, EXPR_MACRO_BODY_EXPANSION, EXPR_CALL, EXPR_CAST, @@ -332,6 +333,7 @@ typedef 
enum TOKEN_BIT_XOR_ASSIGN, // ^= TOKEN_DIV_ASSIGN, // /= TOKEN_DOTDOT, // .. + TOKEN_BUILTIN, // $$ TOKEN_ELVIS, // ?: TOKEN_EQEQ, // == TOKEN_GREATER_EQ, // >= diff --git a/src/compiler/lexer.c b/src/compiler/lexer.c index 2d043e7ff..7a5614a62 100644 --- a/src/compiler/lexer.c +++ b/src/compiler/lexer.c @@ -670,135 +670,140 @@ ERROR: add_error_token(lexer, "Invalid UTF-8 sequence."); return -1; } + +/** + * Rules: + * 1. If ASCII or \xAB, accept up to 8 characters (16 with Int128 support), size is char, ushort, uint, ulong, UInt128 + * 2. If UTF8, accept 1 UTF character, size is ushort normally, ulong on wide UTF characters, no additional characters accepted. + * 3. If \uABCD, convert to 16 bits, size is ushort, no additional characters accepted. + * 4. If \U01234567, convert to 32 bits, size is uint, no additional characters accepted. + * + * @param lexer + * @return + */ static inline bool scan_char(Lexer *lexer) { - int width = 0; - uint8_t c; - uint64_t b = 0; - char has_unicode_escape = 0; - bool has_unicode = false; - while ((c = next(lexer)) != '\'') - { - if (has_unicode_escape) - { - return add_error_token(lexer, "Character literals with '\\%c' can only contain one character.", has_unicode_escape); - } - if (has_unicode) - { - return add_error_token(lexer, "A character literal may not contain multiple unicode characters."); - } - if (c == '\0') - { - return add_error_token(lexer, "The character literal did not terminate."); - } - if (width > 7) - { - width++; - continue; - } - if (c >= 0x80) - { - if (width != 0) - { - return add_error_token(lexer, "A multi-character character literal may not contain unicode characters."); - } - int64_t utf8 = scan_utf8(lexer, c); - if (utf8 < 0) return false; - has_unicode = true; - b += utf8; - width = -1; - continue; - } - if (c != '\\') - { - width++; - b <<= 8U; - b += (uint8_t)c; - continue; - } - if (c == '\\') - { - c = next(lexer); - const char *start = lexer->current; - signed char escape = is_valid_escape(c); - 
if (escape == -1) - { - lexer->lexing_start = start; - return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c); - } - switch (escape) - { - case 'x': - { - int64_t hex = scan_hex_literal(lexer, 2); - if (hex < 0) - { - lexer->lexing_start = start; - // Fix underlining if this is an unfinished escape. - return add_error_token(lexer, "Expected a two character hex value after \\x."); - } - width++; - b <<= 8U; - b += hex; - break; - } - case 'u': - { - if (width) - { - return add_error_token(lexer, "Unicode escapes are not allowed in a multi-character literal."); - } - int64_t hex = scan_hex_literal(lexer, 4); - if (hex < 0) - { - lexer->lexing_start = start; - return add_error_token(lexer, "Expected a four character hex value after \\u."); - } - b <<= 16U; - b += hex; - width = -1; - has_unicode_escape = 'u'; - break; - } - case 'U': - { - if (width) - { - return add_error_token(lexer, "Unicode escapes are not allowed in a multi-character literal."); - } - int64_t hex = scan_hex_literal(lexer, 8); - if (hex < 0) - { - lexer->lexing_start = start; - return add_error_token(lexer, "Expected an eight character hex value after \\U."); - } - width = -1; - b <<= 32U; - b += hex; - has_unicode_escape = 'U'; - break; - } - default: - width++; - b <<= 8U; - b += (uint8_t)escape; - } - } - } - if (width == 0) + // Handle the problem with zero size character literal first. + if (match(lexer, '\'')) { return add_error_token(lexer, "The character literal was empty."); } - if (width > 2 && width != 4 && width != 8) + + int width = 0; + char c; + Int128 b = {}; + + while ((c = next(lexer)) != '\'') { - add_error_token(lexer, "Character literals may only be 1, 2, 4 or 8 characters wide."); + // End of file may occur: + if (c == '\0') return add_error_token(lexer, "The character literal did not terminate."); + // We might exceed the width that we allow. 
+ if (width > 15) return add_error_token(lexer, "The character literal exceeds 16 characters."); + // Handle (expected) utf-8 characters. + if ((unsigned)c >= (unsigned)0x80) + { + if (width != 0) goto UNICODE_IN_MULTI; + const char *start = lexer->current; + int64_t utf8 = scan_utf8(lexer, c); + if (utf8 < 0) return false; + if (!match(lexer, '\'')) + { + if (peek(lexer) == '\0') continue; + lexer->lexing_start = start; + return add_error_token(lexer, "Unicode character literals may only contain one character, " + "please remove the additional ones or use all ASCII."); + } + b.low = utf8; + width = utf8 > 0xffff ? 4 : 2; + goto DONE; + } + // Parse the escape code + signed char escape = ' '; + const char *start = lexer->current; + if (c == '\\') + { + assert(c == '\\'); + c = next(lexer); + escape = is_valid_escape(c); + if (escape == -1) + { + lexer->lexing_start = start - 1; + return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c); + } + } + switch (escape) + { + case 'x': + { + int64_t hex = scan_hex_literal(lexer, 2); + if (hex < 0) + { + lexer->lexing_start = start - 1; + // Fix underlining if this is an unfinished escape. + return add_error_token(lexer, "Expected a two character hex value after \\x."); + } + // We can now reassign c and use the default code. + c = hex; + break; + } + case 'u': + case 'U': + { + // First check that we don't have any characters previous to this one. + if (width != 0) goto UNICODE_IN_MULTI; + int bytes = escape == 'U' ? 4 : 2; + int64_t hex = scan_hex_literal(lexer, bytes * 2); + // The hex parsing may have failed, lacking more hex chars. + if (hex < 0) + { + lexer->lexing_start = start - 1; + return add_error_token(lexer, "Expected %s character hex value after \\%c.", + escape == 'u' ? "a four" : "an eight", escape); + } + // If we don't see the end here, then something is wrong. 
+ if (!match(lexer, '\'')) + { + // It may be the end of the line, if so use the default handling by invoking "continue" + if (peek(lexer) == '\0') continue; + // Otherwise step forward and mark it as an error. + next(lexer); + lexer->lexing_start = lexer->current - 1; + return add_error_token(lexer, + "Character literals with '\\%c' can only contain one character, please remove this one.", + escape); + } + // Assign the value and go to DONE. + b.low = hex; + width = bytes; + goto DONE; + } + case ' ': + // No escape, a regular character. + break; + default: + c = (unsigned char)escape; + break; + } + // Default handling here: + width++; + b = i128_shl64(b, 8); + b = i128_add64(b, (unsigned char)c); } + assert(width > 0 && width <= 16); + if (width > 8 && !platform_target.int128) + { + return add_error_token(lexer, "Character literal exceeded 8 characters."); + } +DONE: add_generic_token(lexer, TOKEN_CHAR_LITERAL); lexer->latest_token_data->char_value = b; - lexer->latest_token_data->width = width < 0 ? 
0 : (char)width; + lexer->latest_token_data->width = (char)width; return true; + +UNICODE_IN_MULTI: + return add_error_token(lexer, "A multi-character literal may not contain unicode characters."); } static inline void skip_first_line_if_empty(Lexer *lexer) @@ -1616,6 +1621,15 @@ static bool lexer_scan_token_inner(Lexer *lexer, LexMode mode) return scan_ident(lexer, TOKEN_HASH_IDENT, TOKEN_HASH_CONST_IDENT, TOKEN_HASH_TYPE_IDENT, '$'); case '$': if (match(lexer, '{')) return add_token(lexer, TOKEN_PLACEHOLDER, "${"); + if (match(lexer, '$')) + { + if (is_letter(peek(lexer))) + { + add_token(lexer, TOKEN_BUILTIN, "$$"); + return scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST_IDENT, TOKEN_TYPE_IDENT, 0); + } + return add_error_token(lexer, "Expected a letter after $$."); + } return scan_ident(lexer, TOKEN_CT_IDENT, TOKEN_CT_CONST_IDENT, TOKEN_CT_TYPE_IDENT, '$'); case ',': return add_token(lexer, TOKEN_COMMA, ","); diff --git a/src/compiler/llvm_codegen_expr.c b/src/compiler/llvm_codegen_expr.c index 3e2f03b5b..dfdb64edf 100644 --- a/src/compiler/llvm_codegen_expr.c +++ b/src/compiler/llvm_codegen_expr.c @@ -4898,6 +4898,8 @@ void llvm_emit_expr(GenContext *c, BEValue *value, Expr *expr) case EXPR_UNDEF: // Should never reach this. 
UNREACHABLE + case EXPR_BUILTIN: + TODO case EXPR_DECL: llvm_emit_local_decl(c, expr->decl_expr); return; diff --git a/src/compiler/llvm_codegen_stmt.c b/src/compiler/llvm_codegen_stmt.c index 86b178bf3..eeab68536 100644 --- a/src/compiler/llvm_codegen_stmt.c +++ b/src/compiler/llvm_codegen_stmt.c @@ -970,6 +970,8 @@ static bool expr_is_pure(Expr *expr) if (!expr) return true; switch (expr->expr_kind) { + case EXPR_BUILTIN: + TODO case EXPR_CONST: case EXPR_CONST_IDENTIFIER: case EXPR_IDENTIFIER: diff --git a/src/compiler/parse_expr.c b/src/compiler/parse_expr.c index ef87b4a73..998b90ad2 100644 --- a/src/compiler/parse_expr.c +++ b/src/compiler/parse_expr.c @@ -924,6 +924,16 @@ static Expr *parse_or_error_expr(Context *context, Expr *left) return else_expr; } +static Expr *parse_builtin(Context *context, Expr *left) +{ + assert(!left && "Had left hand side"); + Expr *expr = EXPR_NEW_TOKEN(EXPR_BUILTIN, context->tok); + advance_and_verify(context, TOKEN_BUILTIN); + expr->builtin_expr.identifier = context->tok; + CONSUME_OR(TOKEN_IDENT, poisoned_expr); + RANGE_EXTEND_PREV(expr); + return expr; +} static Expr *parse_placeholder(Context *context, Expr *left) { assert(!left && "Had left hand side"); @@ -1253,31 +1263,36 @@ static Expr *parse_char_lit(Context *context, Expr *left) Expr *expr_int = EXPR_NEW_TOKEN(EXPR_CONST, context->tok); expr_int->const_expr.is_character = true; TokenData *data = tokendata_from_id(context->tok.id); + expr_int->const_expr.ixx.i = data->char_value; + expr_int->const_expr.narrowable = true; + expr_int->const_expr.const_kind = CONST_INTEGER; switch (data->width) { case 1: - expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U8); expr_int->type = type_char; + expr_int->const_expr.ixx.type = TYPE_U8; break; case 2: - expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U16); expr_int->type = type_ushort; + expr_int->const_expr.ixx.type = TYPE_U16; break; + case 3: case 4: - expr_const_set_int(&expr_int->const_expr, 
data->char_value, TYPE_U32); expr_int->type = type_uint; + expr_int->const_expr.ixx.type = TYPE_U32; break; + case 5: + case 6: + case 7: case 8: - expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U64); expr_int->type = type_ulong; + expr_int->const_expr.ixx.type = TYPE_U64; break; default: - expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U64); - expr_int->type = type_long; - expr_int->const_expr.narrowable = true; + expr_int->type = type_u128; + expr_int->const_expr.ixx.type = TYPE_U128; break; } - advance(context); return expr_int; } @@ -1571,6 +1586,7 @@ ParseRule rules[TOKEN_EOF + 1] = { [TOKEN_NULL] = { parse_null, NULL, PREC_NONE }, [TOKEN_INTEGER] = { parse_integer, NULL, PREC_NONE }, [TOKEN_PLACEHOLDER] = { parse_placeholder, NULL, PREC_NONE }, + [TOKEN_BUILTIN] = { parse_builtin, NULL, PREC_NONE }, [TOKEN_CHAR_LITERAL] = { parse_char_lit, NULL, PREC_NONE }, [TOKEN_AT] = { parse_macro_expansion, NULL, PREC_NONE }, [TOKEN_STRING] = { parse_string_literal, NULL, PREC_NONE }, diff --git a/src/compiler/parse_stmt.c b/src/compiler/parse_stmt.c index 5a08dc421..ddb1f821e 100644 --- a/src/compiler/parse_stmt.c +++ b/src/compiler/parse_stmt.c @@ -893,6 +893,7 @@ Ast *parse_stmt(Context *context) case TOKEN_TRY: case TOKEN_CATCH: case TOKEN_BYTES: + case TOKEN_BUILTIN: return parse_expr_stmt(context); case TOKEN_ASSERT: return parse_assert_stmt(context); diff --git a/src/compiler/sema_casts.c b/src/compiler/sema_casts.c index 560255835..9c1f38c43 100644 --- a/src/compiler/sema_casts.c +++ b/src/compiler/sema_casts.c @@ -903,6 +903,7 @@ Expr *recursive_may_narrow_float(Expr *expr, Type *type) case EXPR_NOP: case EXPR_LEN: case EXPR_CATCH: + case EXPR_BUILTIN: UNREACHABLE case EXPR_POST_UNARY: return recursive_may_narrow_float(expr->unary_expr.expr, type); @@ -1054,6 +1055,7 @@ Expr *recursive_may_narrow_int(Expr *expr, Type *type) case EXPR_UNDEF: case EXPR_CT_CALL: case EXPR_NOP: + case EXPR_BUILTIN: UNREACHABLE case 
EXPR_POST_UNARY: return recursive_may_narrow_int(expr->unary_expr.expr, type); diff --git a/src/compiler/sema_expr.c b/src/compiler/sema_expr.c index 789681350..8a029bbd5 100644 --- a/src/compiler/sema_expr.c +++ b/src/compiler/sema_expr.c @@ -273,6 +273,8 @@ bool expr_is_constant_eval(Expr *expr, ConstantEvalKind eval_kind) RETRY: switch (expr->expr_kind) { + case EXPR_BUILTIN: + return false; case EXPR_BITACCESS: case EXPR_ACCESS: expr = expr->access_expr.parent; @@ -6374,6 +6376,8 @@ static inline bool sema_analyse_expr_dispatch(Context *context, Expr *expr) if (!sema_analyse_var_decl(context, expr->decl_expr, true)) return false; expr->type = expr->decl_expr->type; return true; + case EXPR_BUILTIN: + TODO case EXPR_CT_CALL: return sema_expr_analyse_ct_call(context, expr); case EXPR_HASH_IDENT: diff --git a/src/compiler/tokens.c b/src/compiler/tokens.c index 46135161a..825751821 100644 --- a/src/compiler/tokens.c +++ b/src/compiler/tokens.c @@ -80,6 +80,8 @@ const char *token_type_to_string(TokenType type) return "|="; case TOKEN_BIT_XOR_ASSIGN: return "^="; + case TOKEN_BUILTIN: + return "$$"; case TOKEN_DIV_ASSIGN: return "/="; case TOKEN_DOTDOT: diff --git a/test/test_suite/strings/literal_errors.c3 b/test/test_suite/strings/literal_errors.c3 index f7534246a..876b15195 100644 --- a/test/test_suite/strings/literal_errors.c3 +++ b/test/test_suite/strings/literal_errors.c3 @@ -1,3 +1,5 @@ +// #target: x64-darwin + char bar = '\xaf'; char bar = '\x0F'; @@ -8,7 +10,13 @@ char eofk = '\u233'; // #error: Expected a four char char zab = '\Uaokdokok'; // #error: Expected an eight char zab = '\Uaokdooekfoekfekfkeofkekok'; // #error: Expected an eight -char eofk = '\UaUfko'; // #error: Expected an eight +char eofkq = '\UaUfko'; // #error: Expected an eight -char foo = ' // #error: The character literal did not terminate +char x1 = '\u0023a'; // #error: Character literals with '\u' can only contain one character +char x2 = '\U00000023a'; // #error: Character literals 
with '\U' can only contain one character +char x = 'äö'; // #error: may only contain one character +char feokf = '\9'; // #error: Invalid escape sequence '\9' +char fje = '123456789012345678'; // #error: The character literal exceeds 16 characters. + +char foekfe = ''; // #error: The character literal was empty \ No newline at end of file