Placeholders for builtins. Updated character literal parsing, supporting 1-16 characters. More test cases.

This commit is contained in:
Christoffer Lerno
2021-11-12 10:06:30 +01:00
committed by Christoffer Lerno
parent 42465039e9
commit 7bd76c973c
12 changed files with 187 additions and 128 deletions

View File

@@ -908,6 +908,10 @@ typedef struct
} ExprLen;
// Payload of an EXPR_BUILTIN expression: the identifier token that follows "$$".
typedef struct
{
Token identifier;
} ExprBuiltin;
struct Expr_
{
ExprKind expr_kind : 8;
@@ -952,6 +956,7 @@ struct Expr_
ExprFuncBlock expr_block;
ExprMacroBlock macro_block;
Expr** cond_expr;
ExprBuiltin builtin_expr;
};
};
@@ -1313,7 +1318,7 @@ typedef union
};
struct
{
uint64_t char_value;
Int128 char_value;
char width;
};
} TokenData;

View File

@@ -74,6 +74,7 @@ Expr *copy_expr(Expr *source_expr)
case EXPR_FLATPATH:
case EXPR_UNDEF:
case EXPR_NOP:
case EXPR_BUILTIN:
return expr;
case EXPR_DECL:
MACRO_COPY_DECL(expr->decl_expr);

View File

@@ -173,6 +173,7 @@ typedef enum
EXPR_BITACCESS,
EXPR_BITASSIGN,
EXPR_BINARY,
EXPR_BUILTIN,
EXPR_MACRO_BODY_EXPANSION,
EXPR_CALL,
EXPR_CAST,
@@ -332,6 +333,7 @@ typedef enum
TOKEN_BIT_XOR_ASSIGN, // ^=
TOKEN_DIV_ASSIGN, // /=
TOKEN_DOTDOT, // ..
TOKEN_BUILTIN, // $$
TOKEN_ELVIS, // ?:
TOKEN_EQEQ, // ==
TOKEN_GREATER_EQ, // >=

View File

@@ -670,135 +670,140 @@ ERROR:
add_error_token(lexer, "Invalid UTF-8 sequence.");
return -1;
}
/**
* Rules:
* 1. If ASCII or \xAB, accept up to 8 characters (16 with Int128 support), size is char, ushort, uint, ulong, UInt128
* 2. If UTF8, accept 1 UTF character, size is ushort normally, uint on wide UTF characters, no additional characters accepted.
* 3. If \uABCD, convert to 16 bits, size is ushort, no additional characters accepted.
* 4. If \U01234567, convert to 32 bits, size is uint, no additional characters accepted.
*
* @param lexer
* @return
*/
static inline bool scan_char(Lexer *lexer)
{
int width = 0;
uint8_t c;
uint64_t b = 0;
char has_unicode_escape = 0;
bool has_unicode = false;
while ((c = next(lexer)) != '\'')
{
if (has_unicode_escape)
{
return add_error_token(lexer, "Character literals with '\\%c' can only contain one character.", has_unicode_escape);
}
if (has_unicode)
{
return add_error_token(lexer, "A character literal may not contain multiple unicode characters.");
}
if (c == '\0')
{
return add_error_token(lexer, "The character literal did not terminate.");
}
if (width > 7)
{
width++;
continue;
}
if (c >= 0x80)
{
if (width != 0)
{
return add_error_token(lexer, "A multi-character character literal may not contain unicode characters.");
}
int64_t utf8 = scan_utf8(lexer, c);
if (utf8 < 0) return false;
has_unicode = true;
b += utf8;
width = -1;
continue;
}
if (c != '\\')
{
width++;
b <<= 8U;
b += (uint8_t)c;
continue;
}
if (c == '\\')
{
c = next(lexer);
const char *start = lexer->current;
signed char escape = is_valid_escape(c);
if (escape == -1)
{
lexer->lexing_start = start;
return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c);
}
switch (escape)
{
case 'x':
{
int64_t hex = scan_hex_literal(lexer, 2);
if (hex < 0)
{
lexer->lexing_start = start;
// Fix underlining if this is an unfinished escape.
return add_error_token(lexer, "Expected a two character hex value after \\x.");
}
width++;
b <<= 8U;
b += hex;
break;
}
case 'u':
{
if (width)
{
return add_error_token(lexer, "Unicode escapes are not allowed in a multi-character literal.");
}
int64_t hex = scan_hex_literal(lexer, 4);
if (hex < 0)
{
lexer->lexing_start = start;
return add_error_token(lexer, "Expected a four character hex value after \\u.");
}
b <<= 16U;
b += hex;
width = -1;
has_unicode_escape = 'u';
break;
}
case 'U':
{
if (width)
{
return add_error_token(lexer, "Unicode escapes are not allowed in a multi-character literal.");
}
int64_t hex = scan_hex_literal(lexer, 8);
if (hex < 0)
{
lexer->lexing_start = start;
return add_error_token(lexer, "Expected an eight character hex value after \\U.");
}
width = -1;
b <<= 32U;
b += hex;
has_unicode_escape = 'U';
break;
}
default:
width++;
b <<= 8U;
b += (uint8_t)escape;
}
}
}
if (width == 0)
// Handle the problem with zero size character literal first.
if (match(lexer, '\''))
{
return add_error_token(lexer, "The character literal was empty.");
}
if (width > 2 && width != 4 && width != 8)
int width = 0;
char c;
Int128 b = {};
while ((c = next(lexer)) != '\'')
{
add_error_token(lexer, "Character literals may only be 1, 2, 4 or 8 characters wide.");
// End of file may occur:
if (c == '\0') return add_error_token(lexer, "The character literal did not terminate.");
// We might exceed the width that we allow.
if (width > 15) return add_error_token(lexer, "The character literal exceeds 16 characters.");
// Handle (expected) utf-8 characters.
if ((unsigned)c >= (unsigned)0x80)
{
if (width != 0) goto UNICODE_IN_MULTI;
const char *start = lexer->current;
int64_t utf8 = scan_utf8(lexer, c);
if (utf8 < 0) return false;
if (!match(lexer, '\''))
{
if (peek(lexer) == '\0') continue;
lexer->lexing_start = start;
return add_error_token(lexer, "Unicode character literals may only contain one character, "
"please remove the additional ones or use all ASCII.");
}
b.low = utf8;
width = utf8 > 0xffff ? 4 : 2;
goto DONE;
}
// Parse the escape code
signed char escape = ' ';
const char *start = lexer->current;
if (c == '\\')
{
assert(c == '\\');
c = next(lexer);
escape = is_valid_escape(c);
if (escape == -1)
{
lexer->lexing_start = start - 1;
return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c);
}
}
switch (escape)
{
case 'x':
{
int64_t hex = scan_hex_literal(lexer, 2);
if (hex < 0)
{
lexer->lexing_start = start - 1;
// Fix underlining if this is an unfinished escape.
return add_error_token(lexer, "Expected a two character hex value after \\x.");
}
// We can now reassign c and use the default code.
c = hex;
break;
}
case 'u':
case 'U':
{
// First check that we don't have any characters previous to this one.
if (width != 0) goto UNICODE_IN_MULTI;
int bytes = escape == 'U' ? 4 : 2;
int64_t hex = scan_hex_literal(lexer, bytes * 2);
// The hex parsing may have failed, lacking more hex chars.
if (hex < 0)
{
lexer->lexing_start = start - 1;
return add_error_token(lexer, "Expected %s character hex value after \\%c.",
escape == 'u' ? "a four" : "an eight", escape);
}
// If we don't see the end here, then something is wrong.
if (!match(lexer, '\''))
{
// It may be the end of the line, if so use the default handling by invoking "continue"
if (peek(lexer) == '\0') continue;
// Otherwise step forward and mark it as an error.
next(lexer);
lexer->lexing_start = lexer->current - 1;
return add_error_token(lexer,
"Character literals with '\\%c' can only contain one character, please remove this one.",
escape);
}
// Assign the value and go to DONE.
b.low = hex;
width = bytes;
goto DONE;
}
case ' ':
// No escape, a regular character.
break;
default:
c = (unsigned char)escape;
break;
}
// Default handling here:
width++;
b = i128_shl64(b, 8);
b = i128_add64(b, (unsigned char)c);
}
assert(width > 0 && width <= 16);
if (width > 8 && !platform_target.int128)
{
return add_error_token(lexer, "Character literal exceeded 8 characters.");
}
DONE:
add_generic_token(lexer, TOKEN_CHAR_LITERAL);
lexer->latest_token_data->char_value = b;
lexer->latest_token_data->width = width < 0 ? 0 : (char)width;
lexer->latest_token_data->width = (char)width;
return true;
UNICODE_IN_MULTI:
return add_error_token(lexer, "A multi-character literal may not contain unicode characters.");
}
static inline void skip_first_line_if_empty(Lexer *lexer)
@@ -1616,6 +1621,15 @@ static bool lexer_scan_token_inner(Lexer *lexer, LexMode mode)
return scan_ident(lexer, TOKEN_HASH_IDENT, TOKEN_HASH_CONST_IDENT, TOKEN_HASH_TYPE_IDENT, '$');
case '$':
if (match(lexer, '{')) return add_token(lexer, TOKEN_PLACEHOLDER, "${");
if (match(lexer, '$'))
{
if (is_letter(peek(lexer)))
{
add_token(lexer, TOKEN_BUILTIN, "$$");
return scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST_IDENT, TOKEN_TYPE_IDENT, 0);
}
return add_error_token(lexer, "Expected a letter after $$.");
}
return scan_ident(lexer, TOKEN_CT_IDENT, TOKEN_CT_CONST_IDENT, TOKEN_CT_TYPE_IDENT, '$');
case ',':
return add_token(lexer, TOKEN_COMMA, ",");

View File

@@ -4898,6 +4898,8 @@ void llvm_emit_expr(GenContext *c, BEValue *value, Expr *expr)
case EXPR_UNDEF:
// Should never reach this.
UNREACHABLE
case EXPR_BUILTIN:
TODO
case EXPR_DECL:
llvm_emit_local_decl(c, expr->decl_expr);
return;

View File

@@ -970,6 +970,8 @@ static bool expr_is_pure(Expr *expr)
if (!expr) return true;
switch (expr->expr_kind)
{
case EXPR_BUILTIN:
TODO
case EXPR_CONST:
case EXPR_CONST_IDENTIFIER:
case EXPR_IDENTIFIER:

View File

@@ -924,6 +924,16 @@ static Expr *parse_or_error_expr(Context *context, Expr *left)
return else_expr;
}
/**
 * Parse a builtin reference: a TOKEN_BUILTIN ("$$") followed by an identifier.
 *
 * @param context the parse context; context->tok is expected to be TOKEN_BUILTIN on entry.
 * @param left must be NULL (asserted), a builtin never has a left hand side.
 * @return the new EXPR_BUILTIN expression, or poisoned_expr when no TOKEN_IDENT follows
 *         (presumably through an early return inside CONSUME_OR - confirm against the macro).
 */
static Expr *parse_builtin(Context *context, Expr *left)
{
assert(!left && "Had left hand side");
// The expression is created from the "$$" token itself.
Expr *expr = EXPR_NEW_TOKEN(EXPR_BUILTIN, context->tok);
advance_and_verify(context, TOKEN_BUILTIN);
// Record the token after "$$" before it is consumed; only a TOKEN_IDENT is accepted below.
expr->builtin_expr.identifier = context->tok;
CONSUME_OR(TOKEN_IDENT, poisoned_expr);
// NOTE(review): presumably widens the expression's source range to cover the identifier - confirm.
RANGE_EXTEND_PREV(expr);
return expr;
}
static Expr *parse_placeholder(Context *context, Expr *left)
{
assert(!left && "Had left hand side");
@@ -1253,31 +1263,36 @@ static Expr *parse_char_lit(Context *context, Expr *left)
Expr *expr_int = EXPR_NEW_TOKEN(EXPR_CONST, context->tok);
expr_int->const_expr.is_character = true;
TokenData *data = tokendata_from_id(context->tok.id);
expr_int->const_expr.ixx.i = data->char_value;
expr_int->const_expr.narrowable = true;
expr_int->const_expr.const_kind = CONST_INTEGER;
switch (data->width)
{
case 1:
expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U8);
expr_int->type = type_char;
expr_int->const_expr.ixx.type = TYPE_U8;
break;
case 2:
expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U16);
expr_int->type = type_ushort;
expr_int->const_expr.ixx.type = TYPE_U16;
break;
case 3:
case 4:
expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U32);
expr_int->type = type_uint;
expr_int->const_expr.ixx.type = TYPE_U32;
break;
case 5:
case 6:
case 7:
case 8:
expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U64);
expr_int->type = type_ulong;
expr_int->const_expr.ixx.type = TYPE_U64;
break;
default:
expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U64);
expr_int->type = type_long;
expr_int->const_expr.narrowable = true;
expr_int->type = type_u128;
expr_int->const_expr.ixx.type = TYPE_U128;
break;
}
advance(context);
return expr_int;
}
@@ -1571,6 +1586,7 @@ ParseRule rules[TOKEN_EOF + 1] = {
[TOKEN_NULL] = { parse_null, NULL, PREC_NONE },
[TOKEN_INTEGER] = { parse_integer, NULL, PREC_NONE },
[TOKEN_PLACEHOLDER] = { parse_placeholder, NULL, PREC_NONE },
[TOKEN_BUILTIN] = { parse_builtin, NULL, PREC_NONE },
[TOKEN_CHAR_LITERAL] = { parse_char_lit, NULL, PREC_NONE },
[TOKEN_AT] = { parse_macro_expansion, NULL, PREC_NONE },
[TOKEN_STRING] = { parse_string_literal, NULL, PREC_NONE },

View File

@@ -893,6 +893,7 @@ Ast *parse_stmt(Context *context)
case TOKEN_TRY:
case TOKEN_CATCH:
case TOKEN_BYTES:
case TOKEN_BUILTIN:
return parse_expr_stmt(context);
case TOKEN_ASSERT:
return parse_assert_stmt(context);

View File

@@ -903,6 +903,7 @@ Expr *recursive_may_narrow_float(Expr *expr, Type *type)
case EXPR_NOP:
case EXPR_LEN:
case EXPR_CATCH:
case EXPR_BUILTIN:
UNREACHABLE
case EXPR_POST_UNARY:
return recursive_may_narrow_float(expr->unary_expr.expr, type);
@@ -1054,6 +1055,7 @@ Expr *recursive_may_narrow_int(Expr *expr, Type *type)
case EXPR_UNDEF:
case EXPR_CT_CALL:
case EXPR_NOP:
case EXPR_BUILTIN:
UNREACHABLE
case EXPR_POST_UNARY:
return recursive_may_narrow_int(expr->unary_expr.expr, type);

View File

@@ -273,6 +273,8 @@ bool expr_is_constant_eval(Expr *expr, ConstantEvalKind eval_kind)
RETRY:
switch (expr->expr_kind)
{
case EXPR_BUILTIN:
return false;
case EXPR_BITACCESS:
case EXPR_ACCESS:
expr = expr->access_expr.parent;
@@ -6374,6 +6376,8 @@ static inline bool sema_analyse_expr_dispatch(Context *context, Expr *expr)
if (!sema_analyse_var_decl(context, expr->decl_expr, true)) return false;
expr->type = expr->decl_expr->type;
return true;
case EXPR_BUILTIN:
TODO
case EXPR_CT_CALL:
return sema_expr_analyse_ct_call(context, expr);
case EXPR_HASH_IDENT:

View File

@@ -80,6 +80,8 @@ const char *token_type_to_string(TokenType type)
return "|=";
case TOKEN_BIT_XOR_ASSIGN:
return "^=";
case TOKEN_BUILTIN:
return "$$";
case TOKEN_DIV_ASSIGN:
return "/=";
case TOKEN_DOTDOT:

View File

@@ -1,3 +1,5 @@
// #target: x64-darwin
char bar = '\xaf';
char bar = '\x0F';
@@ -8,7 +10,13 @@ char eofk = '\u233'; // #error: Expected a four char
char zab = '\Uaokdokok'; // #error: Expected an eight
char zab = '\Uaokdooekfoekfekfkeofkekok'; // #error: Expected an eight
char eofk = '\UaUfko'; // #error: Expected an eight
char eofkq = '\UaUfko'; // #error: Expected an eight
char foo = ' // #error: The character literal did not terminate
char x1 = '\u0023a'; // #error: Character literals with '\u' can only contain one character
char x2 = '\U00000023a'; // #error: Character literals with '\U' can only contain one character
char x = 'äö'; // #error: may only contain one character
char feokf = '\9'; // #error: Invalid escape sequence '\9'
char fje = '123456789012345678'; // #error: The character literal exceeds 16 characters.
char foekfe = ''; // #error: The character literal was empty