mirror of
https://github.com/c3lang/c3c.git
synced 2026-02-27 12:01:16 +00:00
Placeholders for builtins. Updated character literal parsing, supporting 1-16 characters. More test cases.
This commit is contained in:
committed by
Christoffer Lerno
parent
42465039e9
commit
7bd76c973c
@@ -908,6 +908,10 @@ typedef struct
|
||||
} ExprLen;
|
||||
|
||||
|
||||
// Payload of an EXPR_BUILTIN expression (stored in Expr_ as `builtin_expr`).
typedef struct
|
||||
{
|
||||
// The identifier token naming the builtin; parse_builtin assigns the token
// that follows `$$` to this field.
Token identifier;
|
||||
} ExprBuiltin;
|
||||
struct Expr_
|
||||
{
|
||||
ExprKind expr_kind : 8;
|
||||
@@ -952,6 +956,7 @@ struct Expr_
|
||||
ExprFuncBlock expr_block;
|
||||
ExprMacroBlock macro_block;
|
||||
Expr** cond_expr;
|
||||
ExprBuiltin builtin_expr;
|
||||
};
|
||||
};
|
||||
|
||||
@@ -1313,7 +1318,7 @@ typedef union
|
||||
};
|
||||
struct
|
||||
{
|
||||
uint64_t char_value;
|
||||
Int128 char_value;
|
||||
char width;
|
||||
};
|
||||
} TokenData;
|
||||
|
||||
@@ -74,6 +74,7 @@ Expr *copy_expr(Expr *source_expr)
|
||||
case EXPR_FLATPATH:
|
||||
case EXPR_UNDEF:
|
||||
case EXPR_NOP:
|
||||
case EXPR_BUILTIN:
|
||||
return expr;
|
||||
case EXPR_DECL:
|
||||
MACRO_COPY_DECL(expr->decl_expr);
|
||||
|
||||
@@ -173,6 +173,7 @@ typedef enum
|
||||
EXPR_BITACCESS,
|
||||
EXPR_BITASSIGN,
|
||||
EXPR_BINARY,
|
||||
EXPR_BUILTIN,
|
||||
EXPR_MACRO_BODY_EXPANSION,
|
||||
EXPR_CALL,
|
||||
EXPR_CAST,
|
||||
@@ -332,6 +333,7 @@ typedef enum
|
||||
TOKEN_BIT_XOR_ASSIGN, // ^=
|
||||
TOKEN_DIV_ASSIGN, // /=
|
||||
TOKEN_DOTDOT, // ..
|
||||
TOKEN_BUILTIN, // $$
|
||||
TOKEN_ELVIS, // ?:
|
||||
TOKEN_EQEQ, // ==
|
||||
TOKEN_GREATER_EQ, // >=
|
||||
|
||||
@@ -670,135 +670,140 @@ ERROR:
|
||||
add_error_token(lexer, "Invalid UTF-8 sequence.");
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Rules:
|
||||
* 1. If ASCII or \xAB, accept up to 8 characters (16 with Int128 support), size is char, ushort, uint, ulong, UInt128
|
||||
* 2. If UTF8, accept 1 UTF character, size is ushort normally, uint on wide UTF characters, no additional characters accepted.
|
||||
* 3. If \uABCD, convert to 16 bits, size is ushort, no additional characters accepted.
|
||||
* 4. If \U01234567, convert to 32 bits, size is uint, no additional characters accepted.
|
||||
*
|
||||
* @param lexer
|
||||
* @return
|
||||
*/
|
||||
static inline bool scan_char(Lexer *lexer)
|
||||
{
|
||||
int width = 0;
|
||||
uint8_t c;
|
||||
uint64_t b = 0;
|
||||
char has_unicode_escape = 0;
|
||||
bool has_unicode = false;
|
||||
while ((c = next(lexer)) != '\'')
|
||||
{
|
||||
if (has_unicode_escape)
|
||||
{
|
||||
return add_error_token(lexer, "Character literals with '\\%c' can only contain one character.", has_unicode_escape);
|
||||
}
|
||||
if (has_unicode)
|
||||
{
|
||||
return add_error_token(lexer, "A character literal may not contain multiple unicode characters.");
|
||||
}
|
||||
if (c == '\0')
|
||||
{
|
||||
return add_error_token(lexer, "The character literal did not terminate.");
|
||||
}
|
||||
if (width > 7)
|
||||
{
|
||||
width++;
|
||||
continue;
|
||||
}
|
||||
if (c >= 0x80)
|
||||
{
|
||||
if (width != 0)
|
||||
{
|
||||
return add_error_token(lexer, "A multi-character character literal may not contain unicode characters.");
|
||||
}
|
||||
int64_t utf8 = scan_utf8(lexer, c);
|
||||
if (utf8 < 0) return false;
|
||||
has_unicode = true;
|
||||
b += utf8;
|
||||
width = -1;
|
||||
continue;
|
||||
}
|
||||
if (c != '\\')
|
||||
{
|
||||
width++;
|
||||
b <<= 8U;
|
||||
b += (uint8_t)c;
|
||||
continue;
|
||||
}
|
||||
if (c == '\\')
|
||||
{
|
||||
c = next(lexer);
|
||||
const char *start = lexer->current;
|
||||
signed char escape = is_valid_escape(c);
|
||||
if (escape == -1)
|
||||
{
|
||||
lexer->lexing_start = start;
|
||||
return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c);
|
||||
}
|
||||
switch (escape)
|
||||
{
|
||||
case 'x':
|
||||
{
|
||||
int64_t hex = scan_hex_literal(lexer, 2);
|
||||
if (hex < 0)
|
||||
{
|
||||
lexer->lexing_start = start;
|
||||
// Fix underlining if this is an unfinished escape.
|
||||
return add_error_token(lexer, "Expected a two character hex value after \\x.");
|
||||
}
|
||||
width++;
|
||||
b <<= 8U;
|
||||
b += hex;
|
||||
break;
|
||||
}
|
||||
case 'u':
|
||||
{
|
||||
if (width)
|
||||
{
|
||||
return add_error_token(lexer, "Unicode escapes are not allowed in a multi-character literal.");
|
||||
}
|
||||
int64_t hex = scan_hex_literal(lexer, 4);
|
||||
if (hex < 0)
|
||||
{
|
||||
lexer->lexing_start = start;
|
||||
return add_error_token(lexer, "Expected a four character hex value after \\u.");
|
||||
}
|
||||
b <<= 16U;
|
||||
b += hex;
|
||||
width = -1;
|
||||
has_unicode_escape = 'u';
|
||||
break;
|
||||
}
|
||||
case 'U':
|
||||
{
|
||||
if (width)
|
||||
{
|
||||
return add_error_token(lexer, "Unicode escapes are not allowed in a multi-character literal.");
|
||||
}
|
||||
int64_t hex = scan_hex_literal(lexer, 8);
|
||||
if (hex < 0)
|
||||
{
|
||||
lexer->lexing_start = start;
|
||||
return add_error_token(lexer, "Expected an eight character hex value after \\U.");
|
||||
}
|
||||
width = -1;
|
||||
b <<= 32U;
|
||||
b += hex;
|
||||
has_unicode_escape = 'U';
|
||||
break;
|
||||
}
|
||||
default:
|
||||
width++;
|
||||
b <<= 8U;
|
||||
b += (uint8_t)escape;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (width == 0)
|
||||
// Handle the problem with zero size character literal first.
|
||||
if (match(lexer, '\''))
|
||||
{
|
||||
return add_error_token(lexer, "The character literal was empty.");
|
||||
}
|
||||
if (width > 2 && width != 4 && width != 8)
|
||||
|
||||
int width = 0;
|
||||
char c;
|
||||
Int128 b = {};
|
||||
|
||||
while ((c = next(lexer)) != '\'')
|
||||
{
|
||||
add_error_token(lexer, "Character literals may only be 1, 2, 4 or 8 characters wide.");
|
||||
// End of file may occur:
|
||||
if (c == '\0') return add_error_token(lexer, "The character literal did not terminate.");
|
||||
// We might exceed the width that we allow.
|
||||
if (width > 15) return add_error_token(lexer, "The character literal exceeds 16 characters.");
|
||||
// Handle (expected) utf-8 characters.
|
||||
if ((unsigned)c >= (unsigned)0x80)
|
||||
{
|
||||
if (width != 0) goto UNICODE_IN_MULTI;
|
||||
const char *start = lexer->current;
|
||||
int64_t utf8 = scan_utf8(lexer, c);
|
||||
if (utf8 < 0) return false;
|
||||
if (!match(lexer, '\''))
|
||||
{
|
||||
if (peek(lexer) == '\0') continue;
|
||||
lexer->lexing_start = start;
|
||||
return add_error_token(lexer, "Unicode character literals may only contain one character, "
|
||||
"please remove the additional ones or use all ASCII.");
|
||||
}
|
||||
b.low = utf8;
|
||||
width = utf8 > 0xffff ? 4 : 2;
|
||||
goto DONE;
|
||||
}
|
||||
// Parse the escape code
|
||||
signed char escape = ' ';
|
||||
const char *start = lexer->current;
|
||||
if (c == '\\')
|
||||
{
|
||||
assert(c == '\\');
|
||||
c = next(lexer);
|
||||
escape = is_valid_escape(c);
|
||||
if (escape == -1)
|
||||
{
|
||||
lexer->lexing_start = start - 1;
|
||||
return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c);
|
||||
}
|
||||
}
|
||||
switch (escape)
|
||||
{
|
||||
case 'x':
|
||||
{
|
||||
int64_t hex = scan_hex_literal(lexer, 2);
|
||||
if (hex < 0)
|
||||
{
|
||||
lexer->lexing_start = start - 1;
|
||||
// Fix underlining if this is an unfinished escape.
|
||||
return add_error_token(lexer, "Expected a two character hex value after \\x.");
|
||||
}
|
||||
// We can now reassign c and use the default code.
|
||||
c = hex;
|
||||
break;
|
||||
}
|
||||
case 'u':
|
||||
case 'U':
|
||||
{
|
||||
// First check that we don't have any characters previous to this one.
|
||||
if (width != 0) goto UNICODE_IN_MULTI;
|
||||
int bytes = escape == 'U' ? 4 : 2;
|
||||
int64_t hex = scan_hex_literal(lexer, bytes * 2);
|
||||
// The hex parsing may have failed, lacking more hex chars.
|
||||
if (hex < 0)
|
||||
{
|
||||
lexer->lexing_start = start - 1;
|
||||
return add_error_token(lexer, "Expected %s character hex value after \\%c.",
|
||||
escape == 'u' ? "a four" : "an eight", escape);
|
||||
}
|
||||
// If we don't see the end here, then something is wrong.
|
||||
if (!match(lexer, '\''))
|
||||
{
|
||||
// It may be the end of the line, if so use the default handling by invoking "continue"
|
||||
if (peek(lexer) == '\0') continue;
|
||||
// Otherwise step forward and mark it as an error.
|
||||
next(lexer);
|
||||
lexer->lexing_start = lexer->current - 1;
|
||||
return add_error_token(lexer,
|
||||
"Character literals with '\\%c' can only contain one character, please remove this one.",
|
||||
escape);
|
||||
}
|
||||
// Assign the value and go to DONE.
|
||||
b.low = hex;
|
||||
width = bytes;
|
||||
goto DONE;
|
||||
}
|
||||
case ' ':
|
||||
// No escape, a regular character.
|
||||
break;
|
||||
default:
|
||||
c = (unsigned char)escape;
|
||||
break;
|
||||
}
|
||||
// Default handling here:
|
||||
width++;
|
||||
b = i128_shl64(b, 8);
|
||||
b = i128_add64(b, (unsigned char)c);
|
||||
}
|
||||
|
||||
assert(width > 0 && width <= 16);
|
||||
if (width > 8 && !platform_target.int128)
|
||||
{
|
||||
return add_error_token(lexer, "Character literal exceeded 8 characters.");
|
||||
}
|
||||
DONE:
|
||||
add_generic_token(lexer, TOKEN_CHAR_LITERAL);
|
||||
lexer->latest_token_data->char_value = b;
|
||||
lexer->latest_token_data->width = width < 0 ? 0 : (char)width;
|
||||
lexer->latest_token_data->width = (char)width;
|
||||
return true;
|
||||
|
||||
UNICODE_IN_MULTI:
|
||||
return add_error_token(lexer, "A multi-character literal may not contain unicode characters.");
|
||||
}
|
||||
|
||||
static inline void skip_first_line_if_empty(Lexer *lexer)
|
||||
@@ -1616,6 +1621,15 @@ static bool lexer_scan_token_inner(Lexer *lexer, LexMode mode)
|
||||
return scan_ident(lexer, TOKEN_HASH_IDENT, TOKEN_HASH_CONST_IDENT, TOKEN_HASH_TYPE_IDENT, '$');
|
||||
case '$':
|
||||
if (match(lexer, '{')) return add_token(lexer, TOKEN_PLACEHOLDER, "${");
|
||||
if (match(lexer, '$'))
|
||||
{
|
||||
if (is_letter(peek(lexer)))
|
||||
{
|
||||
add_token(lexer, TOKEN_BUILTIN, "$$");
|
||||
return scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST_IDENT, TOKEN_TYPE_IDENT, 0);
|
||||
}
|
||||
return add_error_token(lexer, "Expected a letter after $$.");
|
||||
}
|
||||
return scan_ident(lexer, TOKEN_CT_IDENT, TOKEN_CT_CONST_IDENT, TOKEN_CT_TYPE_IDENT, '$');
|
||||
case ',':
|
||||
return add_token(lexer, TOKEN_COMMA, ",");
|
||||
|
||||
@@ -4898,6 +4898,8 @@ void llvm_emit_expr(GenContext *c, BEValue *value, Expr *expr)
|
||||
case EXPR_UNDEF:
|
||||
// Should never reach this.
|
||||
UNREACHABLE
|
||||
case EXPR_BUILTIN:
|
||||
TODO
|
||||
case EXPR_DECL:
|
||||
llvm_emit_local_decl(c, expr->decl_expr);
|
||||
return;
|
||||
|
||||
@@ -970,6 +970,8 @@ static bool expr_is_pure(Expr *expr)
|
||||
if (!expr) return true;
|
||||
switch (expr->expr_kind)
|
||||
{
|
||||
case EXPR_BUILTIN:
|
||||
TODO
|
||||
case EXPR_CONST:
|
||||
case EXPR_CONST_IDENTIFIER:
|
||||
case EXPR_IDENTIFIER:
|
||||
|
||||
@@ -924,6 +924,16 @@ static Expr *parse_or_error_expr(Context *context, Expr *left)
|
||||
return else_expr;
|
||||
}
|
||||
|
||||
/**
 * Prefix parse rule for TOKEN_BUILTIN (`$$`), registered in the `rules` table.
 * Builds an EXPR_BUILTIN expression whose payload is the identifier token
 * that follows the `$$` token.
 *
 * @param context the parse context; `context->tok` is expected to be TOKEN_BUILTIN on entry.
 * @param left must be NULL, since this is a prefix rule (asserted).
 * @return the new EXPR_BUILTIN expression; NOTE(review): presumably `poisoned_expr`
 *         when the next token is not TOKEN_IDENT - confirm against CONSUME_OR.
 */
static Expr *parse_builtin(Context *context, Expr *left)
|
||||
{
|
||||
assert(!left && "Had left hand side");
|
||||
// Create the expression node from the `$$` token itself.
Expr *expr = EXPR_NEW_TOKEN(EXPR_BUILTIN, context->tok);
|
||||
// Step past `$$`; presumably also checks that the current token is TOKEN_BUILTIN - TODO confirm.
advance_and_verify(context, TOKEN_BUILTIN);
|
||||
// Record the token after `$$` as the builtin's name before it is consumed below.
expr->builtin_expr.identifier = context->tok;
|
||||
// Only TOKEN_IDENT is accepted here, although the lexer's `$$` path may also
// produce TOKEN_CONST_IDENT or TOKEN_TYPE_IDENT for the name.
CONSUME_OR(TOKEN_IDENT, poisoned_expr);
|
||||
// NOTE(review): presumably extends the expression's source range to cover the
// identifier just consumed - confirm against the macro definition.
RANGE_EXTEND_PREV(expr);
|
||||
return expr;
|
||||
}
|
||||
static Expr *parse_placeholder(Context *context, Expr *left)
|
||||
{
|
||||
assert(!left && "Had left hand side");
|
||||
@@ -1253,31 +1263,36 @@ static Expr *parse_char_lit(Context *context, Expr *left)
|
||||
Expr *expr_int = EXPR_NEW_TOKEN(EXPR_CONST, context->tok);
|
||||
expr_int->const_expr.is_character = true;
|
||||
TokenData *data = tokendata_from_id(context->tok.id);
|
||||
expr_int->const_expr.ixx.i = data->char_value;
|
||||
expr_int->const_expr.narrowable = true;
|
||||
expr_int->const_expr.const_kind = CONST_INTEGER;
|
||||
switch (data->width)
|
||||
{
|
||||
case 1:
|
||||
expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U8);
|
||||
expr_int->type = type_char;
|
||||
expr_int->const_expr.ixx.type = TYPE_U8;
|
||||
break;
|
||||
case 2:
|
||||
expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U16);
|
||||
expr_int->type = type_ushort;
|
||||
expr_int->const_expr.ixx.type = TYPE_U16;
|
||||
break;
|
||||
case 3:
|
||||
case 4:
|
||||
expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U32);
|
||||
expr_int->type = type_uint;
|
||||
expr_int->const_expr.ixx.type = TYPE_U32;
|
||||
break;
|
||||
case 5:
|
||||
case 6:
|
||||
case 7:
|
||||
case 8:
|
||||
expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U64);
|
||||
expr_int->type = type_ulong;
|
||||
expr_int->const_expr.ixx.type = TYPE_U64;
|
||||
break;
|
||||
default:
|
||||
expr_const_set_int(&expr_int->const_expr, data->char_value, TYPE_U64);
|
||||
expr_int->type = type_long;
|
||||
expr_int->const_expr.narrowable = true;
|
||||
expr_int->type = type_u128;
|
||||
expr_int->const_expr.ixx.type = TYPE_U128;
|
||||
break;
|
||||
}
|
||||
|
||||
advance(context);
|
||||
return expr_int;
|
||||
}
|
||||
@@ -1571,6 +1586,7 @@ ParseRule rules[TOKEN_EOF + 1] = {
|
||||
[TOKEN_NULL] = { parse_null, NULL, PREC_NONE },
|
||||
[TOKEN_INTEGER] = { parse_integer, NULL, PREC_NONE },
|
||||
[TOKEN_PLACEHOLDER] = { parse_placeholder, NULL, PREC_NONE },
|
||||
[TOKEN_BUILTIN] = { parse_builtin, NULL, PREC_NONE },
|
||||
[TOKEN_CHAR_LITERAL] = { parse_char_lit, NULL, PREC_NONE },
|
||||
[TOKEN_AT] = { parse_macro_expansion, NULL, PREC_NONE },
|
||||
[TOKEN_STRING] = { parse_string_literal, NULL, PREC_NONE },
|
||||
|
||||
@@ -893,6 +893,7 @@ Ast *parse_stmt(Context *context)
|
||||
case TOKEN_TRY:
|
||||
case TOKEN_CATCH:
|
||||
case TOKEN_BYTES:
|
||||
case TOKEN_BUILTIN:
|
||||
return parse_expr_stmt(context);
|
||||
case TOKEN_ASSERT:
|
||||
return parse_assert_stmt(context);
|
||||
|
||||
@@ -903,6 +903,7 @@ Expr *recursive_may_narrow_float(Expr *expr, Type *type)
|
||||
case EXPR_NOP:
|
||||
case EXPR_LEN:
|
||||
case EXPR_CATCH:
|
||||
case EXPR_BUILTIN:
|
||||
UNREACHABLE
|
||||
case EXPR_POST_UNARY:
|
||||
return recursive_may_narrow_float(expr->unary_expr.expr, type);
|
||||
@@ -1054,6 +1055,7 @@ Expr *recursive_may_narrow_int(Expr *expr, Type *type)
|
||||
case EXPR_UNDEF:
|
||||
case EXPR_CT_CALL:
|
||||
case EXPR_NOP:
|
||||
case EXPR_BUILTIN:
|
||||
UNREACHABLE
|
||||
case EXPR_POST_UNARY:
|
||||
return recursive_may_narrow_int(expr->unary_expr.expr, type);
|
||||
|
||||
@@ -273,6 +273,8 @@ bool expr_is_constant_eval(Expr *expr, ConstantEvalKind eval_kind)
|
||||
RETRY:
|
||||
switch (expr->expr_kind)
|
||||
{
|
||||
case EXPR_BUILTIN:
|
||||
return false;
|
||||
case EXPR_BITACCESS:
|
||||
case EXPR_ACCESS:
|
||||
expr = expr->access_expr.parent;
|
||||
@@ -6374,6 +6376,8 @@ static inline bool sema_analyse_expr_dispatch(Context *context, Expr *expr)
|
||||
if (!sema_analyse_var_decl(context, expr->decl_expr, true)) return false;
|
||||
expr->type = expr->decl_expr->type;
|
||||
return true;
|
||||
case EXPR_BUILTIN:
|
||||
TODO
|
||||
case EXPR_CT_CALL:
|
||||
return sema_expr_analyse_ct_call(context, expr);
|
||||
case EXPR_HASH_IDENT:
|
||||
|
||||
@@ -80,6 +80,8 @@ const char *token_type_to_string(TokenType type)
|
||||
return "|=";
|
||||
case TOKEN_BIT_XOR_ASSIGN:
|
||||
return "^=";
|
||||
case TOKEN_BUILTIN:
|
||||
return "$$";
|
||||
case TOKEN_DIV_ASSIGN:
|
||||
return "/=";
|
||||
case TOKEN_DOTDOT:
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
// #target: x64-darwin
|
||||
|
||||
char bar = '\xaf';
|
||||
char bar = '\x0F';
|
||||
|
||||
@@ -8,7 +10,13 @@ char eofk = '\u233'; // #error: Expected a four char
|
||||
|
||||
char zab = '\Uaokdokok'; // #error: Expected an eight
|
||||
char zab = '\Uaokdooekfoekfekfkeofkekok'; // #error: Expected an eight
|
||||
char eofk = '\UaUfko'; // #error: Expected an eight
|
||||
char eofkq = '\UaUfko'; // #error: Expected an eight
|
||||
|
||||
char foo = ' // #error: The character literal did not terminate
|
||||
char x1 = '\u0023a'; // #error: Character literals with '\u' can only contain one character
|
||||
char x2 = '\U00000023a'; // #error: Character literals with '\U' can only contain one character
|
||||
|
||||
char x = 'äö'; // #error: may only contain one character
|
||||
char feokf = '\9'; // #error: Invalid escape sequence '\9'
|
||||
char fje = '123456789012345678'; // #error: The character literal exceeds 16 characters.
|
||||
|
||||
char foekfe = ''; // #error: The character literal was empty
|
||||
Reference in New Issue
Block a user