// Copyright (c) 2019 Christoffer Lerno. All rights reserved.
// Use of this source code is governed by the GNU LGPLv3.0 license
// a copy of which can be found in the LICENSE file.

#include "compiler_internal.h"

static inline uint16_t check_col(intptr_t col)
{
    if (col > 255) return 0;
    return (uint16_t)col;
}

static inline unsigned check_row(intptr_t line)
{
    return line > MAX_SOURCE_LOCATION_LEN ? 0 : (unsigned)line;
}
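
// Both helpers clamp values that do not fit the span encoding to 0. Note that,
// despite its name, check_row is also used below to clamp token lengths.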

// --- Lexing general methods.

static bool lexer_scan_token_inner(Lexer *lexer);

static inline void begin_new_token(Lexer *lexer)
{
    lexer->lexing_start = lexer->current;
    lexer->start_row = lexer->current_row;
    lexer->start_row_start = lexer->line_start;
}

// Peek at the current character in the buffer.
#define peek(lexer_) (*(lexer_)->current)

// Look at the previous character in the buffer.
#define prev(lexer_) ((lexer_)->current[-1])

// Peek one character ahead.
#define peek_next(lexer_) ((lexer_)->current[1])

// Is the current character '\0'? If so, we assume we have reached the end.
#define reached_end(lexer_) (lexer_->current[0] == '\0')

// Step one character forward and return that character.
INLINE char next(Lexer *lexer)
{
    if (*lexer->current == '\n')
    {
        lexer->line_start = lexer->current + 1, lexer->current_row++;
    }
    return (++lexer->current)[0];
}

// Backtrack the buffer read one step.
static inline void backtrack(Lexer *lexer)
{
    lexer->current--;
    if (lexer->current[0] == '\n')
    {
        lexer->current_row--;
    }
}

// Skip the next 'steps' characters.
static inline void skip(Lexer *lexer, int steps)
{
    ASSERT0(steps > 0);
    for (int i = 0; i < steps; i++)
    {
        next(lexer);
    }
}

// Match a single character; if successful, move one step forward.
static inline bool match(Lexer *lexer, char expected)
{
    if (lexer->current[0] != expected) return false;
    next(lexer);
    return true;
}

// --- Token creation

/**
 * Allocate data for a token, including its source location.
 * This call does the basic allocation, with other functions
 * filling out additional information.
 **/
static inline void set_generic_token(Lexer *lexer, TokenType type)
{
    lexer->token_type = type;
    // Set the location.
    lexer->data.lex_len = lexer->current - lexer->lexing_start;
    lexer->data.lex_start = lexer->lexing_start;
    uint32_t line = lexer->start_row;
    uint32_t col;
    uint32_t length;
    if (line == lexer->current_row)
    {
        // Col is the simple difference.
        col = check_col(lexer->lexing_start - lexer->line_start + 1);
        // Length is the diff between current and start.
        length = check_row(lexer->current - lexer->lexing_start);
    }
    else
    {
        // For multiline tokens, we grab the diff from the starting line.
        col = check_col(lexer->lexing_start - lexer->start_row_start + 1);
        // But always set a single token length.
        length = 1;
    }
    lexer->tok_span.length = length;
    lexer->tok_span.col = col;
    lexer->tok_span.row = line;
}
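
// For example, a token starting at column 3 of row 7 that ends on the same row
// after 5 characters yields { .row = 7, .col = 3, .length = 5 }; had it
// continued onto row 8, only the start position would be kept, with length 1.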

// Error? We simply generate an invalid token and print out the error.
static bool add_error_token(Lexer *lexer, const char *message, ...)
{
    set_generic_token(lexer, TOKEN_INVALID_TOKEN);
    va_list list;
    va_start(list, message);
    sema_verror_range(lexer->tok_span, message, list);
    va_end(list);
    return false;
}

// Error at the start of the lexing, with a single length.
static bool add_error_token_at_start(Lexer *lexer, const char *message, ...)
{
    va_list list;
    va_start(list, message);
    SourceSpan location = {
        .file_id = lexer->file->file_id,
        .row = lexer->start_row,
        .length = 1,
        .col = check_col((lexer->lexing_start - lexer->start_row_start) + 1),
    };
    sema_verror_range(location, message, list);
    va_end(list);
    set_generic_token(lexer, TOKEN_INVALID_TOKEN);
    return false;
}

// Create an error token at a particular place in the file,
// used for pointing out errors in strings etc.
static bool add_error_token_at(Lexer *lexer, const char *loc, uint32_t len, const char *message, ...)
{
    va_list list;
    va_start(list, message);
    uint32_t current_line = lexer->current_row;
    if (len > MAX_SOURCE_LOCATION_LEN) len = 0;
    SourceSpan location = {
        .file_id = lexer->file->file_id,
        .row = current_line,
        .length = len,
        .col = check_col((loc - lexer->line_start) + 1),
    };
    sema_verror_range(location, message, list);
    va_end(list);
    set_generic_token(lexer, TOKEN_INVALID_TOKEN);
    return false;
}

// Print an error at the current location.
static bool add_error_token_at_current(Lexer *lexer, const char *message, ...)
{
    va_list list;
    va_start(list, message);
    uint32_t current_line = lexer->current_row;
    SourceSpan location = {
        .file_id = lexer->file->file_id,
        .row = current_line,
        .length = 1,
        .col = check_col((lexer->current - lexer->line_start) + 1),
    };
    sema_verror_range(location, message, list);
    va_end(list);
    set_generic_token(lexer, TOKEN_INVALID_TOKEN);
    return false;
}

// Add a new regular token.
static inline bool new_token(Lexer *lexer, TokenType type, const char *string)
{
    set_generic_token(lexer, type);
    lexer->data.string = string;
    return true;
}


// --- Comment parsing

/**
 * Parse the "//" line comment, skipping past the end of the line.
 */
static inline void parse_line_comment(Lexer *lexer)
{
    while (!reached_end(lexer) && peek(lexer) != '\n')
    {
        next(lexer);
    }
    // If we found EOL, then walk past '\n'.
    if (peek(lexer) == '\n') next(lexer);
}

/**
 * Parse the common / * * / style multiline comments, allowing nesting.
 **/
static inline void parse_multiline_comment(Lexer *lexer)
{
    int nesting = 1;
    while (1)
    {
        switch (peek(lexer))
        {
            case '*':
                if (peek_next(lexer) == '/')
                {
                    skip(lexer, 2);
                    nesting--;
                    if (nesting == 0) return;
                    continue;
                }
                break;
            case '/':
                if (peek_next(lexer) == '*')
                {
                    skip(lexer, 2);
                    nesting++;
                    continue;
                }
                break;
            case '\0':
                // Reached EOF, so end.
                return;
            default:
                break;
        }
        next(lexer);
    }
}
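
// For example, "/* a /* b */ c */" is consumed in full: the inner "/*" bumps
// the nesting level to 2, so the first "*/" does not yet end the comment.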

/**
 * Skip regular whitespace.
 */
static void skip_whitespace(Lexer *lexer)
{
    while (1)
    {
        switch (peek(lexer))
        {
            case '/':
                if (lexer->mode == LEX_CONTRACTS) return;
                // The '//' case
                if (peek_next(lexer) == '/')
                {
                    skip(lexer, 2);
                    parse_line_comment(lexer);
                    continue;
                }
                // '/*'
                if (peek_next(lexer) == '*')
                {
                    skip(lexer, 2);
                    parse_multiline_comment(lexer);
                    continue;
                }
                return;
            case '\n':
                // Contract lexing sees '\n' as a token.
                if (lexer->mode == LEX_CONTRACTS) return;
                FALLTHROUGH;
            case ' ':
            case '\t':
            case '\f':
                next(lexer);
                break;
            case '\r':
                // Already filtered out.
                UNREACHABLE
            default:
                return;
        }
    }
}

// --- Identifier scanning

// Parses identifiers. Note that this is a bit complicated, since
// we split identifiers into several types and also look up keywords.
static inline bool scan_ident(Lexer *lexer, TokenType normal, TokenType const_token, TokenType type_token, char prefix)
{
    TokenType type = (TokenType)0;
    uint32_t hash = FNV1_SEED;
    if (prefix)
    {
        hash = FNV1a(prefix, hash);
    }
    char c;
    while ((c = peek(lexer)) == '_')
    {
        hash = FNV1a(c, hash);
        next(lexer);
    }
    while (1)
    {
        c = peek(lexer);
        switch (c)
        {
            case LOWER_CHAR_CASE:
                if (!type)
                {
                    type = normal;
                }
                else if (type == const_token)
                {
                    type = type_token;
                }
                break;
            case UPPER_CHAR_CASE:
                if (!type) type = const_token;
                break;
            case NUMBER_CHAR_CASE:
                if (!type) return add_error_token(lexer, "A letter must precede any digit");
            case '_':
                break;
            default:
                goto EXIT;
        }
        hash = FNV1a(c, hash);
        next(lexer);
    }
    // Allow bang!
    if (peek(lexer) == '!' && type == normal)
    {
        hash = FNV1a('!', hash);
        next(lexer);
    }
    EXIT:;
    uint32_t len = (uint32_t)(lexer->current - lexer->lexing_start);
    if (!type)
    {
        if (!prefix && len == 1) return new_token(lexer, TOKEN_UNDERSCORE, "_");
        if (prefix && len == 1)
        {
            return add_error_token(lexer, "An identifier was expected after the '%c'.", prefix);
        }
        return add_error_token(lexer, "An identifier may not consist of only '_' characters.");
    }
    const char* interned_string = symtab_add(lexer->lexing_start, len, hash, &type);
    switch (type)
    {
        case TOKEN_RETURN:
            if (lexer->mode == LEX_CONTRACTS) type = TOKEN_IDENT;
            break;
        default:
            break;
    }
    return new_token(lexer, type, interned_string);
}
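
// Examples of the classification above (no prefix):
//   foo  -> TOKEN_IDENT        (starts with a lowercase letter)
//   FOO  -> TOKEN_CONST_IDENT  (uppercase letters only)
//   Foo  -> TOKEN_TYPE_IDENT   (uppercase followed by lowercase)
//   _    -> TOKEN_UNDERSCORE   (a lone underscore)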

// --- Number scanning

/**
 * For C3 we use the practice of f<bit-width>, u<bit-width> and i<bit-width> suffixes.
 * @param lexer the lexer
 * @param is_float set to true if a float suffix is encountered
 * @return false if the suffix was invalid
 */
static bool scan_number_suffix(Lexer *lexer, bool *is_float)
{
    char c = peek(lexer);
    if (!char_is_alphanum_(c)) return true;
    switch (c | 32)
    {
        case 'l':
            if (*is_float)
            {
                return add_error_token_at_current(lexer, "Integer suffix '%c' is not valid for a floating point literal.", c);
            }
            c = next(lexer);
            break;
        case 'u':
            if (*is_float)
            {
                return add_error_token_at_current(lexer, "Integer suffix '%c' is not valid for a floating point literal.", c);
            }
            c = next(lexer);
            if ((c | 32) == 'l')
            {
                c = next(lexer);
                break;
            }
            while (char_is_digit(c = peek(lexer))) next(lexer);
            break;
        case 'i':
            if (*is_float)
            {
                return add_error_token_at_current(lexer, "Integer suffix '%c' is not valid for a floating point literal.", c);
            }
            next(lexer);
            while (char_is_digit(c = peek(lexer))) next(lexer);
            break;
        case 'f':
            next(lexer);
            *is_float = true;
            while (char_is_digit(c = peek(lexer))) next(lexer);
            break;
        default:
            break;
    }
    if (char_is_alphanum_(c))
    {
        next(lexer);
        return add_error_token(lexer, "This doesn't seem to be a valid literal.");
    }
    return true;
}
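
// Illustrative suffix forms accepted above: 42l, 42u, 42ul, 42u16 and 42i8 for
// integers, and 2.5f or 2.5f64 for floats; something like 2.5u32 is rejected.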

#define NEXT_AND_CHECK_NO_MULTIPLE_(lexer__) \
    do { if (next(lexer__) == '_' && prev(lexer__) == '_') { \
        return add_error_token_at_current(lexer__, "Multiple consecutive '_' are not allowed."); \
    } } while(0);

/**
 * Parse octals. Here we depart from the (error-prone) C style octals with an initial zero, e.g. 0231.
 * Instead we only support the 0o prefix, as in 0o231. Note that lexing here doesn't actually parse the
 * number itself.
 */
static bool scan_oct(Lexer *lexer)
{
    if (!char_is_oct(peek(lexer)))
    {
        return add_error_token_at_current(lexer, "An expression starting with '0o' should be followed by octal numbers (0-7).");
    }
    next(lexer);
    while (char_is_oct_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);

    if (char_is_digit(peek(lexer)))
    {
        return add_error_token_at_current(lexer, "An expression starting with '0o' should be followed by octal numbers (0-7).");
    }
    bool is_float = false;
    if (!scan_number_suffix(lexer, &is_float)) return false;
    if (is_float)
    {
        return add_error_token(lexer, "Octal literals cannot have a floating point suffix.");
    }
    return new_token(lexer, TOKEN_INTEGER, lexer->lexing_start);
}

/**
 * Binary style literals, e.g. 0b10101011.
 **/
static bool scan_binary(Lexer *lexer)
{
    if (!char_is_binary(peek(lexer)))
    {
        return add_error_token_at_current(lexer, "An expression starting with '0b' should be followed by binary digits (0-1).");
    }
    next(lexer);
    while (char_is_binary_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);
    if (char_is_digit(peek(lexer)))
    {
        return add_error_token_at_current(lexer, "An expression starting with '0b' should be followed by binary digits (0-1).");
    }
    bool is_float = false;
    if (!scan_number_suffix(lexer, &is_float)) return false;
    if (is_float)
    {
        return add_error_token(lexer, "Binary literals cannot have a floating point suffix.");
    }
    return new_token(lexer, TOKEN_INTEGER, lexer->lexing_start);
}

/**
 * Scan the digits after the exponent marker, e.g. +12, -12 or 12.
 * @param lexer the lexer
 * @return false if lexing failed.
 */
static inline bool scan_exponent(Lexer *lexer)
{
    // Step past e/E or p/P
    next(lexer);
    char c = peek(lexer);
    next(lexer);
    // Step past +/-
    if (c == '+' || c == '-')
    {
        c = peek(lexer);
        next(lexer);
    }
    // Now we need at least one digit.
    if (!char_is_digit(c))
    {
        if (c == 0)
        {
            backtrack(lexer);
            return add_error_token_at_current(lexer, "End of file was reached while parsing the exponent.");
        }
        if (c == '\n') return add_error_token(lexer, "End of line was reached while parsing the exponent.");
        if (c < 31 || c > 127) return add_error_token(lexer, "An unexpected character was found while parsing the exponent.");
        return add_error_token(lexer, "Parsing the floating point exponent failed, because '%c' is not a number.", c);
    }
    // Step through all the digits.
    while (char_is_digit(peek(lexer))) next(lexer);
    return true;
}

/**
 * Scan a hex number, including floating point hex numbers of the format 0x31a31ff.21p12. Note that the
 * exponent is written in decimal.
 **/
static inline bool scan_hex(Lexer *lexer)
{
    if (!char_is_hex(peek(lexer)))
    {
        return add_error_token_at_current(lexer, "'0x' starts a hexadecimal number, so the next character should be 0-9, a-f or A-F.");
    }
    next(lexer);
    while (char_is_hex_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);
    bool is_float = false;
    if (peek(lexer) == '.' && peek_next(lexer) != '.')
    {
        is_float = true;
        next(lexer);
        char c = peek(lexer);
        if (c == '_') return add_error_token_at_current(lexer, "'_' is not allowed directly after decimal point, try removing it.");
        if (char_is_hex(c)) next(lexer);
        while (char_is_hex_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);
    }
    char c = peek(lexer);
    if (c == 'p' || c == 'P')
    {
        is_float = true;
        if (!scan_exponent(lexer)) return false;
    }
    if (prev(lexer) == '_')
    {
        backtrack(lexer);
        return add_error_token_at_current(lexer, "The number ended with '_', which isn't allowed, please remove it.");
    }
    if (!scan_number_suffix(lexer, &is_float)) return false;
    return new_token(lexer, is_float ? TOKEN_REAL : TOKEN_INTEGER, lexer->lexing_start);
}
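
// Following the usual C hex-float convention, the value is mantissa * 2^exponent
// with the exponent written in decimal: 0x1.8p3 is 1.5 * 2^3 = 12.0.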

/**
 * Scan integer and float decimal values.
 */
static inline bool scan_dec(Lexer *lexer)
{
    ASSERT0(char_is_digit(peek(lexer)));

    // Walk through the digits; we don't need to worry about an
    // initial _ because we only call this if we have a digit initially.
    while (char_is_digit_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);

    // Assume no float.
    bool is_float = false;

    // If we have a single dot, we assume that we have a float.
    // Note that this current parsing means we can't have functions on
    // literals, like "123.sizeof", but we're fine with that.
    if (peek(lexer) == '.' && peek_next(lexer) != '.')
    {
        is_float = true;
        // Step past '.'
        next(lexer);
        // Check our rule to disallow 123._32
        char c = peek(lexer);
        if (c == '_') return add_error_token_at_current(lexer, "'_' is not allowed directly after decimal point, try removing it.");
        // Now walk until we see no more digits.
        // This allows 123. as a floating point number.
        while (char_is_digit_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);
    }
    char c = peek(lexer);
    // We might have an exponent. We allow 123e1 and 123.e1 as floating point, so
    // just set it to floating point and check the exponent.
    if (c == 'e' || c == 'E')
    {
        is_float = true;
        if (!scan_exponent(lexer)) return false;
    }

    if (prev(lexer) == '_')
    {
        backtrack(lexer);
        return add_error_token_at_current(lexer, "The number ended with '_', which isn't allowed, please remove it.");
    }
    if (!scan_number_suffix(lexer, &is_float)) return false;
    return new_token(lexer, is_float ? TOKEN_REAL : TOKEN_INTEGER, lexer->lexing_start);
}
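
// Examples: "1_000" and "123." scan cleanly, "123.e1" and "123E-4" become
// TOKEN_REAL, while "1__0", "123._5" and "10_" are rejected above.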

/**
 * Scan a number, switching on an initial zero to select the parsing scheme:
 * 0x... -> Hex
 * 0o... -> Octal
 * 0b... -> Binary
 *
 * Default is decimal.
 *
 * It's actually pretty simple to add encoding schemes here, so for example Base64 could
 * be added.
 */
static inline bool scan_digit(Lexer *lexer)
{
    if (peek(lexer) == '0')
    {
        switch (peek_next(lexer))
        {
            case 'x':
            case 'X':
                skip(lexer, 2);
                return scan_hex(lexer);
            case 'o':
            case 'O':
                skip(lexer, 2);
                return scan_oct(lexer);
            case 'b':
            case 'B':
                skip(lexer, 2);
                return scan_binary(lexer);
            default:
                break;
        }
    }
    return scan_dec(lexer);
}

// --- Character & string scanning

static inline int64_t scan_hex_literal(Lexer *lexer, int positions)
{
    int64_t hex = 0;
    for (int j = 0; j < positions; j++)
    {
        hex <<= 4U;
        int i = char_hex_to_nibble(peek(lexer));
        if (i < 0)
        {
            return -1;
        }
        next(lexer);
        hex += i;
    }
    return hex;
}

static inline int64_t scan_utf8(Lexer *lexer, unsigned char c)
{
    int utf8_bytes;
    uint64_t result;
    if (c < 0xc0) goto ERROR;
    if (c <= 0xdf)
    {
        result = 0x1f & c;
        utf8_bytes = 2;
    }
    else if (c <= 0xef)
    {
        result = 0xf & c;
        utf8_bytes = 3;
    }
    else if (c <= 0xf7)
    {
        utf8_bytes = 4;
        result = 0x7 & c;
    }
    else if (c <= 0xfb)
    {
        utf8_bytes = 5;
        result = 0x3 & c;
    }
    else if (c <= 0xfd)
    {
        utf8_bytes = 6;
        result = 0x1 & c;
    }
    else
    {
        goto ERROR;
    }
    for (int i = 1; i < utf8_bytes; i++)
    {
        result <<= 6U;
        c = (unsigned char)peek(lexer);
        if (c == '\0') return 0xFFFD;
        next(lexer);
        if ((c & 0xc0) != 0x80)
        {
            goto ERROR;
        }
        result += c & 0x3f;
    }
    return (int64_t)result;
ERROR:
    add_error_token(lexer, "Invalid UTF-8 sequence.");
    return -1;
}
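
// Worked example: the three-byte sequence 0xE2 0x82 0xAC decodes as
// (0xE2 & 0xf) = 0x2, then (0x2 << 6) + (0x82 & 0x3f) = 0x82, then
// (0x82 << 6) + (0xAC & 0x3f) = 0x20AC, i.e. the euro sign U+20AC.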

/**
 * Rules:
 * 1. If ASCII or \xAB, accept up to 8 characters (16 with Int128 support), size is char, ushort, uint, ulong, UInt128.
 * 2. If UTF-8, accept 1 UTF-8 character, size is ushort normally, ulong on wide UTF-8 characters, no additional characters accepted.
 * 3. If \uABCD, convert to 16 bits, size is ushort, no additional characters accepted.
 * 4. If \U01234567, convert to 32 bits, size is uint, no additional characters accepted.
 *
 * @param lexer the lexer
 * @return true if the scan succeeded
 */
static inline bool scan_char(Lexer *lexer)
{
    // Handle the problem with a zero size character literal first.
    if (match(lexer, '\''))
    {
        return add_error_token(lexer, "The character literal was empty.");
    }

    int width = 0;
    char c;
    Int128 b = { 0, 0 };

    while (!match(lexer, '\''))
    {
        c = peek(lexer);
        next(lexer);
        // End of file may occur:
        if (c == '\0')
        {
            return add_error_token_at_start(lexer, "The character literal did not terminate.");
        }
        // We might exceed the width that we allow.
        if (width > 15) return add_error_token_at_start(lexer, "The character literal exceeds 16 characters.");
        // Handle (expected) UTF-8 characters.
        if ((unsigned)c >= (unsigned)0x80)
        {
            if (width != 0) goto UNICODE_IN_MULTI;
            int64_t utf8 = scan_utf8(lexer, (unsigned char)c);
            if (utf8 < 0) return false;
            if (!match(lexer, '\''))
            {
                if (peek(lexer) == '\0') continue;
                backtrack(lexer);
                return add_error_token_at_current(lexer, "Unicode character literals may only contain one character, "
                                                         "please remove the additional ones or use all ASCII.");
            }
            b.low = (uint64_t) utf8;
            width = utf8 > 0xffff ? 4 : 2;
            goto DONE;
        }
        // Parse the escape code.
        signed char escape = ' ';
        if (c == '\\')
        {
            c = peek(lexer);
            escape = char_is_valid_escape(c);
            if (escape == -1)
            {
                lexer->lexing_start += 1;
                if (c > ' ' && c <= 127)
                {
                    next(lexer);
                    return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c);
                }
                return add_error_token_at_current(lexer, "An escape sequence was expected after '\\'.");
            }
            next(lexer);
        }
        const char *escape_begin = lexer->current - 2;
        switch (escape)
        {
            case 'x':
            {
                int64_t hex = scan_hex_literal(lexer, 2);
                if (hex < 0)
                {
                    return add_error_token_at(lexer, escape_begin, lexer->current - escape_begin, "Expected a two character hex value after \\x.");
                }
                // We can now reassign c and use the default code.
                c = (char)hex;
                break;
            }
            case 'u':
            case 'U':
            {
                // First check that we don't have any characters previous to this one.
                if (width != 0) goto UNICODE_IN_MULTI;
                int bytes = escape == 'U' ? 4 : 2;
                int64_t hex = scan_hex_literal(lexer, bytes * 2);
                // The hex parsing may have failed, lacking more hex chars.
                if (hex < 0)
                {
                    begin_new_token(lexer);
                    return add_error_token_at(lexer, escape_begin, lexer->current - escape_begin,
                                              "Expected %s character hex value after \\%c.",
                                              escape == 'u' ? "a four" : "an eight", escape);
                }
                // If we don't see the end here, then something is wrong.
                if (!match(lexer, '\''))
                {
                    // It may be the end of the file; if so use the default handling by invoking "continue".
                    if (peek(lexer) == '\0') continue;
                    return add_error_token_at_current(lexer,
                                                      "Character literals with '\\%c' can only contain one character, please remove this one.",
                                                      escape);
                }
                // Assign the value and go to "DONE".
                b.low = (uint64_t) hex;
                width = bytes;
                goto DONE;
            }
            case ' ':
                // No escape, a regular character.
                break;
            default:
                c = (signed char)escape;
                break;
        }
        // Default handling here:
        width++;
        b = i128_shl64(b, 8);
        b = i128_add64(b, (unsigned char)c);
    }
    ASSERT0(width > 0 && width <= 16);
DONE:
    set_generic_token(lexer, TOKEN_CHAR_LITERAL);
    lexer->data.char_value = b;
    lexer->data.width = (char)width;
    return true;

UNICODE_IN_MULTI:
    return add_error_token(lexer, "A multi-character literal may not contain unicode characters.");
}
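
// Width examples for the rules above: 'a' has width 1, 'ab' width 2 and
// 'abcdefgh' width 8; a single UTF-8 character such as '€' (U+20AC) gets
// width 2, while '\U0001F600' gets width 4.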

static int append_esc_string_token(char *restrict dest, const char *restrict src, size_t *pos)
{
    int scanned;
    uint64_t unicode_char;
    signed char scanned_char = char_is_valid_escape(src[0]);
    if (scanned_char < 0) return -1;
    switch (scanned_char)
    {
        case 'x':
        {
            int h = char_hex_to_nibble(src[1]);
            if (h < 0) return -1;
            int l = char_hex_to_nibble(src[2]);
            if (l < 0) return -1;
            unicode_char = ((unsigned) h << 4U) + (unsigned)l;
            scanned = 3;
            break;
        }
        case 'u':
        {
            int x1 = char_hex_to_nibble(src[1]);
            if (x1 < 0) return -1;
            int x2 = char_hex_to_nibble(src[2]);
            if (x2 < 0) return -1;
            int x3 = char_hex_to_nibble(src[3]);
            if (x3 < 0) return -1;
            int x4 = char_hex_to_nibble(src[4]);
            if (x4 < 0) return -1;
            unicode_char = ((unsigned) x1 << 12U) + ((unsigned) x2 << 8U) + ((unsigned) x3 << 4U) + (unsigned)x4;
            scanned = 5;
            break;
        }
        case 'U':
        {
            int x1 = char_hex_to_nibble(src[1]);
            if (x1 < 0) return -1;
            int x2 = char_hex_to_nibble(src[2]);
            if (x2 < 0) return -1;
            int x3 = char_hex_to_nibble(src[3]);
            if (x3 < 0) return -1;
            int x4 = char_hex_to_nibble(src[4]);
            if (x4 < 0) return -1;
            int x5 = char_hex_to_nibble(src[5]);
            if (x5 < 0) return -1;
            int x6 = char_hex_to_nibble(src[6]);
            if (x6 < 0) return -1;
            int x7 = char_hex_to_nibble(src[7]);
            if (x7 < 0) return -1;
            int x8 = char_hex_to_nibble(src[8]);
            if (x8 < 0) return -1;
            unicode_char = ((unsigned) x1 << 28U) + ((unsigned) x2 << 24U) + ((unsigned) x3 << 20U) + ((unsigned) x4 << 16U) +
                           ((unsigned) x5 << 12U) + ((unsigned) x6 << 8U) + ((unsigned) x7 << 4U) + (unsigned)x8;
            scanned = 9;
            break;
        }
        default:
            dest[(*pos)++] = scanned_char;
            return 1;
    }
    if (unicode_char < 0x80U)
    {
        dest[(*pos)++] = (char)unicode_char;
    }
    else if (unicode_char < 0x800U)
    {
        dest[(*pos)++] = (char)(0xC0U | (unicode_char >> 6U));
        dest[(*pos)++] = (char)(0x80U | (unicode_char & 0x3FU));
    }
    else if (unicode_char < 0x10000U)
    {
        dest[(*pos)++] = (char)(0xE0U | (unicode_char >> 12U));
        dest[(*pos)++] = (char)(0x80U | ((unicode_char >> 6U) & 0x3FU));
        dest[(*pos)++] = (char)(0x80U | (unicode_char & 0x3FU));
    }
    else
    {
        dest[(*pos)++] = (char)(0xF0U | (unicode_char >> 18U));
        dest[(*pos)++] = (char)(0x80U | ((unicode_char >> 12U) & 0x3FU));
        dest[(*pos)++] = (char)(0x80U | ((unicode_char >> 6U) & 0x3FU));
        dest[(*pos)++] = (char)(0x80U | (unicode_char & 0x3FU));
    }
    return scanned;
}
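
// Encoding example: "\u00E9" parses to code point 0xE9, which is below 0x800
// and is therefore appended as the two UTF-8 bytes 0xC3 0xA9 ("é").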

static inline void consume_to_end_quote(Lexer *lexer)
{
    char c;
    while ((c = peek(lexer)) != '\0' && c != '"')
    {
        next(lexer);
    }
}

static inline bool scan_string(Lexer *lexer)
{
    char c = 0;
    const char *current = lexer->current;
    while ((c = *(current++)) != '"')
    {
        if (c == '\n' || c == '\0')
        {
            current++;
            break;
        }
        if (c == '\\')
        {
            c = *current;
            if (c != '\n' && c != '\0') current++;
            continue;
        }
    }
    const char *end = current - 1;
    char *destination = malloc_string((size_t)(end - lexer->current + 1));
    size_t len = 0;
    while (lexer->current < end)
    {
        c = peek(lexer);
        next(lexer);
        if (c == '\0' || (c == '\\' && peek(lexer) == '\0'))
        {
            if (c == '\0') backtrack(lexer);
            add_error_token_at_start(lexer, "The end of the file was reached "
                                            "while parsing the string. "
                                            "Did you forget (or accidentally add) a '\"' somewhere?");
            consume_to_end_quote(lexer);
            return false;
        }
        if (c == '\n' || (c == '\\' && peek(lexer) == '\n'))
        {
            backtrack(lexer);
            add_error_token_at_start(lexer, "The end of the line was reached "
                                            "while parsing the string. "
                                            "Did you forget (or accidentally add) a '\"' somewhere?");
            consume_to_end_quote(lexer);
            return false;
        }
        if (c == '\\')
        {
            int scanned = append_esc_string_token(destination, lexer->current, &len);
            if (scanned < 0)
            {
                add_error_token_at_current(lexer, "Invalid escape in string.");
                consume_to_end_quote(lexer);
                return false;
            }
            skip(lexer, scanned);
            continue;
        }
        destination[len++] = c;
    }
    // Skip the '"'.
    next(lexer);
    destination[len] = 0;
    new_token(lexer, TOKEN_STRING, destination);
    lexer->data.strlen = len;
    return true;
}

static inline bool scan_raw_string(Lexer *lexer)
{
    char c;
    while (1)
    {
        c = peek(lexer);
        next(lexer);
        if (c == '`' && peek(lexer) != '`') break;
        if (c == '\0')
        {
            return add_error_token_at_start(lexer, "Reached the end of the file looking for "
                                                   "the end of the raw string that starts "
                                                   "here. Did you forget a '`' somewhere?");
        }
        if (c == '`') next(lexer);
    }
    const char *current = lexer->lexing_start + 1;
    const char *end = lexer->current - 1;
    size_t len = (size_t)(end - current);
    char *destination = malloc_string(len + 1);
    len = 0;
    while (current < end)
    {
        c = *(current++);
        if (c == '`' && current[0] == '`')
        {
            current++;
        }
        destination[len++] = c;
    }
    destination[len] = 0;
    new_token(lexer, TOKEN_STRING, destination);
    lexer->data.strlen = len;
    return true;
}
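
// In raw strings a doubled backtick is the escape for a literal backtick:
// `a``b` produces the three-character string a`b.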

static inline bool scan_hex_array(Lexer *lexer)
{
    char start_char = peek(lexer);
    next(lexer); // Step past ', " or `
    char c;
    uint64_t len = 0;
    while (1)
    {
        c = peek(lexer);
        if (c == 0)
        {
            return add_error_token_at_current(lexer, "The hex string seems to be missing a terminating '%c'", start_char);
        }
        if (c == start_char) break;
        if (char_is_hex(c))
        {
            next(lexer);
            len++;
            continue;
        }
        if (char_is_whitespace(c))
        {
            next(lexer);
            continue;
        }
        if (c > ' ' && c < 127)
        {
            return add_error_token_at_current(lexer,
                                              "'%c' isn't a valid hexadecimal digit, all digits should be 0-9, a-f or A-F.",
                                              c);
        }
        return add_error_token_at_current(lexer,
                                          "This isn't a valid hexadecimal digit, all digits should be 0-9, a-f or A-F.");
    }
    next(lexer);
    if (len % 2)
    {
        return add_error_token(lexer, "The hexadecimal string is not an even length, did you miss a digit somewhere?");
    }
    if (!new_token(lexer, TOKEN_BYTES, lexer->lexing_start)) return false;
    lexer->data.is_base64 = false;
    lexer->data.bytes_len = (uint64_t)len / 2;
    return true;
}
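
// For example, x"4865 6C6C 6F" scans as TOKEN_BYTES with bytes_len 5:
// whitespace is skipped and the ten hex digits pair up into five bytes.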

// Scan b64"abc=" and b64'abc='.
static inline bool scan_base64(Lexer *lexer)
{
    next(lexer); // Step past 6
    next(lexer); // Step past 4
    char start_char = peek(lexer);
    next(lexer); // Step past ', " or `
    char c;
    unsigned end_len = 0;
    uint64_t len = 0;
    while (1)
    {
        c = peek(lexer);
        if (c == 0)
        {
            return add_error_token_at_start(lexer, "The base64 string seems to be missing a terminating '%c'", start_char);
        }
        next(lexer);
        if (c == start_char) break;
        if (char_is_base64(c))
        {
            if (end_len)
            {
                return add_error_token_at_current(lexer, "'%c' can't be placed after an ending '='", c);
            }
            len++;
            continue;
        }
        if (c == '=')
        {
            if (end_len > 1)
            {
                return add_error_token_at_current(lexer, "There cannot be more than 2 '=' at the end of a base64 string.");
            }
            end_len++;
            continue;
        }
        if (!char_is_whitespace(c))
        {
            if (c < ' ' || c > 127)
            {
                return add_error_token_at_current(lexer, "A valid base64 character was expected here.");
            }
            return add_error_token_at_current(lexer, "'%c' is not a valid base64 character.", c);
        }
    }
    if (!end_len && len % 4 != 0)
    {
        switch (len % 4)
        {
            case 0:
            case 1:
                // Invalid; caught by the check below.
                break;
            case 2:
                end_len = 2;
                break;
            case 3:
                end_len = 1;
                break;
            default:
                UNREACHABLE
        }
    }
    if ((len + end_len) % 4 != 0)
    {
        return add_error_token_at_start(lexer, "Base64 strings must either be padded to a multiple of 4, or if "
                                               "unpadded, only need 1 or 2 characters of implicit padding.");
    }
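    // Decoded size: the padded length len + end_len is a multiple of 4, each
    // 4-character group decodes to 3 bytes, and every '=' drops one byte, so
    // 3 * (len + end_len) / 4 - end_len == (3 * len - end_len) / 4.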
    uint64_t decoded_len = (3 * len - end_len) / 4;
    if (!new_token(lexer, TOKEN_BYTES, lexer->lexing_start)) return false;
    lexer->data.is_base64 = true;
    lexer->data.bytes_len = decoded_len;
    return true;
}


// --- Lexer doc lexing

/**
 * Parse the start of the <* ... *> doc comments, which may contain contract directives.
 **/
static bool parse_doc_start(Lexer *lexer)
{
    bool may_have_contract = true;
    // Let's loop until we find the end or a contract.
    while (!reached_end(lexer))
    {
        char c = peek(lexer);
        switch (c)
        {
            case '\n':
                may_have_contract = true;
                next(lexer);
                continue;
            case ' ':
            case '\t':
                next(lexer);
                continue;
            case '*':
                // We might have <* Hello *>
                if (peek_next(lexer) == '>') goto EXIT;
                may_have_contract = false;
                next(lexer);
                continue;
            case '@':
                if (may_have_contract && char_is_lower(peek_next(lexer)))
                {
                    // Found a contract.
                    goto EXIT;
                }
                FALLTHROUGH;
            default:
                may_have_contract = false;
                next(lexer);
                continue;
        }
    }
EXIT:;
    // Now we either found:
    // 1. "<* foo \n @param"
    // 2. "<* foo *>"
    // 3. "<* foo <eof>"
    //
    // In any case we can consider this having reached "the contracts".
    lexer->mode = LEX_CONTRACTS;
    return new_token(lexer, TOKEN_DOCS_START, lexer->lexing_start);
}

static bool lexer_scan_token_inner(Lexer *lexer)
{
    // Now skip the whitespace.
    skip_whitespace(lexer);

    // Point start to the first non-whitespace character.
    begin_new_token(lexer);

    if (reached_end(lexer)) return new_token(lexer, TOKEN_EOF, "\n") && false;

    char c = peek(lexer);
    next(lexer);
    switch (c)
    {
        case '\n':
            ASSERT0(lexer->mode == LEX_CONTRACTS);
            return new_token(lexer, TOKEN_DOCS_EOL, "<eol>");
        case '@':
            if (char_is_letter_(peek(lexer)))
            {
                return scan_ident(lexer, TOKEN_AT_IDENT, TOKEN_AT_CONST_IDENT, TOKEN_AT_TYPE_IDENT, '@');
            }
            return new_token(lexer, TOKEN_AT, "@");
        case '\'':
            return scan_char(lexer);
        case '`':
            return scan_raw_string(lexer);
        case '"':
            return scan_string(lexer);
        case '#':
            return scan_ident(lexer, TOKEN_HASH_IDENT, TOKEN_HASH_CONST_IDENT, TOKEN_HASH_TYPE_IDENT, '#');
        case '$':
            if (match(lexer, '$'))
            {
                if (char_is_letter(peek(lexer)))
                {
                    return new_token(lexer, TOKEN_BUILTIN, "$$");
                }
                return add_error_token_at_current(lexer, "Expected a letter after $$.");
            }
            return scan_ident(lexer, TOKEN_CT_IDENT, TOKEN_CT_CONST_IDENT, TOKEN_CT_TYPE_IDENT, '$');
        case ',':
            return new_token(lexer, TOKEN_COMMA, ",");
        case ';':
            return new_token(lexer, TOKEN_EOS, ";");
        case '{':
            return match(lexer, '|') ? new_token(lexer, TOKEN_LBRAPIPE, "{|") : new_token(lexer, TOKEN_LBRACE, "{");
        case '}':
            return new_token(lexer, TOKEN_RBRACE, "}");
        case '(':
            return match(lexer, '<') ? new_token(lexer, TOKEN_LGENPAR, "(<") : new_token(lexer, TOKEN_LPAREN, "(");
        case ')':
            return new_token(lexer, TOKEN_RPAREN, ")");
        case '[':
            if (match(lexer, '<')) return new_token(lexer, TOKEN_LVEC, "[<");
            return new_token(lexer, TOKEN_LBRACKET, "[");
        case ']':
            return new_token(lexer, TOKEN_RBRACKET, "]");
        case '.':
            if (match(lexer, '.'))
            {
                if (match(lexer, '.')) return new_token(lexer, TOKEN_ELLIPSIS, "...");
                return new_token(lexer, TOKEN_DOTDOT, "..");
            }
            return new_token(lexer, TOKEN_DOT, ".");
        case '~':
            return new_token(lexer, TOKEN_BIT_NOT, "~");
        case ':':
            return match(lexer, ':') ? new_token(lexer, TOKEN_SCOPE, "::") : new_token(lexer, TOKEN_COLON, ":");
        case '!':
            if (match(lexer, '!')) return new_token(lexer, TOKEN_BANGBANG, "!!");
            return match(lexer, '=') ? new_token(lexer, TOKEN_NOT_EQUAL, "!=") : new_token(lexer, TOKEN_BANG, "!");
        case '/':
            return match(lexer, '=') ? new_token(lexer, TOKEN_DIV_ASSIGN, "/=") : new_token(lexer, TOKEN_DIV, "/");
        case '*':
            if (lexer->mode == LEX_CONTRACTS && match(lexer, '>'))
            {
                lexer->mode = LEX_NORMAL;
                return new_token(lexer, TOKEN_DOCS_END, "*>");
            }
            return match(lexer, '=') ? new_token(lexer, TOKEN_MULT_ASSIGN, "*=") : new_token(lexer, TOKEN_STAR, "*");
        case '=':
            if (match(lexer, '>')) return new_token(lexer, TOKEN_IMPLIES, "=>");
            return match(lexer, '=') ? new_token(lexer, TOKEN_EQEQ, "==") : new_token(lexer, TOKEN_EQ, "=");
        case '^':
            return match(lexer, '=') ? new_token(lexer, TOKEN_BIT_XOR_ASSIGN, "^=") : new_token(lexer, TOKEN_BIT_XOR, "^");
        case '?':
            if (match(lexer, '?')) return new_token(lexer, TOKEN_QUESTQUEST, "??");
            return match(lexer, ':') ? new_token(lexer, TOKEN_ELVIS, "?:") : new_token(lexer, TOKEN_QUESTION, "?");
        case '<':
            if (match(lexer, '<'))
            {
                if (match(lexer, '=')) return new_token(lexer, TOKEN_SHL_ASSIGN, "<<=");
                return new_token(lexer, TOKEN_SHL, "<<");
            }
            if (lexer->mode == LEX_NORMAL && match(lexer, '*'))
            {
                return parse_doc_start(lexer);
            }
            return match(lexer, '=') ? new_token(lexer, TOKEN_LESS_EQ, "<=") : new_token(lexer, TOKEN_LESS, "<");
        case '>':
            if (match(lexer, '>'))
            {
                if (match(lexer, '=')) return new_token(lexer, TOKEN_SHR_ASSIGN, ">>=");
                return new_token(lexer, TOKEN_SHR, ">>");
            }
            if (match(lexer, ')')) return new_token(lexer, TOKEN_RGENPAR, ">)");
            if (match(lexer, ']')) return new_token(lexer, TOKEN_RVEC, ">]");
            return match(lexer, '=') ? new_token(lexer, TOKEN_GREATER_EQ, ">=") : new_token(lexer, TOKEN_GREATER, ">");
        case '%':
            return match(lexer, '=') ? new_token(lexer, TOKEN_MOD_ASSIGN, "%=") : new_token(lexer, TOKEN_MOD, "%");
        case '&':
            if (match(lexer, '&'))
            {
                return match(lexer, '&') ? new_token(lexer, TOKEN_CT_AND, "&&&") : new_token(lexer, TOKEN_AND, "&&");
            }
            return match(lexer, '=') ? new_token(lexer, TOKEN_BIT_AND_ASSIGN, "&=") : new_token(lexer, TOKEN_AMP, "&");
        case '|':
            if (match(lexer, '}')) return new_token(lexer, TOKEN_RBRAPIPE, "|}");
            if (match(lexer, '|'))
            {
                return match(lexer, '|') ? new_token(lexer, TOKEN_CT_OR, "|||") : new_token(lexer, TOKEN_OR, "||");
            }
            return match(lexer, '=') ? new_token(lexer, TOKEN_BIT_OR_ASSIGN, "|=") : new_token(lexer, TOKEN_BIT_OR, "|");
        case '+':
            if (match(lexer, '+'))
            {
                if (match(lexer, '+')) return new_token(lexer, TOKEN_CT_CONCAT, "+++");
                return new_token(lexer, TOKEN_PLUSPLUS, "++");
            }
            if (match(lexer, '=')) return new_token(lexer, TOKEN_PLUS_ASSIGN, "+=");
            return new_token(lexer, TOKEN_PLUS, "+");
        case '-':
            if (match(lexer, '>')) return new_token(lexer, TOKEN_ARROW, "->");
            if (match(lexer, '-')) return new_token(lexer, TOKEN_MINUSMINUS, "--");
            if (match(lexer, '=')) return new_token(lexer, TOKEN_MINUS_ASSIGN, "-=");
            return new_token(lexer, TOKEN_MINUS, "-");
        case 'x':
            if ((peek(lexer) == '"' || peek(lexer) == '\'' || peek(lexer) == '`'))
            {
                return scan_hex_array(lexer);
            }
            goto IDENT;
        case 'b':
            if (peek(lexer) == '6' && peek_next(lexer) == '4' && (lexer->current[2] == '\'' || lexer->current[2] == '"' || lexer->current[2] == '`'))
            {
                return scan_base64(lexer);
            }
            goto IDENT;
        case '_':
        IDENT:
            backtrack(lexer);
            return scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST_IDENT, TOKEN_TYPE_IDENT, 0);
        default:
            if (c >= '0' && c <= '9')
            {
                backtrack(lexer);
                return scan_digit(lexer);
            }
            if (c >= 'a' && c <= 'z') goto IDENT;
            if (c >= 'A' && c <= 'Z') goto IDENT;
            if (c < 0)
            {
                return add_error_token(lexer, "The 0x%x character may not be placed outside of a string or comment, did you forget a \" somewhere?", (uint8_t)c);
            }
            return add_error_token(lexer, "'%c' may not be placed outside of a string or comment, did you perhaps forget a \" somewhere?", c);
    }
}
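
// The byte sequences matched below are the UTF-8 encodings of the Unicode
// bidirectional controls guarded against in "Trojan Source" style attacks:
// 0xE2 0x80 0xAA..0xAE encode U+202A..U+202E (LRE, RLE, PDF, LRO, RLO) and
// 0xE2 0x81 0xA6..0xA9 encode U+2066..U+2069 (LRI, RLI, FSI, PDI), where
// PDF and PDI pop what the others push.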

INLINE void check_bidirectional_markers(Lexer *lexer)
{
    // First we check for bidirectional markers.
    const unsigned char *check = (const unsigned char *)lexer->current;
    unsigned c;
    int balance = 0;
    // Loop until the end.
    while ((c = *(check++)) != '\0')
    {
        if (c != 0xE2) continue;
        // Possible marker.
        unsigned char first = check[0];
        if (first == 0) break;
        unsigned char type = check[1];
        switch (first)
        {
            case 0x80:
                if (type == 0xAC)
                {
                    balance--;
                    if (balance < 0) goto DONE;
                }
                else if (type >= 0xAA && type <= 0xAE)
                {
                    balance++;
                }
                break;
            case 0x81:
                if (type >= 0xA6 && type <= 0xA8)
                {
                    balance++;
                }
                else if (type == 0xA9)
                {
                    balance--;
                    if (balance < 0) goto DONE;
                }
                break;
            default:
                break;
        }
    }
DONE:
    // Check for an unbalanced result.
    if (balance != 0)
    {
        add_error_token_at_start(lexer, "Invalid encoding - Unbalanced bidirectional markers.");
        return;
    }
}

// Initialize the lexer for a file.
void lexer_init(Lexer *lexer)
{
    // Set the current file.
    lexer->file_begin = lexer->file->contents;
    // Set current to the beginning.
    lexer->current = lexer->file_begin;
    // Line start is current.
    lexer->line_start = lexer->current;
    // Row numbering starts at 1.
    lexer->current_row = 1;
    // File id is the current file.
    lexer->tok_span.file_id = lexer->file->file_id;
    // Mode is NORMAL.
    lexer->mode = LEX_NORMAL;
    // Set up lexing for a new token.
    begin_new_token(lexer);
    // Check for bidirectional markers.
    check_bidirectional_markers(lexer);
}

bool lexer_next_token(Lexer *lexer)
{
    // Scan for a token.
    if (lexer_scan_token_inner(lexer)) return true;
    // Failed - check if we're at the end; if so, the EOF token
    // produced by the inner scan is a valid result.
    if (reached_end(lexer)) return true;
    // Scan through the rest of the text for other invalid tokens:
    bool token_is_ok = false;
    do
    {
        if (!token_is_ok)
        {
            // Scan to the end of the line if we have an error.
            while (!reached_end(lexer) && peek(lexer) != '\n') next(lexer);
        }
        token_is_ok = lexer_scan_token_inner(lexer);
    }
    while (!reached_end(lexer));
    // Done.
    return false;
}