Files
c3c/src/compiler/lexer.c

1477 lines
38 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright (c) 2019 Christoffer Lerno. All rights reserved.
// Use of this source code is governed by the GNU LGPLv3.0 license
// a copy of which can be found in the LICENSE file.
#include "compiler_internal.h"
static inline uint16_t check_col(intptr_t col)
{
if (col > 255) return 0;
return (uint16_t)col;
}
static inline unsigned check_row(intptr_t line)
{
return line > MAX_SOURCE_LOCATION_LEN ? 0 : (unsigned)line;
}
// --- Lexing general methods.
static bool lexer_scan_token_inner(Lexer *lexer);
static inline void begin_new_token(Lexer *lexer)
{
lexer->lexing_start = lexer->current;
lexer->start_row = lexer->current_row;
lexer->start_row_start = lexer->line_start;
}
// Peek at the current character in the buffer.
#define peek(lexer_) (*(lexer_)->current)
// Look at the prev character in the buffer.
#define prev(lexer_) ((lexer_)->current[-1])
// Peek one character ahead.
#define peek_next(lexer_) ((lexer_)->current[1])
// Is the current character '\0' if so we assume we reached the end.
#define reached_end(lexer_) (lexer_->current[0] == '\0')
// Step one character forward and return that character
INLINE char next(Lexer *lexer)
{
if (*lexer->current == '\n')
{
lexer->line_start = lexer->current + 1, lexer->current_row++;
}
return (++lexer->current)[0];
}
// Backtrack the buffer read one step.
static inline void backtrack(Lexer *lexer)
{
lexer->current--;
if (lexer->current[0] == '\n')
{
lexer->current_row--;
}
}
// Skip the x next characters.
static inline void skip(Lexer *lexer, int steps)
{
ASSERT0(steps > 0);
for (int i = 0; i < steps; i++)
{
next(lexer);
}
}
// Match a single character if successful, more one step forward.
static inline bool match(Lexer *lexer, char expected)
{
if (lexer->current[0] != expected) return false;
next(lexer);
return true;
}
// --- Token creation
/**
* Allocate data for a token, including source location.
* This call is doing the basic allocation, with other functions
* filling out additional information.
**/
static inline void set_generic_token(Lexer *lexer, TokenType type)
{
lexer->token_type = type;
// Set the location.
lexer->data.lex_len = lexer->current - lexer->lexing_start;
lexer->data.lex_start = lexer->lexing_start;
uint32_t line = lexer->start_row;
uint32_t col;
uint32_t length;
if (line == lexer->current_row)
{
// Col is simple difference.
col = check_col(lexer->lexing_start - lexer->line_start + 1);
// Length is diff between current and start.
length = check_row(lexer->current - lexer->lexing_start);
}
else
{
// For multiline, we grab the diff from the starting line.
col = check_col(lexer->lexing_start - lexer->start_row_start + 1);
// But always set a single token length.
length = 1;
}
lexer->tok_span.length = length;
lexer->tok_span.col = col;
lexer->tok_span.row = line;
}
// Error? We simply generate an invalid token and print out the error.
static bool add_error_token(Lexer *lexer, const char *message, ...)
{
set_generic_token(lexer, TOKEN_INVALID_TOKEN);
va_list list;
va_start(list, message);
sema_verror_range(lexer->tok_span, message, list);
va_end(list);
return false;
}
// Error at the start of the lexing, with a single length.
static bool add_error_token_at_start(Lexer *lexer, const char *message, ...)
{
va_list list;
va_start(list, message);
SourceSpan location = {
.file_id = lexer->file->file_id,
.row = lexer->start_row,
.length = 1,
.col = check_col((lexer->lexing_start - lexer->start_row_start) + 1),
};
sema_verror_range(location, message, list);
va_end(list);
set_generic_token(lexer, TOKEN_INVALID_TOKEN);
return false;
}
// Create an error token at a particular place in the file.
// used for pointing out errors in strings etc.
static bool add_error_token_at(Lexer *lexer, const char *loc, uint32_t len, const char *message, ...)
{
va_list list;
va_start(list, message);
uint32_t current_line = lexer->current_row;
if (len > MAX_SOURCE_LOCATION_LEN) len = 0;
SourceSpan location = {
.file_id = lexer->file->file_id,
.row = current_line,
.length = len,
.col = check_col((loc - lexer->line_start) + 1),
};
sema_verror_range(location, message, list);
va_end(list);
set_generic_token(lexer, TOKEN_INVALID_TOKEN);
return false;
}
// Print an error at the current location.
static bool add_error_token_at_current(Lexer *lexer, const char *message, ...)
{
va_list list;
va_start(list, message);
uint32_t current_line = lexer->current_row;
SourceSpan location = {
.file_id = lexer->file->file_id,
.row = current_line,
.length = 1,
.col = check_col((lexer->current - lexer->line_start) + 1),
};
sema_verror_range(location, message, list);
va_end(list);
set_generic_token(lexer, TOKEN_INVALID_TOKEN);
return false;
}
// Add a new regular token.
static inline bool new_token(Lexer *lexer, TokenType type, const char *string)
{
set_generic_token(lexer, type);
lexer->data.string = string;
return true;
}
// --- Comment parsing
/**
* Parsing of the "//" line comment - skipping past the end.
*/
static inline void parse_line_comment(Lexer *lexer)
{
while (!reached_end(lexer) && peek(lexer) != '\n')
{
next(lexer);
}
// If we found EOL, then walk past '\n'
if (peek(lexer) == '\n') next(lexer);
}
/**
* Parse the common / * * / style multiline comments, allowing nesting.
**/
static inline void parse_multiline_comment(Lexer *lexer)
{
int nesting = 1;
while (1)
{
switch (peek(lexer))
{
case '*':
if (peek_next(lexer) == '/')
{
skip(lexer, 2);
nesting--;
if (nesting == 0) return;
continue;
}
break;
case '/':
if (peek_next(lexer) == '*')
{
skip(lexer, 2);
nesting++;
continue;
}
break;
case '\0':
// Reached eof - end.
return;
default:
break;
}
next(lexer);
}
}
/**
* Skip regular whitespace.
*/
static void skip_whitespace(Lexer *lexer)
{
while (1)
{
switch (peek(lexer))
{
case '/':
if (lexer->mode == LEX_CONTRACTS) return;
// The '//' case
if (peek_next(lexer) == '/')
{
skip(lexer, 2);
parse_line_comment(lexer);
continue;
}
// '/*'
if (peek_next(lexer) == '*')
{
skip(lexer, 2);
parse_multiline_comment(lexer);
continue;
}
return;
case '\n':
// Contract lexing sees '\n' as a token.
if (lexer->mode == LEX_CONTRACTS) return;
FALLTHROUGH;
case ' ':
case '\t':
case '\f':
next(lexer);
break;
case '\r':
// Already filtered out.
UNREACHABLE
default:
return;
}
}
}
// --- Identifier scanning
// Parses identifiers. Note that this is a bit complicated here since
// we split identifiers into 2 types + find keywords.
static inline bool scan_ident(Lexer *lexer, TokenType normal, TokenType const_token, TokenType type_token, char prefix)
{
TokenType type = (TokenType)0;
uint32_t hash = FNV1_SEED;
if (prefix)
{
hash = FNV1a(prefix, hash);
}
char c;
while ((c = peek(lexer)) == '_')
{
hash = FNV1a(c, hash);
next(lexer);
}
while (1)
{
c = peek(lexer);
switch (c)
{
case LOWER_CHAR_CASE:
if (!type)
{
type = normal;
}
else if (type == const_token)
{
type = type_token;
}
break;
case UPPER_CHAR_CASE:
if (!type) type = const_token;
break;
case NUMBER_CHAR_CASE:
if (!type) return add_error_token(lexer, "A letter must precede any digit");
case '_':
break;
default:
goto EXIT;
}
hash = FNV1a(c, hash);
next(lexer);
}
// Allow bang!
if (peek(lexer) == '!' && type == normal)
{
hash = FNV1a('!', hash);
next(lexer);
}
EXIT:;
uint32_t len = (uint32_t)(lexer->current - lexer->lexing_start);
if (!type)
{
if (!prefix && len == 1) return new_token(lexer, TOKEN_UNDERSCORE, "_");
if (prefix && len == 1)
{
return add_error_token(lexer, "An identifier was expected after the '%c'.", prefix);
}
return add_error_token(lexer, "An identifier may not consist of only '_' characters.");
}
const char* interned_string = symtab_add(lexer->lexing_start, len, hash, &type);
switch (type)
{
case TOKEN_RETURN:
if (lexer->mode == LEX_CONTRACTS) type = TOKEN_IDENT;
break;
default:
break;
}
return new_token(lexer, type, interned_string);
}
// --- Number scanning
/**
* For C3 we use the practice of f<bit-width> u<bit-width> and i<bit-width>
* @param lexer
* @param is_float
* @return
*/
static bool scan_number_suffix(Lexer *lexer, bool *is_float)
{
char c = peek(lexer);
if (!char_is_alphanum_(c)) return true;
switch (c | 32)
{
case 'l':
c = next(lexer);
if (*is_float)
{
return add_error_token_at_current(lexer, "Integer suffix '%c' is not valid for a floating point literal.", c);
}
break;
case 'u':
if (*is_float)
{
return add_error_token_at_current(lexer, "Integer suffix '%c' is not valid for a floating point literal.", c);
}
c = next(lexer);
if ((c | 32) == 'l')
{
c = next(lexer);
break;
}
while (char_is_digit(c = peek(lexer))) next(lexer);
break;
case 'i':
if (*is_float)
{
return add_error_token_at_current(lexer, "Integer suffix '%c' is not valid for a floating point literal.", c);
}
next(lexer);
while (char_is_digit(c = peek(lexer))) next(lexer);
break;
case 'f':
next(lexer);
*is_float = true;
while (char_is_digit(c = peek(lexer))) next(lexer);
break;
default:
break;
}
if (char_is_alphanum_(c))
{
next(lexer);
return add_error_token(lexer, "This doesn't seem to be a valid literal.");
}
return true;
}
#define NEXT_AND_CHECK_NO_MULTIPLE_(lexer__) \
do { if (next(lexer__) == '_' && prev(lexer__) == '_') { \
return add_error_token_at_current(lexer__, "Multiple consecutive '_' are not allowed."); \
} } while(0);
/**
* Parsing octals. Here we depart from the (error prone) C style octals with initial zero e.g. 0231
* Instead we only support 0o prefix like 0o231. Note that lexing here doesn't actually parse the
* number itself.
*/
static bool scan_oct(Lexer *lexer)
{
if (!char_is_oct(peek(lexer)))
{
return add_error_token_at_current(lexer, "An expression starting with '0o' should be followed by octal numbers (0-7).");
}
next(lexer);
while (char_is_oct_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);
if (char_is_digit(peek(lexer)))
{
return add_error_token_at_current(lexer, "An expression starting with '0o' should be followed by octal numbers (0-7).");
}
bool is_float = false;
if (!scan_number_suffix(lexer, &is_float)) return false;
if (is_float)
{
return add_error_token(lexer, "Octal literals cannot have a floating point suffix.");
}
return new_token(lexer, TOKEN_INTEGER, lexer->lexing_start);
}
/**
* Binary style literals e.g. 0b10101011
**/
static bool scan_binary(Lexer *lexer)
{
if (!char_is_binary(peek(lexer)))
{
return add_error_token_at_current(lexer, "An expression starting with '0b' should be followed by binary digits (0-1).");
}
next(lexer);
while (char_is_binary_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);
if (char_is_digit(peek((lexer))))
{
return add_error_token_at_current(lexer, "An expression starting with '0b' should be followed by binary digits (0-1).");
}
bool is_float = false;
if (!scan_number_suffix(lexer, &is_float)) return false;
if (is_float)
{
return add_error_token(lexer, "Binary literals cannot have a floating point suffix.");
}
return new_token(lexer, TOKEN_INTEGER, lexer->lexing_start);
}
/**
* Scan the digit after the exponent, e.g +12 or -12 or 12
* @param lexer
* @return false if lexing failed.
*/
static inline bool scan_exponent(Lexer *lexer)
{
// Step past e/E or p/P
next(lexer);
char c = peek(lexer);
next(lexer);
// Step past +/-
if (c == '+' || c == '-')
{
c = peek(lexer);
next(lexer);
}
// Now we need at least one digit
if (!char_is_digit(c))
{
if (c == 0)
{
backtrack(lexer);
return add_error_token_at_current(lexer, "End of file was reached while parsing the exponent.");
}
if (c == '\n') return add_error_token(lexer, "End of line was reached while parsing the exponent.");
if (c < 31 || c > 127) add_error_token(lexer, "An unexpected character was found while parsing the exponent.");
return add_error_token(lexer, "Parsing the floating point exponent failed, because '%c' is not a number.", c);
}
// Step through all the digits.
while (char_is_digit(peek(lexer))) next(lexer);
return true;
}
/**
* Scan a hex number, including floating point hex numbers of the format 0x31a31ff.21p12. Note that the
* exponent is written in decimal.
**/
static inline bool scan_hex(Lexer *lexer)
{
if (!char_is_hex(peek(lexer)))
{
return add_error_token_at_current(lexer, "'0x' starts a hexadecimal number, so the next character should be 0-9, a-f or A-F.");
}
next(lexer);
while (char_is_hex_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);
bool is_float = false;
if (peek(lexer) == '.' && peek_next(lexer) != '.')
{
is_float = true;
next(lexer);
char c = peek(lexer);
if (c == '_') return add_error_token_at_current(lexer, "'_' is not allowed directly after decimal point, try removing it.");
if (char_is_hex(c)) next(lexer);
while (char_is_hex_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);
}
char c = peek(lexer);
if (c == 'p' || c == 'P')
{
is_float = true;
if (!scan_exponent(lexer)) return false;
}
if (prev(lexer) == '_')
{
backtrack(lexer);
return add_error_token_at_current(lexer, "The number ended with '_', which isn't allowed, please remove it.");
}
if (!scan_number_suffix(lexer, &is_float)) return false;
return new_token(lexer, is_float ? TOKEN_REAL : TOKEN_INTEGER, lexer->lexing_start);
}
/**
* Scans integer and float decimal values.
*/
static inline bool scan_dec(Lexer *lexer)
{
ASSERT0(char_is_digit(peek(lexer)));
// Walk through the digits, we don't need to worry about
// initial _ because we only call this if we have a digit initially.
while (char_is_digit_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);
// Assume no float.
bool is_float = false;
// If we have a single dot, we assume that we have a float.
// Note that this current parsing means we can't have functions on
// literals, like "123.sizeof", but we're fine with that.
if (peek(lexer) == '.' && peek_next(lexer) != '.')
{
is_float = true;
// Step past '.'
next(lexer);
// Check our rule to disallow 123._32
char c = peek(lexer);
if (c == '_') return add_error_token_at_current(lexer, "'_' is not allowed directly after decimal point, try removing it.");
// Now walk until we see no more digits.
// This allows 123. as a floating point number.
while (char_is_digit_or_(peek(lexer))) NEXT_AND_CHECK_NO_MULTIPLE_(lexer);
}
char c = peek(lexer);
// We might have an exponential. We allow 123e1 and 123.e1 as floating point, so
// just set it to floating point and check the exponential.
if (c == 'e' || c == 'E')
{
is_float = true;
if (!scan_exponent(lexer)) return false;
}
if (prev(lexer) == '_')
{
backtrack(lexer);
return add_error_token_at_current(lexer, "The number ended with '_', which isn't allowed, please remove it.");
}
if (!scan_number_suffix(lexer, &is_float)) return false;
return new_token(lexer, is_float ? TOKEN_REAL : TOKEN_INTEGER, lexer->lexing_start);
}
/**
* Scan a digit, switching on initial zero on possible parsing schemes:
* 0x... -> Hex
* 0o... -> Octal
* 0b... -> Binary
*
* Default is decimal.
*
* It's actually pretty simple to add encoding schemes here, so for example Base64 could
* be added.
*/
static inline bool scan_digit(Lexer *lexer)
{
if (peek(lexer) == '0')
{
switch (peek_next(lexer))
{
case 'x':
case 'X':
skip(lexer, 2);
return scan_hex(lexer);
case 'o':
case 'O':
skip(lexer, 2);
return scan_oct(lexer);
case 'b':
case 'B':
skip(lexer, 2);
return scan_binary(lexer);
default:
break;
}
}
return scan_dec(lexer);
}
// --- Character & string scan
static inline int64_t scan_hex_literal(Lexer *lexer, int positions)
{
int64_t hex = 0;
for (int j = 0; j < positions; j++)
{
hex <<= 4U;
int i = char_hex_to_nibble(peek(lexer));
if (i < 0)
{
return -1;
}
next(lexer);
hex += i;
}
return hex;
}
static inline int64_t scan_utf8(Lexer *lexer, unsigned char c)
{
int utf8_bytes;
uint64_t result;
if (c < 0xc0) goto ERROR;
if (c <= 0xdf)
{
result = 0x1f & c;
utf8_bytes = 2;
}
else if (c <= 0xef)
{
result = 0xf & c;
utf8_bytes = 3;
}
else if (c <= 0xf7)
{
utf8_bytes = 4;
result = 0x7 & c;
}
else if (c <= 0xfb)
{
utf8_bytes = 5;
result = 0x3 & c;
}
else if (c <= 0xfd)
{
utf8_bytes = 6;
result = 0x1 & c;
}
else
{
goto ERROR;
}
for (int i = 1; i < utf8_bytes; i++)
{
result <<= 6U;
c = (unsigned char)peek(lexer);
if (c == '\0') return 0xFFFD;
next(lexer);
if ((c & 0xc0) != 0x80)
{
goto ERROR;
}
result += c & 0x3f;
}
return (int64_t)result;
ERROR:
add_error_token(lexer, "Invalid UTF-8 sequence.");
return -1;
}
/**
* Rules:
* 1. If ASCII or \xAB, accept up to 8 characters (16 with Int128 support), size is char, ushort, uint, ulong, UInt128
* 2. If UTF8, accept 1 UTF character, size is ushort normally, ulong on wide UTF characters, no additional characters accepted.
* 3. If \uABCD, convert to 16 bits, size is ushort, no additional characters accepted.
* 4. If \U01234567, convert to 32 bits, size is uint, no additional characters accepted.
*
* @param lexer
* @return
*/
static inline bool scan_char(Lexer *lexer)
{
// Handle the problem with zero size character literal first.
if (match(lexer, '\''))
{
return add_error_token(lexer, "The character literal was empty.");
}
int width = 0;
char c;
Int128 b = { 0, 0 };
while (!match(lexer, '\''))
{
c = peek(lexer);
next(lexer);
// End of file may occur:
if (c == '\0')
{
return add_error_token_at_start(lexer, "The character literal did not terminate.");
}
// We might exceed the width that we allow.
if (width > 15) return add_error_token_at_start(lexer, "The character literal exceeds 16 characters.");
// Handle (expected) utf-8 characters.
if ((unsigned)c >= (unsigned)0x80)
{
if (width != 0) goto UNICODE_IN_MULTI;
int64_t utf8 = scan_utf8(lexer, (unsigned char)c);
if (utf8 < 0) return false;
if (!match(lexer, '\''))
{
if (peek(lexer) == '\0') continue;
backtrack(lexer);
return add_error_token_at_current(lexer, "Unicode character literals may only contain one character, "
"please remove the additional ones or use all ASCII.");
}
b.low = (uint64_t) utf8;
width = utf8 > 0xffff ? 4 : 2;
goto DONE;
}
// Parse the escape code
signed char escape = ' ';
if (c == '\\')
{
ASSERT0(c == '\\');
c = peek(lexer);
escape = char_is_valid_escape(c);
if (escape == -1)
{
lexer->lexing_start += 1;
if (c > ' ' && c <= 127)
{
next(lexer);
return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c);
}
return add_error_token_at_current(lexer, "An escape sequence was expected after '\\'.");
}
next(lexer);
}
const char *escape_begin = lexer->current - 2;
switch (escape)
{
case 'x':
{
int64_t hex = scan_hex_literal(lexer, 2);
if (hex < 0)
{
return add_error_token_at(lexer, escape_begin, lexer->current - escape_begin, "Expected a two character hex value after \\x.");
}
// We can now reassign c and use the default code.
c = (char)hex;
break;
}
case 'u':
case 'U':
{
// First check that we don't have any characters previous to this one.
if (width != 0) goto UNICODE_IN_MULTI;
int bytes = escape == 'U' ? 4 : 2;
int64_t hex = scan_hex_literal(lexer, bytes * 2);
// The hex parsing may have failed, lacking more hex chars.
if (hex < 0)
{
begin_new_token(lexer);
return add_error_token_at(lexer, escape_begin, lexer->current - escape_begin,
"Expected %s character hex value after \\%c.",
escape == 'u' ? "a four" : "an eight", escape);
}
// If we don't see the end here, then something is wrong.
if (!match(lexer, '\''))
{
// It may be the end of the line, if so use the default handling by invoking "continue"
if (peek(lexer) == '\0') continue;
return add_error_token_at_current(lexer,
"Character literals with '\\%c' can only contain one character, please remove this one.",
escape);
}
// Assign the value and go to "DONE".
b.low = (uint64_t) hex;
width = bytes;
goto DONE;
}
case ' ':
// No escape, a regular character.
break;
default:
c = (signed char)escape;
break;
}
// Default handling here:
width++;
b = i128_shl64(b, 8);
b = i128_add64(b, (unsigned char)c);
}
ASSERT0(width > 0 && width <= 16);
DONE:
set_generic_token(lexer, TOKEN_CHAR_LITERAL);
lexer->data.char_value = b;
lexer->data.width = (char)width;
return true;
UNICODE_IN_MULTI:
return add_error_token(lexer, "A multi-character literal may not contain unicode characters.");
}
static int append_esc_string_token(char *restrict dest, const char *restrict src, size_t *pos)
{
int scanned;
uint64_t unicode_char;
signed char scanned_char = char_is_valid_escape(src[0]);
if (scanned_char < 0) return -1;
switch (scanned_char)
{
case 'x':
{
int h = char_hex_to_nibble(src[1]);
if (h < 0) return -1;
int l = char_hex_to_nibble(src[2]);
if (l < 0) return -1;
unicode_char = ((unsigned) h << 4U) + (unsigned)l;
scanned = 3;
break;
}
case 'u':
{
int x1 = char_hex_to_nibble(src[1]);
if (x1 < 0) return -1;
int x2 = char_hex_to_nibble(src[2]);
if (x2 < 0) return -1;
int x3 = char_hex_to_nibble(src[3]);
if (x3 < 0) return -1;
int x4 = char_hex_to_nibble(src[4]);
if (x4 < 0) return -1;
unicode_char = ((unsigned) x1 << 12U) + ((unsigned) x2 << 8U) + ((unsigned) x3 << 4U) + (unsigned)x4;
scanned = 5;
break;
}
case 'U':
{
int x1 = char_hex_to_nibble(src[1]);
if (x1 < 0) return -1;
int x2 = char_hex_to_nibble(src[2]);
if (x2 < 0) return -1;
int x3 = char_hex_to_nibble(src[3]);
if (x3 < 0) return -1;
int x4 = char_hex_to_nibble(src[4]);
if (x4 < 0) return -1;
int x5 = char_hex_to_nibble(src[5]);
if (x5 < 0) return -1;
int x6 = char_hex_to_nibble(src[6]);
if (x6 < 0) return -1;
int x7 = char_hex_to_nibble(src[7]);
if (x7 < 0) return -1;
int x8 = char_hex_to_nibble(src[8]);
if (x8 < 0) return -1;
unicode_char = ((unsigned) x1 << 28U) + ((unsigned) x2 << 24U) + ((unsigned) x3 << 20U) + ((unsigned) x4 << 16U) +
((unsigned) x5 << 12U) + ((unsigned) x6 << 8U) + ((unsigned) x7 << 4U) + (unsigned)x8;
scanned = 9;
break;
}
default:
dest[(*pos)++] = scanned_char;
return 1;
}
if (unicode_char < 0x80U)
{
dest[(*pos)++] = (char)unicode_char;
}
else if (unicode_char < 0x800U)
{
dest[(*pos)++] = (char)(0xC0U | (unicode_char >> 6U));
dest[(*pos)++] = (char)(0x80U | (unicode_char & 0x3FU));
}
else if (unicode_char < 0x10000U)
{
dest[(*pos)++] = (char)(0xE0U | (unicode_char >> 12U));
dest[(*pos)++] = (char)(0x80U | ((unicode_char >> 6U) & 0x3FU));
dest[(*pos)++] = (char)(0x80U | (unicode_char & 0x3FU));
}
else
{
dest[(*pos)++] = (char)(0xF0U | (unicode_char >> 18U));
dest[(*pos)++] = (char)(0x80U | ((unicode_char >> 12U) & 0x3FU));
dest[(*pos)++] = (char)(0x80U | ((unicode_char >> 6U) & 0x3FU));
dest[(*pos)++] = (char)(0x80U | (unicode_char & 0x3FU));
}
return scanned;
}
static inline void consume_to_end_quote(Lexer *lexer)
{
char c;
while ((c = peek(lexer)) != '\0' && c != '"')
{
next(lexer);
}
}
static inline bool scan_string(Lexer *lexer)
{
char c = 0;
const char *current = lexer->current;
while ((c = *(current++)) != '"')
{
if (c == '\n' || c == '\0')
{
current++;
break;
}
if (c == '\\')
{
c = *current;
if (c != '\n' && c != '\0') current++;
continue;
}
}
const char *end = current - 1;
char *destination = malloc_string((size_t)(end - lexer->current + 1));
size_t len = 0;
while (lexer->current < end)
{
c = peek(lexer);
next(lexer);
if (c == '\0' || (c == '\\' && peek(lexer) == '\0'))
{
if (c == '\0') backtrack(lexer);
add_error_token_at_start(lexer, "The end of the file was reached "
"while parsing the string. "
"Did you forget (or accidentally add) a '\"' somewhere?");
consume_to_end_quote(lexer);
return false;
}
if (c == '\n' || (c == '\\' && peek(lexer) == '\n'))
{
backtrack(lexer);
add_error_token_at_start(lexer, "The end of the line was reached "
"while parsing the string. "
"Did you forget (or accidentally add) a '\"' somewhere?");
consume_to_end_quote(lexer);
return false;
}
if (c == '\\')
{
int scanned = append_esc_string_token(destination, lexer->current, &len);
if (scanned < 0)
{
add_error_token_at_current(lexer, "Invalid escape in string.");
consume_to_end_quote(lexer);
return false;
}
skip(lexer, scanned);
continue;
}
destination[len++] = c;
}
// Skip the `"`
next(lexer);
destination[len] = 0;
new_token(lexer, TOKEN_STRING, destination);
lexer->data.strlen = len;
return true;
}
static inline bool scan_raw_string(Lexer *lexer)
{
char c;
while (1)
{
c = peek(lexer);
next(lexer);
if (c == '`' && peek(lexer) != '`') break;
if (c == '\0')
{
return add_error_token_at_start(lexer, "Reached the end of the file looking for "
"the end of the raw string that starts "
"here. Did you forget a '`' somewhere?");
}
if (c == '`') next(lexer);
}
const char *current = lexer->lexing_start + 1;
const char *end = lexer->current - 1;
size_t len = (size_t)(end - current);
char *destination = malloc_string(len + 1);
len = 0;
while (current < end)
{
c = *(current++);
if (c == '`' && current[0] == '`')
{
current++;
}
destination[len++] = c;
}
destination[len] = 0;
new_token(lexer, TOKEN_STRING, destination);
lexer->data.strlen = len;
return true;
}
static inline bool scan_hex_array(Lexer *lexer)
{
char start_char = peek(lexer);
next(lexer); // Step past ' or " `
char c;
uint64_t len = 0;
while (1)
{
c = peek(lexer);
if (c == 0)
{
return add_error_token_at_current(lexer, "The hex string seems to be missing a terminating '%c'", start_char);
}
if (c == start_char) break;
if (char_is_hex(c))
{
next(lexer);
len++;
continue;
}
if (char_is_whitespace(c))
{
next(lexer);
continue;
}
if (c > ' ' && c < 127)
{
return add_error_token_at_current(lexer,
"'%c' isn't a valid hexadecimal digit, all digits should be a-z, A-Z and 0-9.",
c);
}
return add_error_token_at_current(lexer,
"This isn't a valid hexadecimal digit, all digits should be a-z, A-Z and 0-9.");
}
next(lexer);
if (len % 2)
{
return add_error_token(lexer, "The hexadecimal string is not an even length, did you miss a digit somewhere?");
}
if (!new_token(lexer, TOKEN_BYTES, lexer->lexing_start)) return false;
lexer->data.is_base64 = false;
lexer->data.bytes_len = (uint64_t)len / 2;
return true;
}
// Scan b64"abc=" and b64'abc='
static inline bool scan_base64(Lexer *lexer)
{
next(lexer); // Step past 6
next(lexer); // Step past 4
char start_char = peek(lexer);
next(lexer); // Step past ' or " or `
char c;
unsigned end_len = 0;
uint64_t len = 0;
while (1)
{
c = peek(lexer);
if (c == 0)
{
return add_error_token_at_start(lexer, "The base64 string seems to be missing a terminating '%c'", start_char);
}
next(lexer);
if (c == start_char) break;
if (char_is_base64(c))
{
if (end_len)
{
return add_error_token_at_current(lexer, "'%c' can't be placed after an ending '='", c);
}
len++;
continue;
}
if (c == '=')
{
if (end_len > 1)
{
return add_error_token_at_current(lexer, "There cannot be more than 2 '=' at the end of a base64 string.", c);
}
end_len++;
continue;
}
if (!char_is_whitespace(c))
{
if (c < ' ' || c > 127)
{
return add_error_token_at_current(lexer, "A valid base64 character was expected here.");
}
return add_error_token_at_current(lexer, "'%c' is not a valid base64 character.", c);
}
}
if (!end_len && len % 4 != 0)
{
switch (len % 4)
{
case 0:
case 1:
// Invalid
break;
case 2:
end_len = 2;
break;
case 3:
end_len = 1;
break;
default:
UNREACHABLE
}
if (len % 4 == 3)
{
end_len = 1;
}
}
if ((len + end_len) % 4 != 0)
{
return add_error_token_at_start(lexer, "Base64 strings must either be padded to multiple of 4, or if unpadded "
"- only need 1 or 2 bytes of extra padding.");
}
uint64_t decoded_len = (3 * len - end_len) / 4;
if (!new_token(lexer, TOKEN_BYTES, lexer->lexing_start)) return false;
lexer->data.is_base64 = true;
lexer->data.bytes_len = decoded_len;
return true;
}
// --- Lexer doc lexing
/**
* Parse the <* *> directives comments
**/
static bool parse_doc_start(Lexer *lexer)
{
bool may_have_contract = true;
// Let's loop until we find the end or the contract.
while (!reached_end(lexer))
{
char c = peek(lexer);
switch (c)
{
case '\n':
may_have_contract = true;
next(lexer);
continue;
case ' ':
case '\t':
next(lexer);
continue;
case '*':
// We might have <* Hello *>
if (peek_next(lexer) == '>') goto EXIT;
may_have_contract = false;
next(lexer);
continue;
case '@':
if (may_have_contract && char_is_lower(peek_next(lexer)))
{
// Found a contract
goto EXIT;
}
FALLTHROUGH;
default:
may_have_contract = false;
next(lexer);
continue;
}
}
EXIT:;
// Now we either found:
// 1. "<* foo \n @param"
// 2. "<* foo *>"
// 3. "<* foo <eof>"
//
// In any case we can consider this having reached "the contracts"
lexer->mode = LEX_CONTRACTS;
return new_token(lexer, TOKEN_DOCS_START, lexer->lexing_start);
}
static bool lexer_scan_token_inner(Lexer *lexer)
{
// Now skip the whitespace.
skip_whitespace(lexer);
// Point start to the first non-whitespace character.
begin_new_token(lexer);
if (reached_end(lexer)) return new_token(lexer, TOKEN_EOF, "\n") && false;
char c = peek(lexer);
next(lexer);
switch (c)
{
case '\n':
ASSERT0(lexer->mode == LEX_CONTRACTS);
return new_token(lexer, TOKEN_DOCS_EOL, "<eol>");
case '@':
if (char_is_letter_(peek(lexer)))
{
return scan_ident(lexer, TOKEN_AT_IDENT, TOKEN_AT_CONST_IDENT, TOKEN_AT_TYPE_IDENT, '@');
}
return new_token(lexer, TOKEN_AT, "@");
case '\'':
return scan_char(lexer);
case '`':
return scan_raw_string(lexer);
case '"':
return scan_string(lexer);
case '#':
return scan_ident(lexer, TOKEN_HASH_IDENT, TOKEN_HASH_CONST_IDENT, TOKEN_HASH_TYPE_IDENT, '#');
case '$':
if (match(lexer, '$'))
{
if (char_is_letter(peek(lexer)))
{
return new_token(lexer, TOKEN_BUILTIN, "$$");
}
return add_error_token_at_current(lexer, "Expected a letter after $$.");
}
return scan_ident(lexer, TOKEN_CT_IDENT, TOKEN_CT_CONST_IDENT, TOKEN_CT_TYPE_IDENT, '$');
case ',':
return new_token(lexer, TOKEN_COMMA, ",");
case ';':
return new_token(lexer, TOKEN_EOS, ";");
case '{':
return match(lexer, '|') ? new_token(lexer, TOKEN_LBRAPIPE, "{|") : new_token(lexer, TOKEN_LBRACE, "{");
case '}':
return new_token(lexer, TOKEN_RBRACE, "}");
case '(':
return match(lexer, '<') ? new_token(lexer, TOKEN_LGENPAR, "(<") : new_token(lexer, TOKEN_LPAREN, "(");
case ')':
return new_token(lexer, TOKEN_RPAREN, ")");
case '[':
if (match(lexer, '<')) return new_token(lexer, TOKEN_LVEC, "[<");
return new_token(lexer, TOKEN_LBRACKET, "[");
case ']':
return new_token(lexer, TOKEN_RBRACKET, "]");
case '.':
if (match(lexer, '.'))
{
if (match(lexer, '.')) return new_token(lexer, TOKEN_ELLIPSIS, "...");
return new_token(lexer, TOKEN_DOTDOT, "..");
}
return new_token(lexer, TOKEN_DOT, ".");
case '~':
return new_token(lexer, TOKEN_BIT_NOT, "~");
case ':':
return match(lexer, ':') ? new_token(lexer, TOKEN_SCOPE, "::") : new_token(lexer, TOKEN_COLON, ":");
case '!':
if (match(lexer, '!')) return new_token(lexer, TOKEN_BANGBANG, "!!");
return match(lexer, '=') ? new_token(lexer, TOKEN_NOT_EQUAL, "!=") : new_token(lexer, TOKEN_BANG, "!");
case '/':
return match(lexer, '=') ? new_token(lexer, TOKEN_DIV_ASSIGN, "/=") : new_token(lexer, TOKEN_DIV, "/");
case '*':
if (lexer->mode == LEX_CONTRACTS && match(lexer, '>'))
{
lexer->mode = LEX_NORMAL;
return new_token(lexer, TOKEN_DOCS_END, "*>");
}
return match(lexer, '=') ? new_token(lexer, TOKEN_MULT_ASSIGN, "*=") : new_token(lexer, TOKEN_STAR, "*");
case '=':
if (match(lexer, '>')) return new_token(lexer, TOKEN_IMPLIES, "=>");
return match(lexer, '=') ? new_token(lexer, TOKEN_EQEQ, "==") : new_token(lexer, TOKEN_EQ, "=");
case '^':
return match(lexer, '=') ? new_token(lexer, TOKEN_BIT_XOR_ASSIGN, "^=") : new_token(lexer, TOKEN_BIT_XOR, "^");
case '?':
if (match(lexer, '?')) return new_token(lexer, TOKEN_QUESTQUEST, "??");
return match(lexer, ':') ? new_token(lexer, TOKEN_ELVIS, "?:") : new_token(lexer, TOKEN_QUESTION, "?");
case '<':
if (match(lexer, '<'))
{
if (match(lexer, '=')) return new_token(lexer, TOKEN_SHL_ASSIGN, "<<=");
return new_token(lexer, TOKEN_SHL, "<<");
}
if (lexer->mode == LEX_NORMAL && match(lexer, '*'))
{
return parse_doc_start(lexer);
}
return match(lexer, '=') ? new_token(lexer, TOKEN_LESS_EQ, "<=") : new_token(lexer, TOKEN_LESS, "<");
case '>':
if (match(lexer, '>'))
{
if (match(lexer, '=')) return new_token(lexer, TOKEN_SHR_ASSIGN, ">>=");
return new_token(lexer, TOKEN_SHR, ">>");
}
if (match(lexer, ')')) return new_token(lexer, TOKEN_RGENPAR, ">)");
if (match(lexer, ']')) return new_token(lexer, TOKEN_RVEC, ">]");
return match(lexer, '=') ? new_token(lexer, TOKEN_GREATER_EQ, ">=") : new_token(lexer, TOKEN_GREATER, ">");
case '%':
return match(lexer, '=') ? new_token(lexer, TOKEN_MOD_ASSIGN, "%=") : new_token(lexer, TOKEN_MOD, "%");
case '&':
if (match(lexer, '&'))
{
return match(lexer, '&') ? new_token(lexer, TOKEN_CT_AND, "&&&") : new_token(lexer, TOKEN_AND, "&&");
}
return match(lexer, '=') ? new_token(lexer, TOKEN_BIT_AND_ASSIGN, "&=") : new_token(lexer, TOKEN_AMP, "&");
case '|':
if (match(lexer, '}')) return new_token(lexer, TOKEN_RBRAPIPE, "|}");
if (match(lexer, '|'))
{
return match(lexer, '|') ? new_token(lexer, TOKEN_CT_OR, "|||") : new_token(lexer, TOKEN_OR, "||");
}
return match(lexer, '=') ? new_token(lexer, TOKEN_BIT_OR_ASSIGN, "|=") : new_token(lexer, TOKEN_BIT_OR,
"|");
case '+':
if (match(lexer, '+'))
{
if (match(lexer, '+')) return new_token(lexer, TOKEN_CT_CONCAT, "+++");
return new_token(lexer, TOKEN_PLUSPLUS, "++");
}
if (match(lexer, '=')) return new_token(lexer, TOKEN_PLUS_ASSIGN, "+=");
return new_token(lexer, TOKEN_PLUS, "+");
case '-':
if (match(lexer, '>')) return new_token(lexer, TOKEN_ARROW, "->");
if (match(lexer, '-')) return new_token(lexer, TOKEN_MINUSMINUS, "--");
if (match(lexer, '=')) return new_token(lexer, TOKEN_MINUS_ASSIGN, "-=");
return new_token(lexer, TOKEN_MINUS, "-");
case 'x':
if ((peek(lexer) == '"' || peek(lexer) == '\'' || peek(lexer) == '`'))
{
return scan_hex_array(lexer);
}
goto IDENT;
case 'b':
if (peek(lexer) == '6' && peek_next(lexer) == '4' && (lexer->current[2] == '\'' || lexer->current[2] == '"' || lexer->current[2] == '`'))
{
return scan_base64(lexer);
}
goto IDENT;
case '_':
IDENT:
backtrack(lexer);
return scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST_IDENT, TOKEN_TYPE_IDENT, 0);
default:
if (c >= '0' && c <= '9')
{
backtrack(lexer);
return scan_digit(lexer);
}
if (c >= 'a' && c <= 'z') goto IDENT;
if (c >= 'A' && c <= 'Z') goto IDENT;
if (c < 0)
{
return add_error_token(lexer, "The 0x%x character may not be placed outside of a string or comment, did you forget a \" somewhere?", (uint8_t)c);
}
return add_error_token(lexer, "'%c' may not be placed outside of a string or comment, did you perhaps forget a \" somewhere?", c);
}
}
INLINE void check_bidirectional_markers(Lexer *lexer)
{
// First we check for bidirectional markers.
const unsigned char *check = (const unsigned char *)lexer->current;
unsigned c;
int balance = 0;
// Loop until end.
while ((c = *(check++)) != '\0')
{
if (c != 0xE2) continue;
// Possible marker.
unsigned char next = check[0];
if (next == 0) break;
unsigned char type = check[1];
switch (check[0])
{
case 0x80:
if (type == 0xAC)
{
balance--;
if (balance < 0) goto DONE;
}
if (type >= 0xAA && type <= 0xAE)
{
balance++;
}
break;
case 0x81:
if (type >= 0xA6 && type <= 0xA8)
{
balance++;
}
else if (type == 0xA9)
{
balance--;
if (balance < 0) goto DONE;
}
break;
default:
break;
}
}
DONE:
// Check for unbalanced result
if (balance != 0)
{
add_error_token_at_start(lexer, "Invalid encoding - Unbalanced bidirectional markers.");
return;
}
}
// Initialize a file
void lexer_init(Lexer *lexer)
{
// Set the current file.
lexer->file_begin = lexer->file->contents;
// Set current to beginning.
lexer->current = lexer->file_begin;
// Line start is current.
lexer->line_start = lexer->current;
// Row number starts at 1
lexer->current_row = 1;
// File id is the current file.
lexer->tok_span.file_id = lexer->file->file_id;
// Mode is NORMAL
lexer->mode = LEX_NORMAL;
// Set up lexing for a new token.
begin_new_token(lexer);
// Check for bidirectional markers.
check_bidirectional_markers(lexer);
}
bool lexer_next_token(Lexer *lexer)
{
// Scan for a token.
if (lexer_scan_token_inner(lexer)) return true;
// Failed, so check if we're at end:
if (reached_end(lexer)) return true;
// Scan through the rest of the text for other invalid tokens:
bool token_is_ok = false;
do
{
if (!token_is_ok)
{
// Scan to the end of the line if we have an error.
while (!reached_end(lexer) && peek(lexer) != '\n') next(lexer);
}
token_is_ok = lexer_scan_token_inner(lexer);
}
while (!reached_end(lexer));
// Done.
return false;
}