mirror of
https://github.com/c3lang/c3c.git
synced 2026-02-27 12:01:16 +00:00
1807 lines
47 KiB
C
1807 lines
47 KiB
C
// Copyright (c) 2019 Christoffer Lerno. All rights reserved.
|
||
// Use of this source code is governed by the GNU LGPLv3.0 license
|
||
// a copy of which can be found in the LICENSE file.
|
||
|
||
#include "compiler_internal.h"
|
||
|
||
typedef enum
|
||
{
|
||
LEX_NORMAL,
|
||
LEX_DOCS,
|
||
} LexMode;
|
||
|
||
typedef enum
|
||
{
|
||
DOC_END_EOF,
|
||
DOC_END_LAST,
|
||
DOC_END_EOL,
|
||
DOC_END_ERROR,
|
||
} DocEnd;
|
||
|
||
// --- Lexing general methods.
|
||
|
||
static bool lexer_scan_token_inner(Lexer *lexer, LexMode mode);
|
||
|
||
// Peek at the current character in the buffer.
|
||
static inline char peek(Lexer *lexer)
|
||
{
|
||
return *lexer->current;
|
||
}
|
||
|
||
// Look at the prev character in the buffer.
|
||
static inline char prev(Lexer *lexer)
|
||
{
|
||
return lexer->current[-1];
|
||
}
|
||
|
||
// Backtrack the buffer read one step.
|
||
static inline void backtrack(Lexer *lexer)
|
||
{
|
||
lexer->current--;
|
||
}
|
||
|
||
// Store a line ending (and current line start at the current character)
|
||
void lexer_store_line_end(Lexer *lexer)
|
||
{
|
||
lexer->current_line++;
|
||
lexer->line_start = lexer->current + 1;
|
||
source_file_append_line_end(lexer->current_file, (SourceLoc)(lexer->current_file->start_id + lexer->current - lexer->file_begin));
|
||
}
|
||
|
||
// Peek one character ahead.
|
||
static inline char peek_next(Lexer *lexer)
|
||
{
|
||
return lexer->current[1];
|
||
}
|
||
|
||
// Return the current character and step one character forward.
|
||
static inline char next(Lexer *lexer)
|
||
{
|
||
return *(lexer->current++);
|
||
}
|
||
|
||
// Skip the x next characters.
|
||
static inline void skip(Lexer *lexer, int steps)
|
||
{
|
||
assert(steps > 0);
|
||
lexer->current += steps;
|
||
}
|
||
|
||
// Is the current character '\0' if so we assume we reached the end.
|
||
static inline bool reached_end(Lexer *lexer)
|
||
{
|
||
return *lexer->current == '\0';
|
||
}
|
||
|
||
// Match a single character – if successful, more one step forward.
|
||
static inline bool match(Lexer *lexer, char expected)
|
||
{
|
||
if (reached_end(lexer)) return false;
|
||
if (*lexer->current != expected) return false;
|
||
lexer->current++;
|
||
return true;
|
||
}
|
||
|
||
// --- Token creation
|
||
|
||
/**
|
||
* Allocate data for a token, including source location.
|
||
* This call is doing the basic allocation, with other functions
|
||
* filling out additional information.
|
||
**/
|
||
static inline void add_generic_token(Lexer *lexer, TokenType type)
|
||
{
|
||
// Allocate source location, type, data for the token
|
||
// each of these use their own arena,
|
||
// causing them to be allocated directly into
|
||
// what amounts to a huge array.
|
||
// Consequently these allocs are actually simultaneously
|
||
// allocating data and putting that data in an array.
|
||
SourceLocation *location = sourceloc_alloc();
|
||
unsigned char *token_type = (unsigned char *)toktype_alloc();
|
||
TokenData *data = tokdata_alloc();
|
||
*token_type = (unsigned char)type;
|
||
|
||
// Set the location.
|
||
location->file = lexer->current_file;
|
||
location->start = (uint32_t)(lexer->lexing_start - lexer->file_begin);
|
||
|
||
// Calculate the column
|
||
if (lexer->lexing_start < lexer->line_start)
|
||
{
|
||
// In this case lexing started before the start of the current line.
|
||
// Start by looking at the previous line.
|
||
SourceLoc *current = &lexer->current_file->lines[lexer->current_line - 1];
|
||
location->line = lexer->current_line;
|
||
// Walk upwards until we find a line that starts before the current.
|
||
while (*current > location->start)
|
||
{
|
||
location->line--;
|
||
current--;
|
||
}
|
||
// We found the line we wanted, so the col is just an offset from the start.
|
||
location->col = location->start - *current + 1;
|
||
// Length is restricted to the end of the line.
|
||
location->length = current[1] - current[0] - 1;
|
||
}
|
||
else
|
||
{
|
||
// The simple case, where the parsing started on the current line.
|
||
location->line = lexer->current_line;
|
||
// Col is simple difference.
|
||
location->col = (unsigned) (lexer->lexing_start - lexer->line_start) + 1;
|
||
// Start is offset to file begin.
|
||
location->start = (SourceLoc) (lexer->lexing_start - lexer->file_begin);
|
||
// Length is diff between current and start.
|
||
location->length = (SourceLoc) (lexer->current - lexer->lexing_start);
|
||
}
|
||
// Return pointers to the data and the location,
|
||
// these maybe be used to fill in data.
|
||
lexer->latest_token_data = data;
|
||
lexer->latest_token_loc = location;
|
||
lexer->latest_token_type = token_type;
|
||
}
|
||
|
||
// Error? We simply generate an invalid token and print out the error.
|
||
static bool add_error_token(Lexer *lexer, const char *message, ...)
|
||
{
|
||
add_generic_token(lexer, TOKEN_INVALID_TOKEN);
|
||
va_list list;
|
||
va_start(list, message);
|
||
sema_verror_range(lexer->latest_token_loc, message, list);
|
||
va_end(list);
|
||
return false;
|
||
}
|
||
|
||
static bool add_error_token_at(Lexer *lexer, const char *loc, uint32_t len, const char *message, ...)
|
||
{
|
||
va_list list;
|
||
va_start(list, message);
|
||
SourceLocation location = { .file = lexer->current_file,
|
||
.start = (uint32_t) (loc - lexer->file_begin),
|
||
.line = lexer->current_line,
|
||
.length = len,
|
||
.col = (uint32_t) (loc - lexer->line_start) + 1,
|
||
};
|
||
sema_verror_range(&location, message, list);
|
||
va_end(list);
|
||
add_generic_token(lexer, TOKEN_INVALID_TOKEN);
|
||
return false;
|
||
|
||
}
|
||
// Add a new regular token.
|
||
static bool add_token(Lexer *lexer, TokenType type, const char *string)
|
||
{
|
||
add_generic_token(lexer, type);
|
||
lexer->latest_token_data->string = string;
|
||
return true;
|
||
}
|
||
|
||
|
||
|
||
// --- Comment parsing
|
||
|
||
/**
|
||
* Parsing of the "//" line comment,
|
||
* also handling "///" doc comments that we probably don't need,
|
||
* but let's keep it for now.
|
||
*/
|
||
static inline bool parse_line_comment(Lexer *lexer)
|
||
{
|
||
// // style comment
|
||
// Skip forward to the end.
|
||
|
||
/// is a doc line comment.
|
||
TokenType comment_type = match(lexer, '/') ? TOKEN_DOC_COMMENT : TOKEN_COMMENT;
|
||
|
||
while (!reached_end(lexer) && peek(lexer) != '\n')
|
||
{
|
||
next(lexer);
|
||
}
|
||
|
||
bool success = add_token(lexer, comment_type, lexer->lexing_start);
|
||
|
||
// If we found EOL, then walk past '\n'
|
||
if (!reached_end(lexer))
|
||
{
|
||
lexer_store_line_end(lexer);
|
||
next(lexer);
|
||
}
|
||
return success;
|
||
}
|
||
|
||
|
||
/**
|
||
* Parse the common / * * / style multiline comments
|
||
**/
|
||
static inline bool parse_multiline_comment(Lexer *lexer)
|
||
{
|
||
TokenType type = peek(lexer) == '*' && peek_next(lexer) != '/' ? TOKEN_DOC_COMMENT : TOKEN_COMMENT;
|
||
int nesting = 1;
|
||
while (1)
|
||
{
|
||
switch (peek(lexer))
|
||
{
|
||
case '*':
|
||
if (peek_next(lexer) == '/')
|
||
{
|
||
skip(lexer, 2);
|
||
nesting--;
|
||
if (nesting == 0) return add_token(lexer, type, lexer->lexing_start);
|
||
continue;
|
||
}
|
||
break;
|
||
case '/':
|
||
if (peek_next(lexer) == '*')
|
||
{
|
||
skip(lexer, 2);
|
||
nesting++;
|
||
continue;
|
||
}
|
||
break;
|
||
case '\n':
|
||
lexer_store_line_end(lexer);
|
||
break;
|
||
case '\0':
|
||
return add_error_token(lexer, "Missing '*/' to end the multiline comment.");
|
||
default:
|
||
break;
|
||
}
|
||
next(lexer);
|
||
}
|
||
}
|
||
|
||
|
||
/**
|
||
* Skip regular whitespace.
|
||
*/
|
||
static void skip_whitespace(Lexer *lexer, LexMode lex_type)
|
||
{
|
||
while (1)
|
||
{
|
||
switch (peek(lexer))
|
||
{
|
||
case '\n':
|
||
if (lex_type != LEX_NORMAL) return;
|
||
lexer_store_line_end(lexer);
|
||
FALLTHROUGH;
|
||
case ' ':
|
||
case '\t':
|
||
case '\f':
|
||
next(lexer);
|
||
break;
|
||
case '\r':
|
||
UNREACHABLE
|
||
default:
|
||
return;
|
||
}
|
||
}
|
||
}
|
||
|
||
|
||
|
||
// --- Identifier scanning
|
||
|
||
|
||
// Parses identifiers. Note that this is a bit complicated here since
|
||
// we split identifiers into 2 types + find keywords.
|
||
static inline bool scan_ident(Lexer *lexer, TokenType normal, TokenType const_token, TokenType type_token, char prefix)
|
||
{
|
||
TokenType type = (TokenType)0;
|
||
uint32_t hash = FNV1_SEED;
|
||
if (prefix)
|
||
{
|
||
hash = FNV1a(prefix, hash);
|
||
}
|
||
while (peek(lexer) == '_')
|
||
{
|
||
hash = FNV1a(next(lexer), hash);
|
||
}
|
||
while (1)
|
||
{
|
||
switch (peek(lexer))
|
||
{
|
||
case 'a': case 'b': case 'c': case 'd': case 'e':
|
||
case 'f': case 'g': case 'h': case 'i': case 'j':
|
||
case 'k': case 'l': case 'm': case 'n': case 'o':
|
||
case 'p': case 'q': case 'r': case 's': case 't':
|
||
case 'u': case 'v': case 'w': case 'x': case 'y':
|
||
case 'z':
|
||
if (!type)
|
||
{
|
||
type = normal;
|
||
}
|
||
else if (type == const_token)
|
||
{
|
||
type = type_token;
|
||
}
|
||
break;
|
||
case 'A': case 'B': case 'C': case 'D': case 'E':
|
||
case 'F': case 'G': case 'H': case 'I': case 'J':
|
||
case 'K': case 'L': case 'M': case 'N': case 'O':
|
||
case 'P': case 'Q': case 'R': case 'S': case 'T':
|
||
case 'U': case 'V': case 'W': case 'X': case 'Y':
|
||
case 'Z':
|
||
if (!type) type = const_token;
|
||
break;
|
||
case '0': case '1': case '2': case '3': case '4':
|
||
case '5': case '6': case '7': case '8': case '9':
|
||
if (!type) return add_error_token(lexer, "A letter must precede any digit");
|
||
case '_':
|
||
break;
|
||
default:
|
||
goto EXIT;
|
||
}
|
||
hash = FNV1a(next(lexer), hash);
|
||
}
|
||
// Allow bang!
|
||
if (peek(lexer) == '!' && type == normal)
|
||
{
|
||
hash = FNV1a(next(lexer), hash);
|
||
}
|
||
EXIT:;
|
||
uint32_t len = (uint32_t)(lexer->current - lexer->lexing_start);
|
||
if (!type)
|
||
{
|
||
if (!prefix && len == 1) return add_token(lexer, TOKEN_UNDERSCORE, "_");
|
||
add_error_token(lexer, "An identifier may not consist of only '_' characters.");
|
||
}
|
||
const char* interned_string = symtab_add(lexer->lexing_start, len, hash, &type);
|
||
return add_token(lexer, type, interned_string);
|
||
}
|
||
|
||
// --- Number scanning
|
||
|
||
static bool scan_number_suffix(Lexer *lexer, bool *is_float)
|
||
{
|
||
if (!is_alphanum_(peek(lexer))) return true;
|
||
switch (peek(lexer))
|
||
{
|
||
case 'u':
|
||
case 'U':
|
||
case 'I':
|
||
case 'i':
|
||
if (*is_float)
|
||
{
|
||
return add_error_token(lexer, "Integer suffix '%x' is not valid for a floating point literal.", peek(lexer));
|
||
}
|
||
next(lexer);
|
||
while (is_number(peek(lexer))) next(lexer);
|
||
break;
|
||
case 'f':
|
||
*is_float = true;
|
||
next(lexer);
|
||
while (is_number(peek(lexer))) next(lexer);
|
||
break;
|
||
default:
|
||
break;
|
||
}
|
||
if (is_alphanum_(peek(lexer)))
|
||
{
|
||
return add_error_token(lexer, "This doesn't seem to be a valid literal.");
|
||
}
|
||
return true;
|
||
}
|
||
/**
|
||
* Parsing octals. Here we depart from the (error prone) C style octals with initial zero e.g. 0231
|
||
* Instead we only support 0o prefix like 0o231. Note that lexing here doesn't actually parse the
|
||
* number itself.
|
||
*/
|
||
static bool scan_oct(Lexer *lexer)
|
||
{
|
||
if (!is_oct(next(lexer)))
|
||
{
|
||
backtrack(lexer);
|
||
return add_error_token_at(lexer, lexer->current, 1, "An expression starting with '0o' should be followed by octal numbers (0-7).");
|
||
}
|
||
while (is_oct_or_(peek(lexer))) next(lexer);
|
||
bool is_float = false;
|
||
if (!scan_number_suffix(lexer, &is_float)) return false;
|
||
if (is_float)
|
||
{
|
||
return add_error_token(lexer, "Octal literals cannot have a floating point suffix.");
|
||
}
|
||
return add_token(lexer, TOKEN_INTEGER, lexer->lexing_start);
|
||
}
|
||
|
||
/**
|
||
* Binary style literals e.g. 0b10101011
|
||
**/
|
||
static bool scan_binary(Lexer *lexer)
|
||
{
|
||
if (!is_binary(next(lexer)))
|
||
{
|
||
backtrack(lexer);
|
||
return add_error_token_at(lexer, lexer->current, 1, "An expression starting with '0b' should be followed by binary digits (0-1).");
|
||
}
|
||
while (is_binary_or_(peek(lexer))) next(lexer);
|
||
bool is_float = false;
|
||
if (!scan_number_suffix(lexer, &is_float)) return false;
|
||
if (is_float)
|
||
{
|
||
return add_error_token(lexer, "Binary literals cannot have a floating point suffix.");
|
||
}
|
||
return add_token(lexer, TOKEN_INTEGER, lexer->lexing_start);
|
||
}
|
||
|
||
/**
|
||
* Scan the digit after the exponent, e.g +12 or -12 or 12
|
||
* @param lexer
|
||
* @return false if lexing failed.
|
||
*/
|
||
static inline bool scan_exponent(Lexer *lexer)
|
||
{
|
||
// Step past e/E or p/P
|
||
next(lexer);
|
||
char c = next(lexer);
|
||
// Step past +/-
|
||
if (c == '+' || c == '-') c = next(lexer);
|
||
// Now we need at least one digit
|
||
if (!is_digit(c))
|
||
{
|
||
if (c == 0)
|
||
{
|
||
backtrack(lexer);
|
||
return add_error_token(lexer, "End of file was reached while parsing the exponent.");
|
||
}
|
||
if (c == '\n') return add_error_token(lexer, "End of line was reached while parsing the exponent.");
|
||
if (c < 31 || c > 127) add_error_token(lexer, "An unexpected character was found while parsing the exponent.");
|
||
return add_error_token(lexer, "Parsing the floating point exponent failed, because '%c' is not a number.", c);
|
||
}
|
||
// Walk through all of the digits.
|
||
while (is_digit(peek(lexer))) next(lexer);
|
||
return true;
|
||
}
|
||
|
||
/**
|
||
* Scan a hex number, including floating point hex numbers of the format 0x31a31ff.21p12. Note that the
|
||
* exponent is written in decimal.
|
||
**/
|
||
static inline bool scan_hex(Lexer *lexer)
|
||
{
|
||
if (!is_hex(next(lexer)))
|
||
{
|
||
backtrack(lexer);
|
||
return add_error_token_at(lexer, lexer->current, 1, "'0x' starts a hexadecimal number, so the next character should be 0-9, a-f or A-F.");
|
||
}
|
||
while (is_hex_or_(peek(lexer))) next(lexer);
|
||
bool is_float = false;
|
||
if (peek(lexer) == '.' && peek_next(lexer) != '.')
|
||
{
|
||
is_float = true;
|
||
next(lexer);
|
||
char c = peek(lexer);
|
||
if (c == '_') return add_error_token(lexer, "Can't parse this as a floating point value due to the '_' directly after decimal point.");
|
||
if (is_hex(c)) next(lexer);
|
||
while (is_hex_or_(peek(lexer))) next(lexer);
|
||
}
|
||
char c = peek(lexer);
|
||
if (c == 'p' || c == 'P')
|
||
{
|
||
is_float = true;
|
||
if (!scan_exponent(lexer)) return false;
|
||
}
|
||
if (prev(lexer) == '_') return add_error_token(lexer, "The number ended with '_', but that character needs to be between, not after, digits.");
|
||
if (!scan_number_suffix(lexer, &is_float)) return false;
|
||
return add_token(lexer, is_float ? TOKEN_REAL : TOKEN_INTEGER, lexer->lexing_start);
|
||
}
|
||
|
||
/**
|
||
* Scans integer and float decimal values.
|
||
*/
|
||
static inline bool scan_dec(Lexer *lexer)
|
||
{
|
||
assert(is_digit(peek(lexer)));
|
||
|
||
// Walk through the digits, we don't need to worry about
|
||
// initial _ because we only call this if we have a digit initially.
|
||
while (is_digit_or_(peek(lexer))) next(lexer);
|
||
|
||
// Assume no float.
|
||
bool is_float = false;
|
||
|
||
// If we have a single dot, we assume that we have a float.
|
||
// Note that this current parsing means we can't have functions on
|
||
// literals, like "123.sizeof", but we're fine with that.
|
||
if (peek(lexer) == '.' && peek_next(lexer) != '.')
|
||
{
|
||
is_float = true;
|
||
// Step past '.'
|
||
next(lexer);
|
||
// Check our rule to disallow 123._32
|
||
char c = peek(lexer);
|
||
if (c == '_') return add_error_token(lexer, "Can't parse this as a floating point value due to the '_' directly after decimal point.");
|
||
// Now walk until we see no more digits.
|
||
// This allows 123. as a floating point number.
|
||
while (is_digit_or_(peek(lexer))) next(lexer);
|
||
}
|
||
char c = peek(lexer);
|
||
// We might have an exponential. We allow 123e1 and 123.e1 as floating point, so
|
||
// just set it to floating point and check the exponential.
|
||
if (c == 'e' || c == 'E')
|
||
{
|
||
is_float = true;
|
||
if (!scan_exponent(lexer)) return false;
|
||
}
|
||
|
||
if (prev(lexer) == '_') return add_error_token(lexer, "The number ended with '_', but that character needs to be between, not after, digits.");
|
||
if (!scan_number_suffix(lexer, &is_float)) return false;
|
||
return add_token(lexer, is_float ? TOKEN_REAL : TOKEN_INTEGER, lexer->lexing_start);
|
||
}
|
||
|
||
/**
|
||
* Scan a digit, switching on initial zero on possible parsing schemes:
|
||
* 0x... -> Hex
|
||
* 0o... -> Octal
|
||
* 0b... -> Binary
|
||
*
|
||
* Default is decimal.
|
||
*
|
||
* It's actually pretty simple to add encoding schemes here, so for example Base64 could
|
||
* be added.
|
||
*/
|
||
static inline bool scan_digit(Lexer *lexer)
|
||
{
|
||
if (peek(lexer) == '0')
|
||
{
|
||
switch (peek_next(lexer))
|
||
{
|
||
case 'x':
|
||
case 'X':
|
||
skip(lexer, 2);
|
||
return scan_hex(lexer);
|
||
case 'o':
|
||
case 'O':
|
||
skip(lexer, 2);
|
||
return scan_oct(lexer);
|
||
case 'b':
|
||
case 'B':
|
||
skip(lexer, 2);
|
||
return scan_binary(lexer);
|
||
default:
|
||
break;
|
||
}
|
||
}
|
||
return scan_dec(lexer);
|
||
}
|
||
|
||
// --- Character & string scan
|
||
|
||
static inline int64_t scan_hex_literal(Lexer *lexer, int positions)
|
||
{
|
||
int64_t hex = 0;
|
||
for (int j = 0; j < positions; j++)
|
||
{
|
||
hex <<= 4U;
|
||
int i = char_to_nibble(peek(lexer));
|
||
if (i < 0)
|
||
{
|
||
return -1;
|
||
}
|
||
next(lexer);
|
||
hex += i;
|
||
}
|
||
return hex;
|
||
}
|
||
|
||
static inline int64_t scan_utf8(Lexer *lexer, unsigned char c)
|
||
{
|
||
int utf8_bytes;
|
||
uint64_t result;
|
||
if (c < 0xc0) goto ERROR;
|
||
if (c <= 0xdf)
|
||
{
|
||
result = 0x1f & c;
|
||
utf8_bytes = 2;
|
||
}
|
||
else if (c <= 0xef)
|
||
{
|
||
result = 0xf & c;
|
||
utf8_bytes = 3;
|
||
}
|
||
else if (c <= 0xf7)
|
||
{
|
||
utf8_bytes = 4;
|
||
result = 0x7 & c;
|
||
}
|
||
else if (c <= 0xfb)
|
||
{
|
||
utf8_bytes = 5;
|
||
result = 0x3 & c;
|
||
}
|
||
else if (c <= 0xfd)
|
||
{
|
||
utf8_bytes = 6;
|
||
result = 0x1 & c;
|
||
}
|
||
else
|
||
{
|
||
goto ERROR;
|
||
}
|
||
for (int i = 1; i < utf8_bytes; i++)
|
||
{
|
||
result <<= 6U;
|
||
if (peek(lexer) == '\0') return 0xFFFD;
|
||
c = (unsigned char)next(lexer);
|
||
if ((c & 0xc0) != 0x80)
|
||
{
|
||
goto ERROR;
|
||
}
|
||
result += c & 0x3f;
|
||
}
|
||
return (int64_t)result;
|
||
ERROR:
|
||
add_error_token(lexer, "Invalid UTF-8 sequence.");
|
||
return -1;
|
||
}
|
||
|
||
/**
|
||
* Rules:
|
||
* 1. If ASCII or \xAB, accept up to 8 characters (16 with Int128 support), size is char, ushort, uint, ulong, UInt128
|
||
* 2. If UTF8, accept 1 UTF character, size is ushort normally, ulong on wide UTF characters, no additional characters accepted.
|
||
* 3. If \uABCD, convert to 16 bits, size is ushort, no additional characters accepted.
|
||
* 4. If \U01234567, convert to 32 bits, size is uint, no additional characters accepted.
|
||
*
|
||
* @param lexer
|
||
* @return
|
||
*/
|
||
static inline bool scan_char(Lexer *lexer)
|
||
{
|
||
|
||
// Handle the problem with zero size character literal first.
|
||
if (match(lexer, '\''))
|
||
{
|
||
return add_error_token(lexer, "The character literal was empty.");
|
||
}
|
||
|
||
int width = 0;
|
||
char c;
|
||
Int128 b = { 0, 0 };
|
||
|
||
while ((c = next(lexer)) != '\'')
|
||
{
|
||
// End of file may occur:
|
||
if (c == '\0')
|
||
{
|
||
backtrack(lexer);
|
||
return add_error_token(lexer, "The character literal did not terminate.");
|
||
}
|
||
// We might exceed the width that we allow.
|
||
if (width > 15) return add_error_token(lexer, "The character literal exceeds 16 characters.");
|
||
// Handle (expected) utf-8 characters.
|
||
if ((unsigned)c >= (unsigned)0x80)
|
||
{
|
||
if (width != 0) goto UNICODE_IN_MULTI;
|
||
const char *start = lexer->current;
|
||
int64_t utf8 = scan_utf8(lexer, (unsigned char)c);
|
||
if (utf8 < 0) return false;
|
||
if (!match(lexer, '\''))
|
||
{
|
||
if (peek(lexer) == '\0') continue;
|
||
lexer->lexing_start = start;
|
||
return add_error_token(lexer, "Unicode character literals may only contain one character, "
|
||
"please remove the additional ones or use all ASCII.");
|
||
}
|
||
b.low = (uint64_t) utf8;
|
||
width = utf8 > 0xffff ? 4 : 2;
|
||
goto DONE;
|
||
}
|
||
// Parse the escape code
|
||
signed char escape = ' ';
|
||
const char *start = lexer->current;
|
||
if (c == '\\')
|
||
{
|
||
assert(c == '\\');
|
||
c = next(lexer);
|
||
escape = is_valid_escape(c);
|
||
if (escape == -1)
|
||
{
|
||
backtrack(lexer);
|
||
lexer->lexing_start = start - 1;
|
||
if (c > ' ' && c <= 127)
|
||
{
|
||
return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c);
|
||
}
|
||
return add_error_token_at(lexer, start, 1, "An escape sequence was expected after '\\'.");
|
||
}
|
||
}
|
||
switch (escape)
|
||
{
|
||
case 'x':
|
||
{
|
||
int64_t hex = scan_hex_literal(lexer, 2);
|
||
if (hex < 0)
|
||
{
|
||
lexer->lexing_start = start - 1;
|
||
// Fix underlining if this is an unfinished escape.
|
||
return add_error_token(lexer, "Expected a two character hex value after \\x.");
|
||
}
|
||
// We can now reassign c and use the default code.
|
||
c = (char)hex;
|
||
break;
|
||
}
|
||
case 'u':
|
||
case 'U':
|
||
{
|
||
// First check that we don't have any characters previous to this one.
|
||
if (width != 0) goto UNICODE_IN_MULTI;
|
||
int bytes = escape == 'U' ? 4 : 2;
|
||
int64_t hex = scan_hex_literal(lexer, bytes * 2);
|
||
// The hex parsing may have failed, lacking more hex chars.
|
||
if (hex < 0)
|
||
{
|
||
lexer->lexing_start = start - 1;
|
||
return add_error_token(lexer, "Expected %s character hex value after \\%c.",
|
||
escape == 'u' ? "a four" : "an eight", escape);
|
||
}
|
||
// If we don't see the end here, then something is wrong.
|
||
if (!match(lexer, '\''))
|
||
{
|
||
// It may be the end of the line, if so use the default handling by invoking "continue"
|
||
if (peek(lexer) == '\0') continue;
|
||
// Otherwise step forward and mark it as an error.
|
||
next(lexer);
|
||
lexer->lexing_start = lexer->current - 1;
|
||
return add_error_token(lexer,
|
||
"Character literals with '\\%c' can only contain one character, please remove this one.",
|
||
escape);
|
||
}
|
||
// Assign the value and go to DONE.
|
||
b.low = (uint64_t) hex;
|
||
width = bytes;
|
||
goto DONE;
|
||
}
|
||
case ' ':
|
||
// No escape, a regular character.
|
||
break;
|
||
default:
|
||
c = (signed char)escape;
|
||
break;
|
||
}
|
||
// Default handling here:
|
||
width++;
|
||
b = i128_shl64(b, 8);
|
||
b = i128_add64(b, (unsigned char)c);
|
||
}
|
||
|
||
assert(width > 0 && width <= 16);
|
||
if (width > 8 && !platform_target.int128)
|
||
{
|
||
return add_error_token(lexer, "Character literal exceeded 8 characters.");
|
||
}
|
||
DONE:
|
||
add_generic_token(lexer, TOKEN_CHAR_LITERAL);
|
||
lexer->latest_token_data->char_value = b;
|
||
lexer->latest_token_data->width = (char)width;
|
||
return true;
|
||
|
||
UNICODE_IN_MULTI:
|
||
return add_error_token(lexer, "A multi-character literal may not contain unicode characters.");
|
||
}
|
||
|
||
static inline void skip_first_line_if_empty(Lexer *lexer)
|
||
{
|
||
// Start at the current token.
|
||
const char *current = lexer->current;
|
||
while (1)
|
||
{
|
||
switch (*(current++))
|
||
{
|
||
case '\n':
|
||
// Line end? then we jump to the first token after line end.
|
||
lexer->current = current - 1;
|
||
lexer_store_line_end(lexer);
|
||
lexer->current++;
|
||
return;
|
||
case ' ':
|
||
case '\t':
|
||
case '\f':
|
||
// Counts as whitespace.
|
||
break;
|
||
case '\r':
|
||
UNREACHABLE
|
||
default:
|
||
// Non whitespace -> no skip.
|
||
return;
|
||
}
|
||
}
|
||
}
|
||
|
||
static int append_esc_string_token(char *restrict dest, const char *restrict src, size_t *pos)
|
||
{
|
||
int scanned;
|
||
uint64_t unicode_char;
|
||
signed char scanned_char = is_valid_escape(src[0]);
|
||
if (scanned_char < 0) return -1;
|
||
switch (scanned_char)
|
||
{
|
||
case 'x':
|
||
{
|
||
int h = char_to_nibble(src[1]);
|
||
if (h < 0) return -1;
|
||
int l = char_to_nibble(src[2]);
|
||
if (l < 0) return -1;
|
||
unicode_char = ((unsigned) h << 4U) + (unsigned)l;
|
||
scanned = 3;
|
||
break;
|
||
}
|
||
case 'u':
|
||
{
|
||
int x1 = char_to_nibble(src[1]);
|
||
if (x1 < 0) return -1;
|
||
int x2 = char_to_nibble(src[2]);
|
||
if (x2 < 0) return -1;
|
||
int x3 = char_to_nibble(src[3]);
|
||
if (x3 < 0) return -1;
|
||
int x4 = char_to_nibble(src[4]);
|
||
if (x4 < 0) return -1;
|
||
unicode_char = ((unsigned) x1 << 12U) + ((unsigned) x2 << 8U) + ((unsigned) x3 << 4U) + (unsigned)x4;
|
||
scanned = 5;
|
||
break;
|
||
}
|
||
case 'U':
|
||
{
|
||
int x1 = char_to_nibble(src[1]);
|
||
if (x1 < 0) return -1;
|
||
int x2 = char_to_nibble(src[2]);
|
||
if (x2 < 0) return -1;
|
||
int x3 = char_to_nibble(src[3]);
|
||
if (x3 < 0) return -1;
|
||
int x4 = char_to_nibble(src[4]);
|
||
if (x4 < 0) return -1;
|
||
int x5 = char_to_nibble(src[5]);
|
||
if (x5 < 0) return -1;
|
||
int x6 = char_to_nibble(src[6]);
|
||
if (x6 < 0) return -1;
|
||
int x7 = char_to_nibble(src[7]);
|
||
if (x7 < 0) return -1;
|
||
int x8 = char_to_nibble(src[8]);
|
||
if (x8 < 0) return -1;
|
||
unicode_char = ((unsigned) x1 << 28U) + ((unsigned) x2 << 24U) + ((unsigned) x3 << 20U) + ((unsigned) x4 << 16U) +
|
||
((unsigned) x5 << 12U) + ((unsigned) x6 << 8U) + ((unsigned) x7 << 4U) + (unsigned)x8;
|
||
scanned = 9;
|
||
break;
|
||
}
|
||
default:
|
||
dest[(*pos)++] = scanned_char;
|
||
return 1;
|
||
}
|
||
if (unicode_char < 0x80U)
|
||
{
|
||
dest[(*pos)++] = (char)unicode_char;
|
||
}
|
||
else if (unicode_char < 0x800U)
|
||
{
|
||
dest[(*pos)++] = (char)(0xC0U | (unicode_char >> 6U));
|
||
dest[(*pos)++] = (char)(0x80U | (unicode_char & 0x3FU));
|
||
}
|
||
else if (unicode_char < 0x10000U)
|
||
{
|
||
dest[(*pos)++] = (char)(0xE0U | (unicode_char >> 12U));
|
||
dest[(*pos)++] = (char)(0x80U | ((unicode_char >> 6U) & 0x3FU));
|
||
dest[(*pos)++] = (char)(0x80U | (unicode_char & 0x3FU));
|
||
}
|
||
else
|
||
{
|
||
dest[(*pos)++] = (char)(0xF0U | (unicode_char >> 18U));
|
||
dest[(*pos)++] = (char)(0x80U | ((unicode_char >> 12U) & 0x3FU));
|
||
dest[(*pos)++] = (char)(0x80U | ((unicode_char >> 6U) & 0x3FU));
|
||
dest[(*pos)++] = (char)(0x80U | (unicode_char & 0x3FU));
|
||
}
|
||
return scanned;
|
||
}
|
||
|
||
|
||
static inline size_t scan_multiline_indent(const char *current, const char **end_ref, int32_t *min_indent_ref)
|
||
{
|
||
// 3. Initial scan.
|
||
char c;
|
||
bool multi_line = false;
|
||
int32_t current_indent = 0;
|
||
int32_t min_indent = INT32_MAX;
|
||
size_t len = 0;
|
||
while ((c = (current++)[0]) != '\0')
|
||
{
|
||
if (c == '"' && current[0] == '"' && current[1] == '"') break;
|
||
// 1. If we've only seen whitespace so far
|
||
if (current_indent >= 0)
|
||
{
|
||
// 2. More whitespace, so increase indent
|
||
if (is_whitespace(c))
|
||
{
|
||
if (c == ' ' || c == '\t') current_indent++;
|
||
}
|
||
else
|
||
{
|
||
// 3. Otherwise, update if smaller before
|
||
if (current_indent < min_indent) min_indent = current_indent;
|
||
// 4. And disable further tracking.
|
||
current_indent = -1;
|
||
}
|
||
// 5. Just continue if escape, this makes
|
||
// escape automatically track as non-whitespace
|
||
if (c == '\\') continue;
|
||
}
|
||
// 6. On new line, set multi_line to true and reset indent.
|
||
if (c == '\n')
|
||
{
|
||
multi_line = true;
|
||
current_indent = 0;
|
||
}
|
||
// 7. Increase our conservative estimate of the length
|
||
// which does not properly take into account indent
|
||
// and escapes.
|
||
len++;
|
||
}
|
||
|
||
// 8. If we ended on EOF
|
||
if (c == '\0')
|
||
{
|
||
current--;
|
||
*end_ref = current;
|
||
*min_indent_ref = 0;
|
||
return len;
|
||
}
|
||
// 8. We're stopping at the second '"' so we need to back up 1
|
||
current -= 1;
|
||
|
||
// 10. We have four cases:
|
||
// a. Single row -> no action
|
||
// b. Characters on same line before ending chars -> no action
|
||
// c. No space or characters before the ending chars
|
||
// d. Space before the ending chars
|
||
|
||
// 14. This will handle c & d
|
||
if (multi_line && current_indent >= 0)
|
||
{
|
||
// Just walk back until '\n' is found.
|
||
while (current[0] != '\n') current--;
|
||
}
|
||
|
||
*end_ref = current;
|
||
*min_indent_ref = min_indent == INT32_MAX ? 0 : min_indent;
|
||
return len;
|
||
}
|
||
|
||
bool scan_consume_end_of_multiline(Lexer *lexer, bool error_on_eof)
|
||
{
|
||
int consume_end = 3;
|
||
while (consume_end > 0)
|
||
{
|
||
char c = next(lexer);
|
||
if (c == '\0')
|
||
{
|
||
backtrack(lexer);
|
||
if (!error_on_eof) return false;
|
||
return add_error_token_at(lexer, lexer->current - 1, 1, "The multi-line string unexpectedly ended. "
|
||
"Did you forget a '\"\"\"' somewhere?");
|
||
}
|
||
if (c == '"') consume_end--;
|
||
}
|
||
return true;
|
||
}
|
||
|
||
/**
|
||
* Scan a multi-line string between """ ... """
|
||
* - Remove initial newline & space on the first """
|
||
* if the text does not start on the first row.
|
||
* - Remove space before the last """ if the text
|
||
* does not end on the last row.
|
||
* - Remove last trailing \n
|
||
* - Skip \r
|
||
*
|
||
* @param lexer
|
||
* @return
|
||
*/
|
||
static inline bool scan_multiline_string(Lexer *lexer)
|
||
{
|
||
// 1. Step past '""'
|
||
next(lexer);
|
||
next(lexer);
|
||
|
||
// 2. See if the first line only has space and line end.
|
||
skip_first_line_if_empty(lexer);
|
||
|
||
// 3. Perform a scan to determine actual start and end of what we want
|
||
// to parse
|
||
const char *end;
|
||
int32_t min_indent;
|
||
size_t len = scan_multiline_indent(lexer->current, &end, &min_indent);
|
||
|
||
// Allocate result
|
||
char *destination = malloc_arena(len + 1);
|
||
|
||
int line = 0;
|
||
char c;
|
||
len = 0;
|
||
while (lexer->current < end)
|
||
{
|
||
c = peek(lexer);
|
||
|
||
// Ok, we reached the end of line
|
||
// update the line end and store it in the resulting buffer.
|
||
if (c == '\n')
|
||
{
|
||
lexer_store_line_end(lexer);
|
||
next(lexer);
|
||
destination[len++] = c;
|
||
line = 0;
|
||
continue;
|
||
}
|
||
|
||
// By now it's safe to advance one step.
|
||
next(lexer);
|
||
line++;
|
||
|
||
// We reached EOF, or escape + end of file.
|
||
if (c == '\0' || (c == '\\' && peek(lexer) == '\0'))
|
||
{
|
||
return add_error_token_at(lexer, lexer->current - 1, 1, "The multi-line string unexpectedly ended. "
|
||
"Did you forget a '\"\"\"' somewhere?");
|
||
}
|
||
|
||
// An escape sequence was reached.
|
||
if (c == '\\')
|
||
{
|
||
// Handle the empty escape: we simply skip.
|
||
if (peek(lexer) == '|')
|
||
{
|
||
next(lexer);
|
||
continue;
|
||
}
|
||
int scanned = append_esc_string_token(destination, lexer->current, &len);
|
||
if (scanned < 0)
|
||
{
|
||
add_error_token_at(lexer, lexer->current - 1, 2, "Invalid escape in string.");
|
||
scan_consume_end_of_multiline(lexer, false);
|
||
return false;
|
||
}
|
||
lexer->current += scanned;
|
||
continue;
|
||
}
|
||
// Now first we skip any empty space if line has not been reached.
|
||
if (line <= min_indent)
|
||
{
|
||
assert(is_whitespace(c));
|
||
continue;
|
||
}
|
||
destination[len++] = c;
|
||
}
|
||
if (!scan_consume_end_of_multiline(lexer, true)) return false;
|
||
destination[len] = 0;
|
||
add_token(lexer, TOKEN_STRING, destination);
|
||
lexer->latest_token_data->strlen = len;
|
||
return true;
|
||
}
|
||
|
||
static inline void consume_to_end_quote(Lexer *lexer)
|
||
{
|
||
char c;
|
||
while ((c = peek(lexer)) != '\0' && c != '"')
|
||
{
|
||
if (c == '\n')
|
||
{
|
||
lexer_store_line_end(lexer);
|
||
}
|
||
next(lexer);
|
||
}
|
||
}
|
||
|
||
static inline bool scan_string(Lexer *lexer)
|
||
{
|
||
if (peek(lexer) == '"' && peek_next(lexer) == '"')
|
||
{
|
||
return scan_multiline_string(lexer);
|
||
}
|
||
char c = 0;
|
||
const char *current = lexer->current;
|
||
while ((c = *(current++)) != '"')
|
||
{
|
||
if (c == '\n' || c == '\0')
|
||
{
|
||
current++;
|
||
break;
|
||
}
|
||
if (c == '\\' && *current == '"')
|
||
{
|
||
current++;
|
||
continue;
|
||
}
|
||
}
|
||
const char *end = current - 1;
|
||
char *destination = malloc_arena((size_t)(end - lexer->current + 1));
|
||
size_t len = 0;
|
||
while (lexer->current < end)
|
||
{
|
||
c = next(lexer);
|
||
if (c == '\0' || (c == '\\' && peek(lexer) == '\0'))
|
||
{
|
||
if (c == '\0') backtrack(lexer);
|
||
add_error_token_at(lexer, lexer->current - 1, 1, "The end of the file was reached "
|
||
"while parsing the string. "
|
||
"Did you forget (or accidentally add) a '\"' somewhere?");
|
||
consume_to_end_quote(lexer);
|
||
return false;
|
||
}
|
||
if (c == '\n' || (c == '\\' && peek(lexer) == '\n'))
|
||
{
|
||
add_error_token_at(lexer, lexer->current - 1, 1, "The end of the line was reached "
|
||
"while parsing the string. "
|
||
"Did you forget (or accidentally add) a '\"' somewhere?");
|
||
lexer->current--;
|
||
consume_to_end_quote(lexer);
|
||
return false;
|
||
}
|
||
if (c == '\\')
|
||
{
|
||
int scanned = append_esc_string_token(destination, lexer->current, &len);
|
||
if (scanned < 0)
|
||
{
|
||
add_error_token_at(lexer, lexer->current - 1, 2, "Invalid escape in string.");
|
||
consume_to_end_quote(lexer);
|
||
return false;
|
||
}
|
||
lexer->current += scanned;
|
||
continue;
|
||
}
|
||
destination[len++] = c;
|
||
}
|
||
// Skip the `"`
|
||
next(lexer);
|
||
destination[len] = 0;
|
||
add_token(lexer, TOKEN_STRING, destination);
|
||
lexer->latest_token_data->strlen = len;
|
||
return true;
|
||
}
|
||
|
||
static inline bool scan_raw_string(Lexer *lexer)
|
||
{
|
||
char c;
|
||
while ((c = next(lexer)) != '`' || peek(lexer) == '`')
|
||
{
|
||
if (c == '\0')
|
||
{
|
||
backtrack(lexer);
|
||
return add_error_token_at(lexer, lexer->lexing_start , 1, "Reached the end of the file looking for "
|
||
"the end of the raw string that starts "
|
||
"here. Did you forget a '`' somewhere?");
|
||
}
|
||
if (c == '`') next(lexer);
|
||
}
|
||
const char *current = lexer->lexing_start + 1;
|
||
const char *end = lexer->current - 1;
|
||
size_t len = (size_t)(end - current);
|
||
char *destination = malloc_arena(len + 1);
|
||
len = 0;
|
||
while (current < end)
|
||
{
|
||
c = *(current++);
|
||
if (c == '`' && current[0] == '`')
|
||
{
|
||
current++;
|
||
}
|
||
destination[len++] = c;
|
||
}
|
||
destination[len] = 0;
|
||
add_token(lexer, TOKEN_STRING, destination);
|
||
lexer->latest_token_data->strlen = len;
|
||
return true;
|
||
}
|
||
|
||
static inline bool scan_hex_array(Lexer *lexer)
|
||
{
|
||
char start_char = next(lexer); // Step past ' or "
|
||
const char *hexdata = lexer->current;
|
||
char c;
|
||
uint64_t len = 0;
|
||
while (1)
|
||
{
|
||
c = next(lexer);
|
||
if (c == start_char) break;
|
||
if (c == 0)
|
||
{
|
||
backtrack(lexer);
|
||
lexer->lexing_start = lexer->current - 1;
|
||
return add_error_token(lexer, "The hex string seems to be missing a terminating '%c'", start_char);
|
||
}
|
||
if (is_hex(c))
|
||
{
|
||
len++;
|
||
continue;
|
||
}
|
||
if (!is_whitespace(c))
|
||
{
|
||
lexer->lexing_start = hexdata - 1;
|
||
lexer->current = hexdata;
|
||
return add_error_token(lexer,
|
||
"'%c' isn't a valid hexadecimal digit, all digits should be a-z, A-Z and 0-9.",
|
||
c);
|
||
}
|
||
}
|
||
if (len % 2)
|
||
{
|
||
return add_error_token(lexer, "The hexadecimal string is not an even length, did you miss a digit somewhere?");
|
||
}
|
||
if (!add_token(lexer, TOKEN_BYTES, lexer->lexing_start)) return false;
|
||
lexer->latest_token_data->is_base64 = false;
|
||
lexer->latest_token_data->len = (uint64_t)len / 2;
|
||
return true;
|
||
}
|
||
|
||
static inline bool scan_base64(Lexer *lexer)
|
||
{
|
||
next(lexer); // Step past 6
|
||
next(lexer); // Step past 4
|
||
char start_char = next(lexer); // Step past ' or "
|
||
const char *b64data = lexer->current;
|
||
char c;
|
||
unsigned end_len = 0;
|
||
uint64_t len = 0;
|
||
while (1)
|
||
{
|
||
c = next(lexer);
|
||
if (c == start_char) break;
|
||
if (c == 0)
|
||
{
|
||
backtrack(lexer);
|
||
lexer->lexing_start = lexer->current - 1;
|
||
return add_error_token(lexer, "The base64 string seems to be missing a terminating '%c'", start_char);
|
||
}
|
||
if (is_base64(c))
|
||
{
|
||
if (end_len)
|
||
{
|
||
lexer->lexing_start = lexer->current - 1;
|
||
return add_error_token(lexer, "'%c' can't be placed after an ending '='", c);
|
||
}
|
||
len++;
|
||
continue;
|
||
}
|
||
if (c == '=')
|
||
{
|
||
if (end_len > 3)
|
||
{
|
||
lexer->lexing_start = b64data - 1;
|
||
lexer->current = b64data;
|
||
return add_error_token(lexer, "There cannot be more than 3 '=' at the end of a base64 string.", c);
|
||
}
|
||
end_len++;
|
||
continue;
|
||
}
|
||
if (!is_whitespace(c))
|
||
{
|
||
lexer->lexing_start = b64data - 1;
|
||
lexer->current = b64data;
|
||
return add_error_token(lexer, "'%c' is not a valid base64 character.", c);
|
||
}
|
||
}
|
||
uint64_t decoded_len = (3 * len - end_len) / 4;
|
||
if (!add_token(lexer, TOKEN_BYTES, lexer->lexing_start)) return false;
|
||
lexer->latest_token_data->is_base64 = true;
|
||
lexer->latest_token_data->len = decoded_len;
|
||
return true;
|
||
}
|
||
|
||
|
||
|
||
// --- Lexer doc lexing
|
||
|
||
/**
|
||
* Skip any stars until we either have no more * or we find '* /'
|
||
* @param lexer
|
||
*/
|
||
static void skip_doc_stars(Lexer *lexer)
|
||
{
|
||
while (peek(lexer) == '*' && peek_next(lexer) != '/') next(lexer);
|
||
}
|
||
|
||
static bool end_of_docs_found(Lexer *lexer)
|
||
{
|
||
int lookahead = 0;
|
||
// while we see '*' walk forward.
|
||
while (lexer->current[lookahead] == '*') lookahead++;
|
||
// And if it doesn't have a '/' at the last position it isn't either.
|
||
return lexer->current[lookahead] == '/';
|
||
}
|
||
/**
|
||
* OPTIONALLY adds * / token. This allows any number of '*' to preceed it.
|
||
* @param lexer
|
||
* @return
|
||
*/
|
||
static bool parse_add_end_of_docs_if_present(Lexer *lexer)
|
||
{
|
||
int lookahead = 0;
|
||
// while we see '*' walk forward.
|
||
while (lexer->current[lookahead] == '*') lookahead++;
|
||
// if we didn't see a '*' to begin with, then it's not an end
|
||
if (lookahead < 1) return false;
|
||
// And if it doesn't have a '/' at the last position it isn't either.
|
||
if (lexer->current[lookahead] != '/') return false;
|
||
// Otherwise, gladly skip ahead and store the end.
|
||
skip(lexer, lookahead + 1);
|
||
add_token(lexer, TOKEN_DOCS_END, lexer->lexing_start);
|
||
lexer->lexing_start = lexer->current;
|
||
return true;
|
||
}
|
||
|
||
|
||
static void parse_add_end_of_doc_line(Lexer *lexer)
|
||
{
|
||
assert(peek(lexer) == '\n');
|
||
// Add the EOL token.
|
||
lexer_store_line_end(lexer);
|
||
next(lexer);
|
||
add_token(lexer, TOKEN_DOCS_EOL, lexer->lexing_start);
|
||
lexer->lexing_start = lexer->current;
|
||
// Skip whitespace
|
||
skip_whitespace(lexer, LEX_DOCS);
|
||
// And any leading stars:
|
||
skip_doc_stars(lexer);
|
||
}
|
||
|
||
/**
|
||
* Parse the end of a directive or a simple line, e.g.
|
||
* For "* @param lexer The lexer used." then the remainder is "The lexer used."
|
||
* For "*** Hello world" the remainder is "Hello world"
|
||
*/
|
||
static DocEnd parse_doc_remainder(Lexer *lexer)
|
||
{
|
||
// Skip all initial whitespace.
|
||
skip_whitespace(lexer, LEX_DOCS);
|
||
lexer->lexing_start = lexer->current;
|
||
|
||
int characters_read = 0;
|
||
while (1)
|
||
{
|
||
switch (peek(lexer))
|
||
{
|
||
case '*':
|
||
// Did we find the end of the directives?
|
||
// If so return control.
|
||
if (!end_of_docs_found(lexer)) break;
|
||
|
||
if (characters_read > 0)
|
||
{
|
||
add_token(lexer, TOKEN_DOCS_LINE, 0);
|
||
lexer->lexing_start = lexer->current;
|
||
}
|
||
if (parse_add_end_of_docs_if_present(lexer)) return DOC_END_LAST;
|
||
// Otherwise use default parsing.
|
||
break;
|
||
case '\n':
|
||
// End of line
|
||
if (characters_read > 0)
|
||
{
|
||
add_token(lexer, TOKEN_DOCS_LINE, 0);
|
||
lexer->lexing_start = lexer->current;
|
||
}
|
||
return DOC_END_EOL;
|
||
case '\0':
|
||
if (characters_read > 0)
|
||
{
|
||
add_token(lexer, TOKEN_DOCS_LINE, 0);
|
||
lexer->lexing_start = lexer->current;
|
||
}
|
||
return DOC_END_EOF;
|
||
default:
|
||
break;
|
||
}
|
||
// Otherwise move forward
|
||
characters_read++;
|
||
next(lexer);
|
||
}
|
||
}
|
||
|
||
static DocEnd parse_doc_error_directive(Lexer *lexer)
|
||
{
|
||
while (1)
|
||
{
|
||
// Skip any whitespace.
|
||
skip_whitespace(lexer, LEX_DOCS);
|
||
|
||
// First scan the name
|
||
if (!lexer_scan_token_inner(lexer, LEX_DOCS)) return DOC_END_ERROR;
|
||
|
||
if (*lexer->latest_token_type != TOKEN_TYPE_IDENT) break;
|
||
|
||
// Skip any whitespace.
|
||
skip_whitespace(lexer, LEX_DOCS);
|
||
|
||
// If we don't reach "|" we exit, since errors are composed using ErrorA | ErrorB
|
||
if (peek(lexer) != '|') break;
|
||
|
||
if (!lexer_scan_token_inner(lexer, LEX_DOCS)) return DOC_END_ERROR;
|
||
|
||
// We might get "|=" or something, in that case exit.
|
||
if (*lexer->latest_token_type != TOKEN_BIT_OR) break;
|
||
}
|
||
return parse_doc_remainder(lexer);
|
||
}
|
||
|
||
/**
|
||
* Contract directives use the style: "@require a > 2, b && c == true : "Must work foo"
|
||
*
|
||
* @param lexer
|
||
* @return
|
||
*/
|
||
static DocEnd parse_doc_contract_directive(Lexer *lexer)
|
||
{
|
||
while (1)
|
||
{
|
||
// Skip all initial whitespace.
|
||
skip_whitespace(lexer, LEX_DOCS);
|
||
|
||
switch (peek(lexer))
|
||
{
|
||
case '*':
|
||
// Did we find the end of the directives?
|
||
// If so return control.
|
||
if (parse_add_end_of_docs_if_present(lexer)) return DOC_END_LAST;
|
||
// Otherwise use default parsing.
|
||
break;
|
||
case '\n':
|
||
return DOC_END_EOL;
|
||
case '\0':
|
||
return DOC_END_EOF;
|
||
default:
|
||
break;
|
||
}
|
||
// Otherwise move forward
|
||
if (!lexer_scan_token_inner(lexer, LEX_DOCS)) return DOC_END_ERROR;
|
||
|
||
// "return" is an identifier inside.
|
||
if (*lexer->latest_token_type == TOKEN_RETURN)
|
||
{
|
||
*lexer->latest_token_type = TOKEN_IDENT;
|
||
}
|
||
}
|
||
}
|
||
|
||
static DocEnd parse_doc_param_directive(Lexer *lexer)
|
||
{
|
||
// Skip any whitespace.
|
||
skip_whitespace(lexer, LEX_DOCS);
|
||
|
||
// First scan the name
|
||
if (!lexer_scan_token_inner(lexer, LEX_DOCS)) return DOC_END_ERROR;
|
||
|
||
// Then the remainder
|
||
return parse_doc_remainder(lexer);
|
||
}
|
||
|
||
static DocEnd parse_doc_directive(Lexer *lexer)
|
||
{
|
||
// We expect a directive here.
|
||
if (!is_letter(peek_next(lexer)))
|
||
{
|
||
return add_error_token(lexer, "Expected doc directive here.");
|
||
}
|
||
lexer->lexing_start = lexer->current;
|
||
// First parse the '@'
|
||
skip(lexer, 1);
|
||
add_token(lexer, TOKEN_DOCS_DIRECTIVE, "@");
|
||
lexer->lexing_start = lexer->current;
|
||
|
||
// Then our keyword
|
||
if (!scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST, TOKEN_TYPE_IDENT, 0)) return DOC_END_ERROR;
|
||
|
||
assert(*lexer->latest_token_type == TOKEN_IDENT || *lexer->latest_token_type == TOKEN_RETURN);
|
||
|
||
const char *last_token_string = lexer->latest_token_data->string;
|
||
|
||
if (*lexer->latest_token_type == TOKEN_RETURN)
|
||
{
|
||
// Backpatch the type.
|
||
*lexer->latest_token_type = TOKEN_IDENT;
|
||
return parse_doc_remainder(lexer);
|
||
}
|
||
if (kw_errors == last_token_string)
|
||
{
|
||
return parse_doc_error_directive(lexer);
|
||
}
|
||
if (last_token_string == kw_require || last_token_string == kw_ensure || last_token_string == kw_reqparse)
|
||
{
|
||
return parse_doc_contract_directive(lexer);
|
||
}
|
||
if (last_token_string == kw_param)
|
||
{
|
||
// The variable
|
||
return parse_doc_param_directive(lexer);
|
||
}
|
||
return parse_doc_remainder(lexer);
|
||
}
|
||
|
||
/**
|
||
* Parse the / ** * / directives comments
|
||
**/
|
||
static bool parse_doc_comment(Lexer *lexer)
|
||
{
|
||
// Add the doc start token.
|
||
add_token(lexer, TOKEN_DOCS_START, lexer->lexing_start);
|
||
|
||
// Skip any additional stars
|
||
skip_doc_stars(lexer);
|
||
|
||
// Main "doc parse" loop.
|
||
while (1)
|
||
{
|
||
// 1. Skip any whitespace
|
||
skip_whitespace(lexer, LEX_DOCS);
|
||
|
||
// 2. Did we find the end?
|
||
if (reached_end(lexer)) return add_error_token(lexer, "Missing '*/' to end the doc comment.");
|
||
|
||
// 3. See if we reach the end of the docs.
|
||
if (parse_add_end_of_docs_if_present(lexer)) return true;
|
||
|
||
DocEnd end;
|
||
// Parse a segment
|
||
switch (peek(lexer))
|
||
{
|
||
case '@':
|
||
end = parse_doc_directive(lexer);
|
||
break;
|
||
case '\n':
|
||
end = DOC_END_EOL;
|
||
break;
|
||
default:
|
||
end = parse_doc_remainder(lexer);
|
||
break;
|
||
}
|
||
|
||
// We're done parsing a line:
|
||
switch (end)
|
||
{
|
||
case DOC_END_ERROR:
|
||
return false;
|
||
case DOC_END_EOF:
|
||
// Just continue, this will be picked up in the beginning of the loop.
|
||
break;
|
||
case DOC_END_LAST:
|
||
// We're done, so return.
|
||
return true;
|
||
case DOC_END_EOL:
|
||
// Walk past the end of line.
|
||
parse_add_end_of_doc_line(lexer);
|
||
break;
|
||
default:
|
||
UNREACHABLE
|
||
}
|
||
}
|
||
}
|
||
|
||
// --- Lexer public functions
|
||
|
||
|
||
Token lexer_advance(Lexer *lexer)
|
||
{
|
||
Token token = { .id.index = lexer->lexer_index, .type = (TokenType)(*toktypeptr(lexer->lexer_index)) };
|
||
lexer->lexer_index++;
|
||
return token;
|
||
}
|
||
|
||
|
||
static bool lexer_scan_token_inner(Lexer *lexer, LexMode mode)
|
||
{
|
||
// Now skip the whitespace.
|
||
skip_whitespace(lexer, mode);
|
||
|
||
// Point start to the first non-whitespace character.
|
||
lexer->lexing_start = lexer->current;
|
||
|
||
if (reached_end(lexer))
|
||
{
|
||
assert(mode == LEX_NORMAL);
|
||
return add_token(lexer, TOKEN_EOF, "\n") && false;
|
||
}
|
||
|
||
char c = next(lexer);
|
||
switch (c)
|
||
{
|
||
case '@':
|
||
return add_token(lexer, TOKEN_AT, "@");
|
||
case '\'':
|
||
return scan_char(lexer);
|
||
case '`':
|
||
return scan_raw_string(lexer);
|
||
case '"':
|
||
return scan_string(lexer);
|
||
case '#':
|
||
return scan_ident(lexer, TOKEN_HASH_IDENT, TOKEN_HASH_CONST_IDENT, TOKEN_HASH_TYPE_IDENT, '$');
|
||
case '$':
|
||
if (match(lexer, '{')) return add_token(lexer, TOKEN_PLACEHOLDER, "${");
|
||
if (match(lexer, '$'))
|
||
{
|
||
if (is_letter(peek(lexer)))
|
||
{
|
||
add_token(lexer, TOKEN_BUILTIN, "$$");
|
||
lexer->lexing_start = lexer->current;
|
||
return scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST_IDENT, TOKEN_TYPE_IDENT, 0);
|
||
}
|
||
return add_error_token(lexer, "Expected a letter after $$.");
|
||
}
|
||
return scan_ident(lexer, TOKEN_CT_IDENT, TOKEN_CT_CONST_IDENT, TOKEN_CT_TYPE_IDENT, '$');
|
||
case ',':
|
||
return add_token(lexer, TOKEN_COMMA, ",");
|
||
case ';':
|
||
return add_token(lexer, TOKEN_EOS, ";");
|
||
case '{':
|
||
return match(lexer, '|') ? add_token(lexer, TOKEN_LBRAPIPE, "{|") : add_token(lexer, TOKEN_LBRACE, "{");
|
||
case '}':
|
||
return add_token(lexer, TOKEN_RBRACE, "}");
|
||
case '(':
|
||
return add_token(lexer, TOKEN_LPAREN, "(");
|
||
case ')':
|
||
return add_token(lexer, TOKEN_RPAREN, ")");
|
||
case '[':
|
||
if (match(lexer, '<')) return add_token(lexer, TOKEN_LVEC, "[<");
|
||
return add_token(lexer, TOKEN_LBRACKET, "[");
|
||
case ']':
|
||
return add_token(lexer, TOKEN_RBRACKET, "]");
|
||
case '.':
|
||
if (match(lexer, '.'))
|
||
{
|
||
if (match(lexer, '.')) return add_token(lexer, TOKEN_ELLIPSIS, "...");
|
||
return add_token(lexer, TOKEN_DOTDOT, "..");
|
||
}
|
||
return add_token(lexer, TOKEN_DOT, ".");
|
||
case '~':
|
||
return add_token(lexer, TOKEN_BIT_NOT, "~");
|
||
case ':':
|
||
return match(lexer, ':') ? add_token(lexer, TOKEN_SCOPE, "::") : add_token(lexer, TOKEN_COLON, ":");
|
||
case '!':
|
||
if (match(lexer, '!')) return add_token(lexer, TOKEN_BANGBANG, "!!");
|
||
return match(lexer, '=') ? add_token(lexer, TOKEN_NOT_EQUAL, "!=") : add_token(lexer, TOKEN_BANG, "!");
|
||
case '/':
|
||
// We can't get any directives comments here.
|
||
if (mode != LEX_DOCS)
|
||
{
|
||
if (match(lexer, '/')) return parse_line_comment(lexer);
|
||
if (match(lexer, '*')) return match(lexer, '*') ? parse_doc_comment(lexer) : parse_multiline_comment(lexer);
|
||
}
|
||
return match(lexer, '=') ? add_token(lexer, TOKEN_DIV_ASSIGN, "/=") : add_token(lexer, TOKEN_DIV, "/");
|
||
case '*':
|
||
return match(lexer, '=') ? add_token(lexer, TOKEN_MULT_ASSIGN, "*=") : add_token(lexer, TOKEN_STAR, "*");
|
||
case '=':
|
||
return match(lexer, '=') ? add_token(lexer, TOKEN_EQEQ, "==") : add_token(lexer, TOKEN_EQ, "=");
|
||
case '^':
|
||
return match(lexer, '=') ? add_token(lexer, TOKEN_BIT_XOR_ASSIGN, "^=") : add_token(lexer,
|
||
TOKEN_BIT_XOR,
|
||
"^");
|
||
case '?':
|
||
if (match(lexer, '?')) return add_token(lexer, TOKEN_QUESTQUEST, "??");
|
||
return match(lexer, ':') ? add_token(lexer, TOKEN_ELVIS, "?:") : add_token(lexer, TOKEN_QUESTION, "?");
|
||
case '<':
|
||
if (match(lexer, '<'))
|
||
{
|
||
if (match(lexer, '=')) return add_token(lexer, TOKEN_SHL_ASSIGN, "<<=");
|
||
return add_token(lexer, TOKEN_SHL, "<<");
|
||
}
|
||
return match(lexer, '=') ? add_token(lexer, TOKEN_LESS_EQ, "<=") : add_token(lexer, TOKEN_LESS, "<");
|
||
case '>':
|
||
if (match(lexer, '>'))
|
||
{
|
||
if (match(lexer, '=')) return add_token(lexer, TOKEN_SHR_ASSIGN, ">>=");
|
||
return add_token(lexer, TOKEN_SHR, ">>");
|
||
}
|
||
if (match(lexer, ']')) return add_token(lexer, TOKEN_RVEC, ">]");
|
||
return match(lexer, '=') ? add_token(lexer, TOKEN_GREATER_EQ, ">=") : add_token(lexer, TOKEN_GREATER, ">");
|
||
case '%':
|
||
return match(lexer, '=') ? add_token(lexer, TOKEN_MOD_ASSIGN, "%=") : add_token(lexer, TOKEN_MOD, "%");
|
||
case '&':
|
||
if (match(lexer, '&')) return add_token(lexer, TOKEN_AND, "&&");
|
||
return match(lexer, '=') ? add_token(lexer, TOKEN_BIT_AND_ASSIGN, "&=") : add_token(lexer, TOKEN_AMP, "&");
|
||
case '|':
|
||
if (match(lexer, '}')) return add_token(lexer, TOKEN_RBRAPIPE, "|}");
|
||
if (match(lexer, '|')) return add_token(lexer, TOKEN_OR, "||");
|
||
return match(lexer, '=') ? add_token(lexer, TOKEN_BIT_OR_ASSIGN, "|=") : add_token(lexer,
|
||
TOKEN_BIT_OR,
|
||
"|");
|
||
case '+':
|
||
if (match(lexer, '+')) return add_token(lexer, TOKEN_PLUSPLUS, "++");
|
||
if (match(lexer, '=')) return add_token(lexer, TOKEN_PLUS_ASSIGN, "+=");
|
||
return add_token(lexer, TOKEN_PLUS, "+");
|
||
case '-':
|
||
if (match(lexer, '>')) return add_token(lexer, TOKEN_ARROW, "->");
|
||
if (match(lexer, '-')) return add_token(lexer, TOKEN_MINUSMINUS, "--");
|
||
if (match(lexer, '=')) return add_token(lexer, TOKEN_MINUS_ASSIGN, "-=");
|
||
return add_token(lexer, TOKEN_MINUS, "-");
|
||
case 'b':
|
||
if (peek(lexer) == '6' && peek_next(lexer) == '4' && (lexer->current[2] == '\'' || lexer->current[2] == '"'))
|
||
{
|
||
return scan_base64(lexer);
|
||
}
|
||
FALLTHROUGH;
|
||
default:
|
||
if (c == 'x' && (peek(lexer) == '"' || peek(lexer) == '\''))
|
||
{
|
||
return scan_hex_array(lexer);
|
||
}
|
||
if (is_alphanum_(c))
|
||
{
|
||
backtrack(lexer);
|
||
return is_digit(c) ? scan_digit(lexer) : scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST_IDENT, TOKEN_TYPE_IDENT, 0);
|
||
}
|
||
if (c < 0)
|
||
{
|
||
return add_error_token(lexer, "The 0%x character may not be placed outside of a string or comment, did you perhaps forget a \" somewhere?", (uint8_t)c);
|
||
}
|
||
return add_error_token(lexer, "'%c' may not be placed outside of a string or comment, did you perhaps forget a \" somewhere?", c);
|
||
|
||
}
|
||
}
|
||
|
||
File* lexer_current_file(Lexer *lexer)
|
||
{
|
||
return lexer->current_file;
|
||
}
|
||
|
||
#define tokenid(_ptr) ((unsigned)((TokenOld *)(_ptr) - ((TokenOld *)lexer->memory.ptr)))
|
||
|
||
|
||
|
||
void lexer_init_with_file(Lexer *lexer, File *file)
|
||
{
|
||
file->token_start_id = (uint32_t) toktype_arena.allocated;
|
||
lexer->current_file = file;
|
||
lexer->file_begin = lexer->current_file->contents;
|
||
lexer->lexing_start = lexer->file_begin;
|
||
lexer->current = lexer->lexing_start;
|
||
lexer->current_line = 1;
|
||
lexer->line_start = lexer->current;
|
||
lexer->lexer_index = file->token_start_id;
|
||
const unsigned char *check = (const unsigned char *)lexer->current;
|
||
unsigned c;
|
||
int balance = 0;
|
||
while ((c = *(check++)) != '\0')
|
||
{
|
||
if (c != 0xE2) continue;
|
||
unsigned char type = check[1];
|
||
switch (check[0])
|
||
{
|
||
case 0x80:
|
||
if (type == 0xAC)
|
||
{
|
||
balance--;
|
||
if (balance < 0) goto DONE;
|
||
}
|
||
if (type >= 0xAA && type <= 0xAE)
|
||
{
|
||
balance++;
|
||
}
|
||
break;
|
||
case 0x81:
|
||
if (type >= 0xA6 && type <= 0xA8)
|
||
{
|
||
balance++;
|
||
}
|
||
else if (type == 0xA9)
|
||
{
|
||
balance--;
|
||
if (balance < 0) goto DONE;
|
||
}
|
||
break;
|
||
default:
|
||
break;
|
||
}
|
||
}
|
||
DONE:
|
||
if (balance != 0)
|
||
{
|
||
add_error_token(lexer, "Invalid encoding - Unbalanced bidirectional markers.");
|
||
return;
|
||
}
|
||
while(1)
|
||
{
|
||
if (!lexer_scan_token_inner(lexer, LEX_NORMAL))
|
||
{
|
||
if (reached_end(lexer)) break;
|
||
while (!reached_end(lexer) && peek(lexer) != '\n') next(lexer);
|
||
lexer->lexing_start = lexer->current;
|
||
continue;
|
||
}
|
||
}
|
||
|
||
}
|
||
|