// c3c/src/compiler/lexer.c

// Copyright (c) 2019 Christoffer Lerno. All rights reserved.
// Use of this source code is governed by the GNU LGPLv3.0 license
// a copy of which can be found in the LICENSE file.

#include "compiler_internal.h"

// Lexing mode handed to the token scanner. skip_whitespace consumes
// newlines only in LEX_NORMAL; in any other mode it stops at '\n'.
typedef enum
{
	LEX_NORMAL,
	LEX_DOCS,
} LexMode;

// Result codes for doc comment lexing.
// NOTE(review): judging by the names only, these presumably stand for
// "end of file", "last line of the docs", "end of line" and "error" —
// confirm against the code that produces them.
typedef enum
{
	DOC_END_EOF,
	DOC_END_LAST,
	DOC_END_EOL,
	DOC_END_ERROR,
} DocEnd;

// --- Lexing general methods.

static bool lexer_scan_token_inner(Lexer *lexer, LexMode mode);

// Return the character at the read position without consuming it.
static inline char peek(Lexer *lexer)
{
	return lexer->current[0];
}

// Return the character immediately before the read position.
static inline char prev(Lexer *lexer)
{
	return *(lexer->current - 1);
}

// Move the read position one character back.
static inline void backtrack(Lexer *lexer)
{
	lexer->current -= 1;
}

/**
 * Record a line ending located at the current read position.
 *
 * Increases the current line counter, marks the character after the
 * current one as the start of the next line, and appends the location of
 * the line end (the file's start id plus the offset into the file buffer)
 * to the source file.
 *
 * @param lexer the lexer; the visible callers invoke this while `current`
 *              is on the '\n' character.
 */
void lexer_store_line_end(Lexer *lexer)
{
	lexer->current_line++;
	lexer->line_start = lexer->current + 1;
	// Compute the offset inside the buffer first: adding start_id to the
	// pointer before subtracting would form a pointer outside the buffer.
	SourceLoc line_end = (SourceLoc)(lexer->current_file->start_id + (lexer->current - lexer->file_begin));
	source_file_append_line_end(lexer->current_file, line_end);
}

// Return the character one step after the read position, without consuming anything.
static inline char peek_next(Lexer *lexer)
{
	return *(lexer->current + 1);
}

// Consume one character: return the character at the read position
// and then advance the read position past it.
static inline char next(Lexer *lexer)
{
	char consumed = lexer->current[0];
	lexer->current += 1;
	return consumed;
}

// Advance the read position by `steps` characters; `steps` must be positive.
static inline void skip(Lexer *lexer, int steps)
{
	assert(steps > 0);
	lexer->current = lexer->current + steps;
}

// The buffer is treated as ended when the read position is on a '\0'.
static inline bool reached_end(Lexer *lexer)
{
	return lexer->current[0] == '\0';
}

// Consume the current character if it equals `expected`.
// Returns true and moves one step forward on a match; returns false
// without moving on a mismatch or when the end has been reached.
static inline bool match(Lexer *lexer, char expected)
{
	if (reached_end(lexer) || lexer->current[0] != expected) return false;
	lexer->current += 1;
	return true;
}

// --- Token creation

/**
 * Allocate data for a token, including source location.
 * This call is doing the basic allocation, with other functions
 * filling out additional information.
 *
 * The freshly allocated location, data and type are published through
 * lexer->latest_token_loc, lexer->latest_token_data and
 * lexer->latest_token_type so that callers can fill in more fields.
 *
 * @param lexer the lexer; the token text runs from lexing_start to current.
 * @param type the token type to store for the new token.
 **/
static inline void add_generic_token(Lexer *lexer, TokenType type)
{
	// Allocate source location, type, data for the token
	// each of these use their own arena,
	// causing them to be allocated directly into
	// what amounts to a huge array.
	// Consequently these allocs are actually simultaneously
	// allocating data and putting that data in an array.
	SourceLocation *location = sourceloc_alloc();
	unsigned char *token_type = (unsigned char *)toktype_alloc();
	TokenData *data = tokdata_alloc();
	// The token type is stored as a single byte.
	*token_type = (unsigned char)type;

	// Set the location: the file and the offset of the token start within it.
	location->file = lexer->current_file;
	location->start = (uint32_t)(lexer->lexing_start - lexer->file_begin);

	// Calculate the line, column and length.
	if (lexer->lexing_start < lexer->line_start)
	{
		// In this case lexing started before the start of the current line,
		// i.e. at least one line end was stored while lexing this token.
		// Start by looking at the previous line.
		SourceLoc *current = &lexer->current_file->lines[lexer->current_line - 1];
		location->line = lexer->current_line;
		// Walk upwards until we find a line that starts before the current.
		while (*current > location->start)
		{
			location->line--;
			current--;
		}
		// We found the line we wanted, so the col is just an offset from the start.
		location->col = location->start - *current + 1;
		// Length is restricted to the end of the line, computed from the
		// distance between this line entry and the next one.
		location->length = current[1] - current[0] - 1;
	}
	else
	{
		// The simple case, where the parsing started on the current line.
		location->line = lexer->current_line;
		// Col is simple difference (1-based).
		location->col = (unsigned) (lexer->lexing_start - lexer->line_start) + 1;
		// Start is offset to file begin (same value as assigned above).
		location->start = (SourceLoc) (lexer->lexing_start - lexer->file_begin);
		// Length is diff between current and start.
		location->length = (SourceLoc) (lexer->current - lexer->lexing_start);
	}
	// Publish pointers to the data and the location,
	// these may be used to fill in data.
	lexer->latest_token_data = data;
	lexer->latest_token_loc = location;
	lexer->latest_token_type = token_type;
}

/**
 * Add an invalid token covering the text lexed so far and report an
 * error on that token's location.
 *
 * @param lexer the lexer
 * @param message format string for the error, followed by its arguments
 * @return always false, so callers can return the result directly.
 */
static bool add_error_token(Lexer *lexer, const char *message, ...)
{
	va_list args;
	// The token must exist first: the error is reported on its location.
	add_generic_token(lexer, TOKEN_INVALID_TOKEN);
	va_start(args, message);
	sema_verror_range(lexer->latest_token_loc, message, args);
	va_end(args);
	return false;
}

/**
 * Report an error on an explicit range in the current line and then add
 * an invalid token for the text lexed so far.
 *
 * @param lexer the lexer
 * @param loc pointer to the first character of the range to report
 * @param len length of the reported range
 * @param message format string for the error, followed by its arguments
 * @return always false, so callers can return the result directly.
 */
static bool add_error_token_at(Lexer *lexer, const char *loc, uint32_t len, const char *message, ...)
{
	// The reported range is built by hand instead of using the token span.
	SourceLocation error_range = {
		.file = lexer->current_file,
		.line = lexer->current_line,
		.col = (uint32_t) (loc - lexer->line_start) + 1,
		.start = (uint32_t) (loc - lexer->file_begin),
		.length = len,
	};
	va_list args;
	va_start(args, message);
	sema_verror_range(&error_range, message, args);
	va_end(args);
	add_generic_token(lexer, TOKEN_INVALID_TOKEN);
	return false;
}
// Add a regular token of the given type and attach `string` as its
// string data. Always returns true.
static bool add_token(Lexer *lexer, TokenType type, const char *string)
{
	add_generic_token(lexer, type);
	(lexer->latest_token_data)->string = string;
	return true;
}


// --- Comment parsing

/**
 * Lex a "//" line comment. A third '/' right at the read position turns
 * it into a doc line comment. The token ends before the line end; when
 * the comment stopped on '\n' the line end is recorded and stepped past.
 */
static inline bool parse_line_comment(Lexer *lexer)
{
	// "///" is a doc line comment, anything else a plain comment.
	TokenType comment_type = TOKEN_COMMENT;
	if (match(lexer, '/')) comment_type = TOKEN_DOC_COMMENT;

	// Run to the end of the line or the end of the buffer.
	for (;;)
	{
		if (reached_end(lexer)) break;
		if (peek(lexer) == '\n') break;
		next(lexer);
	}

	bool success = add_token(lexer, comment_type, lexer->lexing_start);

	// Nothing more to do if the buffer ended.
	if (reached_end(lexer)) return success;

	// Otherwise we are on '\n': store the line end and walk past it.
	lexer_store_line_end(lexer);
	next(lexer);
	return success;
}


/**
 * Lex a / * ... * / style comment, which may nest. A '*' at the read
 * position that is not directly followed by '/' makes it a doc comment.
 * Line ends inside the comment are recorded. Reaching the end of the
 * buffer before the comment closes is reported as an error.
 **/
static inline bool parse_multiline_comment(Lexer *lexer)
{
	bool is_doc = peek(lexer) == '*' && peek_next(lexer) != '/';
	TokenType type = is_doc ? TOKEN_DOC_COMMENT : TOKEN_COMMENT;
	int depth = 1;
	for (;;)
	{
		char c = peek(lexer);
		if (c == '\0')
		{
			return add_error_token(lexer, "Missing '*/' to end the multiline comment.");
		}
		// Closing marker: leave one nesting level, done at level zero.
		if (c == '*' && peek_next(lexer) == '/')
		{
			skip(lexer, 2);
			depth--;
			if (depth == 0) return add_token(lexer, type, lexer->lexing_start);
			continue;
		}
		// Opening marker: one level deeper.
		if (c == '/' && peek_next(lexer) == '*')
		{
			skip(lexer, 2);
			depth++;
			continue;
		}
		if (c == '\n') lexer_store_line_end(lexer);
		next(lexer);
	}
}


/**
 * Skip spaces, tabs and form feeds. Newlines are recorded and skipped
 * only when lexing in LEX_NORMAL mode; in other modes the function
 * returns with the read position on the '\n'.
 * A '\r' is not expected to appear here.
 */
static void skip_whitespace(Lexer *lexer, LexMode lex_type)
{
	for (;;)
	{
		char c = peek(lexer);
		if (c == '\n')
		{
			if (lex_type != LEX_NORMAL) return;
			lexer_store_line_end(lexer);
			next(lexer);
			continue;
		}
		if (c == ' ' || c == '\t' || c == '\f')
		{
			next(lexer);
			continue;
		}
		if (c == '\r')
		{
			UNREACHABLE
		}
		return;
	}
}


// --- Identifier scanning


/**
 * Scan an identifier, classify it by its casing and add the token.
 *
 * Leading underscores are consumed (and hashed) first. After that the
 * first letter decides the kind: a lower case letter gives `normal`, an
 * upper case letter gives `const_token`, and a `const_token` that later
 * contains a lower case letter becomes `type_token`. A digit before any
 * letter is an error. The text is finally interned with symtab_add, which
 * receives a pointer to the type and may change it (this is where
 * keywords are found).
 *
 * @param lexer the lexer
 * @param normal token type for identifiers whose first letter is lower case
 * @param const_token token type for identifiers with only upper case letters
 * @param type_token token type for an upper case first letter followed by a lower case letter
 * @param prefix a prefix character to mix into the hash first, or 0 for none
 * @return true if a token was added, false if an error token was added.
 */
static inline bool scan_ident(Lexer *lexer, TokenType normal, TokenType const_token, TokenType type_token, char prefix)
{
	TokenType type = (TokenType)0;
	uint32_t hash = FNV1_SEED;
	if (prefix)
	{
		hash = FNV1a(prefix, hash);
	}
	// Leading underscores do not decide the identifier kind.
	while (peek(lexer) == '_')
	{
		hash = FNV1a(next(lexer), hash);
	}
	while (1)
	{
		switch (peek(lexer))
		{
			case 'a': case 'b': case 'c': case 'd': case 'e':
			case 'f': case 'g': case 'h': case 'i': case 'j':
			case 'k': case 'l': case 'm': case 'n': case 'o':
			case 'p': case 'q': case 'r': case 's': case 't':
			case 'u': case 'v': case 'w': case 'x': case 'y':
			case 'z':
				if (!type)
				{
					type = normal;
				}
				else if (type == const_token)
				{
					type = type_token;
				}
				break;
			case 'A': case 'B': case 'C': case 'D': case 'E':
			case 'F': case 'G': case 'H': case 'I': case 'J':
			case 'K': case 'L': case 'M': case 'N': case 'O':
			case 'P': case 'Q': case 'R': case 'S': case 'T':
			case 'U': case 'V': case 'W': case 'X': case 'Y':
			case 'Z':
				if (!type) type = const_token;
				break;
			case '0': case '1': case '2': case '3': case '4':
			case '5': case '6': case '7': case '8': case '9':
				if (!type) return add_error_token(lexer, "A letter must precede any digit");
				// Intentional fallthrough: a digit is otherwise treated like '_'.
			case '_':
				break;
			default:
				goto EXIT;
		}
		hash = FNV1a(next(lexer), hash);
	}
	// Allow bang!
	// NOTE(review): the loop above only exits through `goto EXIT`, so this
	// statement is currently never executed.
	if (peek(lexer) == '!' && type == normal)
	{
		hash = FNV1a(next(lexer), hash);
	}
	EXIT:;
	uint32_t len = (uint32_t)(lexer->current - lexer->lexing_start);
	if (!type)
	{
		// No letter was seen: a lone '_' is its own token, anything else is an error.
		if (!prefix && len == 1) return add_token(lexer, TOKEN_UNDERSCORE, "_");
		// Return here; continuing would intern the text and add a second
		// token with an unset type after the error token.
		return add_error_token(lexer, "An identifier may not consist of only '_' characters.");
	}
	const char* interned_string = symtab_add(lexer->lexing_start, len, hash, &type);
	return add_token(lexer, type, interned_string);
}

// --- Number scanning

/**
 * Scan an optional type suffix directly after a number literal.
 *
 * 'u', 'U', 'i' or 'I' followed by digits is an integer suffix and is
 * rejected when the literal is already a float. 'f' followed by digits is
 * a float suffix and sets *is_float. Any other alphanumeric or '_'
 * character directly after the literal (or after the suffix) is an error.
 *
 * @param lexer the lexer
 * @param is_float in: whether the literal is a float so far; out: set to
 *                 true when a float suffix was found.
 * @return true if the suffix (or its absence) is fine, false if an error
 *         token was added.
 */
static bool scan_number_suffix(Lexer *lexer, bool *is_float)
{
	if (!is_alphanum_(peek(lexer))) return true;
	switch (peek(lexer))
	{
		case 'u':
		case 'U':
		case 'I':
		case 'i':
			if (*is_float)
			{
				// Print the offending suffix as a character, not as its hex code.
				return add_error_token(lexer, "Integer suffix '%c' is not valid for a floating point literal.", peek(lexer));
			}
			next(lexer);
			while (is_number(peek(lexer))) next(lexer);
			break;
		case 'f':
			*is_float = true;
			next(lexer);
			while (is_number(peek(lexer))) next(lexer);
			break;
		default:
			break;
	}
	// Whatever follows must not continue the literal.
	if (is_alphanum_(peek(lexer)))
	{
		return add_error_token(lexer, "This doesn't seem to be a valid literal.");
	}
	return true;
}
/**
 * Scan the digits of an octal literal, i.e. what follows the "0o" prefix
 * as in 0o231. C style octals with only a leading zero (0231) are
 * deliberately not supported. The numeric value is not computed here.
 *
 * @return true if an integer token was added, false on error.
 */
static bool scan_oct(Lexer *lexer)
{
	// At least one octal digit is required after the prefix.
	char first = next(lexer);
	if (!is_oct(first))
	{
		backtrack(lexer);
		return add_error_token_at(lexer, lexer->current, 1, "An expression starting with '0o' should be followed by octal numbers (0-7).");
	}
	while (is_oct_or_(peek(lexer)))
	{
		next(lexer);
	}
	bool has_float_suffix = false;
	if (!scan_number_suffix(lexer, &has_float_suffix)) return false;
	if (!has_float_suffix) return add_token(lexer, TOKEN_INTEGER, lexer->lexing_start);
	return add_error_token(lexer, "Octal literals cannot have a floating point suffix.");
}

/**
 * Scan the digits of a binary literal, i.e. what follows the "0b" prefix
 * as in 0b10101011.
 *
 * @return true if an integer token was added, false on error.
 **/
static bool scan_binary(Lexer *lexer)
{
	// At least one binary digit is required after the prefix.
	char first = next(lexer);
	if (!is_binary(first))
	{
		backtrack(lexer);
		return add_error_token_at(lexer, lexer->current, 1, "An expression starting with '0b' should be followed by binary digits (0-1).");
	}
	while (is_binary_or_(peek(lexer)))
	{
		next(lexer);
	}
	bool has_float_suffix = false;
	if (!scan_number_suffix(lexer, &has_float_suffix)) return false;
	if (!has_float_suffix) return add_token(lexer, TOKEN_INTEGER, lexer->lexing_start);
	return add_error_token(lexer, "Binary literals cannot have a floating point suffix.");
}

/**
 * Scan the exponent part of a number: the exponent marker, an optional
 * sign and at least one decimal digit, e.g. e+12, e-12 or p12.
 * The read position must be on the exponent marker when called.
 *
 * @param lexer the lexer
 * @return false if lexing failed (exactly one error token was added).
 */
static inline bool scan_exponent(Lexer *lexer)
{
	// Step past e/E or p/P
	next(lexer);
	char c = next(lexer);
	// Step past +/-
	if (c == '+' || c == '-') c = next(lexer);
	// Now we need at least one digit
	if (!is_digit(c))
	{
		if (c == 0)
		{
			// Do not step beyond the end of the buffer.
			backtrack(lexer);
			return add_error_token(lexer, "End of file was reached while parsing the exponent.");
		}
		if (c == '\n') return add_error_token(lexer, "End of line was reached while parsing the exponent.");
		// Non printable character: report it once, without trying to print it.
		// (Previously this fell through and emitted a second error token.)
		if (c < 31 || c > 127) return add_error_token(lexer, "An unexpected character was found while parsing the exponent.");
		return add_error_token(lexer, "Parsing the floating point exponent failed, because '%c' is not a number.", c);
	}
	// Walk through all of the digits.
	while (is_digit(peek(lexer))) next(lexer);
	return true;
}

/**
 * Scan what follows the "0x" prefix of a hexadecimal literal, including
 * hexadecimal floating point of the form 0x31a31ff.21p12. A fraction or
 * a p/P exponent makes the literal a real; the exponent itself is
 * written in decimal.
 *
 * @return true if an integer or real token was added, false on error.
 **/
static inline bool scan_hex(Lexer *lexer)
{
	// At least one hex digit is required after the prefix.
	char first = next(lexer);
	if (!is_hex(first))
	{
		backtrack(lexer);
		return add_error_token_at(lexer, lexer->current, 1, "'0x' starts a hexadecimal number, so the next character should be 0-9, a-f or A-F.");
	}
	while (is_hex_or_(peek(lexer)))
	{
		next(lexer);
	}

	bool is_float = false;
	// A single '.' starts a fraction, while ".." is left for the caller.
	if (peek(lexer) == '.' && peek_next(lexer) != '.')
	{
		is_float = true;
		next(lexer);
		char after_dot = peek(lexer);
		if (after_dot == '_') return add_error_token(lexer, "Can't parse this as a floating point value due to the '_' directly after decimal point.");
		if (is_hex(after_dot)) next(lexer);
		while (is_hex_or_(peek(lexer)))
		{
			next(lexer);
		}
	}

	// Optional exponent.
	switch (peek(lexer))
	{
		case 'p':
		case 'P':
			is_float = true;
			if (!scan_exponent(lexer)) return false;
			break;
		default:
			break;
	}

	if (prev(lexer) == '_') return add_error_token(lexer, "The number ended with '_', but that character needs to be between, not after, digits.");
	if (!scan_number_suffix(lexer, &is_float)) return false;
	TokenType literal_type = is_float ? TOKEN_REAL : TOKEN_INTEGER;
	return add_token(lexer, literal_type, lexer->lexing_start);
}

/**
 * Scan a decimal integer or floating point literal. The read position
 * must be on a digit when called.
 *
 * @return true if an integer or real token was added, false on error.
 */
static inline bool scan_dec(Lexer *lexer)
{
	assert(is_digit(peek(lexer)));

	// The literal starts with a digit, so a leading '_' cannot occur here.
	while (is_digit_or_(peek(lexer)))
	{
		next(lexer);
	}

	bool is_float = false;

	// A single dot means a float. This rules out member access on
	// literals such as "123.sizeof", which is an accepted limitation.
	if (peek(lexer) == '.' && peek_next(lexer) != '.')
	{
		is_float = true;
		// Walk past the '.'
		next(lexer);
		// 123._32 is not allowed.
		if (peek(lexer) == '_') return add_error_token(lexer, "Can't parse this as a floating point value due to the '_' directly after decimal point.");
		// Zero digits are fine here, so "123." is a floating point number.
		while (is_digit_or_(peek(lexer)))
		{
			next(lexer);
		}
	}

	// Both 123e1 and 123.e1 are floating point, so an exponent
	// always turns the literal into a float.
	switch (peek(lexer))
	{
		case 'e':
		case 'E':
			is_float = true;
			if (!scan_exponent(lexer)) return false;
			break;
		default:
			break;
	}

	if (prev(lexer) == '_') return add_error_token(lexer, "The number ended with '_', but that character needs to be between, not after, digits.");
	if (!scan_number_suffix(lexer, &is_float)) return false;
	TokenType literal_type = is_float ? TOKEN_REAL : TOKEN_INTEGER;
	return add_token(lexer, literal_type, lexer->lexing_start);
}

/**
 * Scan a number literal, picking the scanner from an initial zero prefix:
 * 0x... -> Hex
 * 0o... -> Octal
 * 0b... -> Binary
 * (the prefix letter may also be upper case)
 *
 * Everything else is scanned as decimal.
 *
 * Further encodings, for example Base64, could be added here in the
 * same way.
 */
static inline bool scan_digit(Lexer *lexer)
{
	if (peek(lexer) != '0') return scan_dec(lexer);

	char radix = peek_next(lexer);
	if (radix == 'x' || radix == 'X')
	{
		skip(lexer, 2);
		return scan_hex(lexer);
	}
	if (radix == 'o' || radix == 'O')
	{
		skip(lexer, 2);
		return scan_oct(lexer);
	}
	if (radix == 'b' || radix == 'B')
	{
		skip(lexer, 2);
		return scan_binary(lexer);
	}
	return scan_dec(lexer);
}

// --- Character & string scan

/**
 * Read exactly `positions` hex digits from the read position.
 *
 * @return the value of the digits, or -1 if a non hex character was
 *         found; in that case the read position is left on that character.
 */
static inline int64_t scan_hex_literal(Lexer *lexer, int positions)
{
	int64_t value = 0;
	int remaining = positions;
	while (remaining > 0)
	{
		int nibble = char_to_nibble(peek(lexer));
		if (nibble < 0) return -1;
		next(lexer);
		value = (value << 4U) + nibble;
		remaining--;
	}
	return value;
}

/**
 * Decode a multi byte UTF-8 sequence whose lead byte `c` has already
 * been consumed; the continuation bytes are read from the lexer.
 *
 * @param lexer the lexer
 * @param c the lead byte of the sequence
 * @return the decoded value, 0xFFFD if the buffer ends inside the
 *         sequence, or -1 after adding an error token for an invalid
 *         lead or continuation byte.
 */
static inline int64_t scan_utf8(Lexer *lexer, unsigned char c)
{
	int utf8_bytes;
	unsigned char payload_mask;
	uint64_t result;

	// The lead byte decides the sequence length and how many of its
	// low bits carry payload.
	if (c < 0xc0 || c > 0xfd) goto ERROR;
	if (c <= 0xdf)
	{
		utf8_bytes = 2;
		payload_mask = 0x1f;
	}
	else if (c <= 0xef)
	{
		utf8_bytes = 3;
		payload_mask = 0xf;
	}
	else if (c <= 0xf7)
	{
		utf8_bytes = 4;
		payload_mask = 0x7;
	}
	else if (c <= 0xfb)
	{
		utf8_bytes = 5;
		payload_mask = 0x3;
	}
	else
	{
		utf8_bytes = 6;
		payload_mask = 0x1;
	}
	result = (uint64_t)(c & payload_mask);

	// Each continuation byte must look like 10xxxxxx and adds six bits.
	for (int remaining = utf8_bytes - 1; remaining > 0; remaining--)
	{
		result <<= 6U;
		if (peek(lexer) == '\0') return 0xFFFD;
		c = (unsigned char)next(lexer);
		if ((c & 0xc0) != 0x80) goto ERROR;
		result += c & 0x3f;
	}
	return (int64_t)result;
ERROR:
	add_error_token(lexer, "Invalid UTF-8 sequence.");
	return -1;
}

/**
 * Scan a character literal and add a TOKEN_CHAR_LITERAL with its value and width.
 * NOTE(review): presumably called with the opening ' already consumed — confirm with the caller.
 *
 * Rules:
 * 1. If ASCII or \xAB, accept up to 8 characters (16 with Int128 support), size is char, ushort, uint, ulong, UInt128
 * 2. If UTF8, accept 1 UTF character, size is ushort normally, ulong on wide UTF characters, no additional characters accepted.
 * 3. If \uABCD, convert to 16 bits, size is ushort, no additional characters accepted.
 * 4. If \U01234567, convert to 32 bits, size is uint, no additional characters accepted.
 *
 * @param lexer the lexer
 * @return true if the literal token was added, false if an error token was added.
 */
static inline bool scan_char(Lexer *lexer)
{

	// Handle the problem with zero size character literal first.
	if (match(lexer, '\''))
	{
		return add_error_token(lexer, "The character literal was empty.");
	}

	// Number of bytes in the literal so far.
	int width = 0;
	char c;
	// Accumulated value, most recently read character in the lowest byte.
	Int128 b = { 0, 0 };

	while ((c = next(lexer)) != '\'')
	{
		// End of file may occur:
		if (c == '\0')
		{
			backtrack(lexer);
			return add_error_token(lexer, "The character literal did not terminate.");
		}
		// We might exceed the width that we allow.
		if (width > 15) return add_error_token(lexer, "The character literal exceeds 16 characters.");
		// Handle (expected) utf-8 characters.
		if ((unsigned)c >= (unsigned)0x80)
		{
			// A unicode character must be the only character in the literal.
			if (width != 0) goto UNICODE_IN_MULTI;
			const char *start = lexer->current;
			int64_t utf8 = scan_utf8(lexer, (unsigned char)c);
			// scan_utf8 has already added the error token.
			if (utf8 < 0) return false;
			if (!match(lexer, '\''))
			{
				// At end of file, go around the loop so the end of file check reports it.
				if (peek(lexer) == '\0') continue;
				// Move the token start so the error covers what follows the first lead byte.
				lexer->lexing_start = start;
				return add_error_token(lexer, "Unicode character literals may only contain one character, "
											  "please remove the additional ones or use all ASCII.");
			}
			b.low = (uint64_t) utf8;
			width = utf8 > 0xffff ? 4 : 2;
			goto DONE;
		}
		// Parse the escape code, ' ' is used to mean "no escape".
		signed char escape = ' ';
		const char *start = lexer->current;
		if (c == '\\')
		{
			assert(c == '\\');
			c = next(lexer);
			escape = is_valid_escape(c);
			if (escape == -1)
			{
				backtrack(lexer);
				// Let the error start at the backslash.
				lexer->lexing_start = start - 1;
				if (c > ' ' && c <= 127)
				{
					return add_error_token(lexer, "Invalid escape sequence '\\%c'.", c);
				}
				return add_error_token_at(lexer, start, 1, "An escape sequence was expected after '\\'.");
			}
		}
		switch (escape)
		{
			case 'x':
			{
				int64_t hex = scan_hex_literal(lexer, 2);
				if (hex < 0)
				{
					lexer->lexing_start = start - 1;
					// Fix underlining if this is an unfinished escape.
					return add_error_token(lexer, "Expected a two character hex value after \\x.");
				}
				// We can now reassign c and use the default code.
				c = (char)hex;
				break;
			}
			case 'u':
			case 'U':
			{
				// First check that we don't have any characters previous to this one.
				if (width != 0) goto UNICODE_IN_MULTI;
				// \u takes four hex digits (2 bytes), \U eight (4 bytes).
				int bytes = escape == 'U' ? 4 : 2;
				int64_t hex = scan_hex_literal(lexer, bytes * 2);
				// The hex parsing may have failed, lacking more hex chars.
				if (hex < 0)
				{
					lexer->lexing_start = start - 1;
					return add_error_token(lexer, "Expected %s character hex value after \\%c.",
										   escape == 'u' ? "a four" : "an eight", escape);
				}
				// If we don't see the end here, then something is wrong.
				if (!match(lexer, '\''))
				{
					// It may be the end of the file, if so use the default handling by invoking "continue"
					if (peek(lexer) == '\0') continue;
					// Otherwise step forward and mark it as an error.
					next(lexer);
					lexer->lexing_start = lexer->current - 1;
					return add_error_token(lexer,
					                       "Character literals with '\\%c' can only contain one character, please remove this one.",
					                       escape);
				}
				// Assign the value and go to DONE.
				b.low = (uint64_t) hex;
				width = bytes;
				goto DONE;
			}
			case ' ':
				// No escape, a regular character.
				break;
			default:
				// A simple escape: is_valid_escape returned the resulting character.
				c = (signed char)escape;
				break;
		}
		// Default handling here: shift in the character as the new lowest byte.
		width++;
		b = i128_shl64(b, 8);
		b = i128_add64(b, (unsigned char)c);
	}

	assert(width > 0 && width <= 16);
	// More than 8 bytes only fits when the target has 128 bit integers.
	if (width > 8 && !platform_target.int128)
	{
		return add_error_token(lexer, "Character literal exceeded 8 characters.");
	}
DONE:
	add_generic_token(lexer, TOKEN_CHAR_LITERAL);
	lexer->latest_token_data->char_value = b;
	lexer->latest_token_data->width = (char)width;
	return true;

UNICODE_IN_MULTI:
	return add_error_token(lexer, "A multi-character literal may not contain unicode characters.");
}

/**
 * If only spaces, tabs or form feeds separate the read position from the
 * next '\n', record that line end and move the read position to the
 * character after it. Otherwise the lexer is left untouched.
 */
static inline void skip_first_line_if_empty(Lexer *lexer)
{
	for (const char *cursor = lexer->current; ; cursor++)
	{
		char c = *cursor;
		// Counts as whitespace, keep looking.
		if (c == ' ' || c == '\t' || c == '\f') continue;
		if (c == '\n')
		{
			// Line end: record it and continue lexing right after it.
			lexer->current = cursor;
			lexer_store_line_end(lexer);
			lexer->current++;
			return;
		}
		if (c == '\r')
		{
			UNREACHABLE
		}
		// Something other than whitespace: nothing is skipped.
		return;
	}
}

/**
 * Decode one escape sequence and append the result to `dest`.
 *
 * @param dest output buffer
 * @param src points at the character directly after the backslash
 * @param pos in/out write position in `dest`, advanced by the bytes written
 * @return the number of characters of `src` that were used, or -1 if the
 *         escape or one of its hex digits is invalid.
 *
 * \x, \u and \U take 2, 4 and 8 hex digits; their value is written UTF-8
 * encoded. Any other valid escape writes the single character returned
 * by is_valid_escape.
 */
static int append_esc_string_token(char *restrict dest, const char *restrict src, size_t *pos)
{
	signed char scanned_char = is_valid_escape(src[0]);
	if (scanned_char < 0) return -1;

	// How many hex digits follow the escape character.
	int hex_digits;
	switch (scanned_char)
	{
		case 'x':
			hex_digits = 2;
			break;
		case 'u':
			hex_digits = 4;
			break;
		case 'U':
			hex_digits = 8;
			break;
		default:
			dest[(*pos)++] = scanned_char;
			return 1;
	}

	// Read the digits left to right, giving up at the first invalid one.
	uint64_t unicode_char = 0;
	for (int i = 1; i <= hex_digits; i++)
	{
		int nibble = char_to_nibble(src[i]);
		if (nibble < 0) return -1;
		unicode_char = (unicode_char << 4U) + (unsigned)nibble;
	}

	// Emit the value as one to four UTF-8 bytes.
	size_t at = *pos;
	if (unicode_char < 0x80U)
	{
		dest[at++] = (char)unicode_char;
	}
	else if (unicode_char < 0x800U)
	{
		dest[at++] = (char)(0xC0U | (unicode_char >> 6U));
		dest[at++] = (char)(0x80U | (unicode_char & 0x3FU));
	}
	else if (unicode_char < 0x10000U)
	{
		dest[at++] = (char)(0xE0U | (unicode_char >> 12U));
		dest[at++] = (char)(0x80U | ((unicode_char >> 6U) & 0x3FU));
		dest[at++] = (char)(0x80U | (unicode_char & 0x3FU));
	}
	else
	{
		dest[at++] = (char)(0xF0U | (unicode_char >> 18U));
		dest[at++] = (char)(0x80U | ((unicode_char >> 12U) & 0x3FU));
		dest[at++] = (char)(0x80U | ((unicode_char >> 6U) & 0x3FU));
		dest[at++] = (char)(0x80U | (unicode_char & 0x3FU));
	}
	*pos = at;
	// The escape character itself plus its hex digits.
	return hex_digits + 1;
}


/**
 * Pre-scan the body of a multi-line string without consuming anything.
 *
 * @param current where the string body starts
 * @param end_ref out: where the content to copy ends
 * @param min_indent_ref out: the smallest count of leading spaces/tabs seen
 *                       on a line that has other content, 0 if none was
 *                       measured or the end of the buffer was reached
 * @return an estimate of the content length, used by the caller to size
 *         its destination buffer.
 */
static inline size_t scan_multiline_indent(const char *current, const char **end_ref, int32_t *min_indent_ref)
{
	// Initial scan.
	char c;
	bool multi_line = false;
	// Leading whitespace count on the current line, -1 once other content was seen.
	int32_t current_indent = 0;
	int32_t min_indent = INT32_MAX;
	size_t len = 0;
	while ((c = (current++)[0]) != '\0')
	{
		// Stop at the closing '"""'; current is now on its second '"'.
		if (c == '"' && current[0] == '"' && current[1] == '"') break;
		// 1. If we've only seen whitespace so far
		if (current_indent >= 0)
		{
			// 2. More whitespace, so increase indent
			if (is_whitespace(c))
			{
				if (c == ' ' || c == '\t') current_indent++;
			}
			else
			{
				// 3. Otherwise, update if smaller before
				if (current_indent < min_indent) min_indent = current_indent;
				// 4. And disable further tracking.
				current_indent = -1;
			}
			// 5. Just continue if escape, this makes
			//    escape automatically track as non-whitespace
			//    (note that this skips the length increase below).
			if (c == '\\') continue;
		}
		// 6. On new line, set multi_line to true and reset indent.
		if (c == '\n')
		{
			multi_line = true;
			current_indent = 0;
		}
		// 7. Increase our conservative estimate of the length
		//    which does not properly take into account indent
		//    and escapes.
		len++;
	}

	// 8. If we ended on EOF, report the '\0' as the end and no indent.
	if (c == '\0')
	{
		current--;
		*end_ref = current;
		*min_indent_ref = 0;
		return len;
	}
	// 9. We're stopping at the second '"' so we need to back up 1
	current -= 1;

	// 10. We have four cases:
	//     a. Single row -> no action
	//     b. Characters on same line before ending chars -> no action
	//     c. No space or characters before the ending chars
	//     d. Space before the ending chars

	// 11. This will handle c & d: only whitespace was seen on the last line.
	if (multi_line && current_indent >= 0)
	{
		// Just walk back until '\n' is found.
		while (current[0] != '\n') current--;
	}

	*end_ref = current;
	*min_indent_ref = min_indent == INT32_MAX ? 0 : min_indent;
	return len;
}

/**
 * Consume characters until three '"' have been passed.
 *
 * @param lexer the lexer
 * @param error_on_eof whether to add an error token if the buffer ends first
 * @return true once three '"' were consumed, false if the end of the
 *         buffer was reached (the read position is then left on the '\0').
 */
bool scan_consume_end_of_multiline(Lexer *lexer, bool error_on_eof)
{
	for (int quotes_left = 3; quotes_left > 0; )
	{
		char c = next(lexer);
		if (c == '"')
		{
			quotes_left--;
			continue;
		}
		if (c != '\0') continue;
		// End of buffer: step back onto the '\0'.
		backtrack(lexer);
		if (error_on_eof)
		{
			return add_error_token_at(lexer, lexer->current - 1, 1, "The multi-line string unexpectedly ended. "
			                                                     "Did you forget a '\"\"\"' somewhere?");
		}
		return false;
	}
	return true;
}

/**
 * Scan a multi-line string between """ ... """
 * - Remove initial newline & space on the first """
 *   if the text does not start on the first row.
 * - Remove space before the last """ if the text
 *   does not end on the last row.
 * - Remove last trailing \n
 * - Skip \r
 *
 * The decoded text is stored in arena memory and added as a TOKEN_STRING
 * together with its length.
 *
 * @param lexer the lexer
 * @return true if the string token was added, false if an error token was added.
 */
static inline bool scan_multiline_string(Lexer *lexer)
{
	// 1. Step past '""'
	next(lexer);
	next(lexer);

	// 2. See if the first line only has space and line end.
	skip_first_line_if_empty(lexer);

	// 3. Perform a scan to determine actual start and end of what we want
	//    to parse
	const char *end;
	int32_t min_indent;
	size_t len = scan_multiline_indent(lexer->current, &end, &min_indent);

	// Allocate result, using the estimated length plus room for the terminator.
	char *destination = malloc_arena(len + 1);

	// Number of characters consumed on the current line.
	int line = 0;
	char c;
	// From here on len is the number of bytes written to destination.
	len = 0;
	while (lexer->current < end)
	{
		c = peek(lexer);

		// Ok, we reached the end of line
		// update the line end and store it in the resulting buffer.
		if (c == '\n')
		{
			lexer_store_line_end(lexer);
			next(lexer);
			destination[len++] = c;
			line = 0;
			continue;
		}

		// By now it's safe to advance one step.
		next(lexer);
		line++;

		// We reached EOF, or escape + end of file.
		if (c == '\0' || (c == '\\' && peek(lexer) == '\0'))
		{
			return add_error_token_at(lexer, lexer->current - 1, 1, "The multi-line string unexpectedly ended. "
			                                                     "Did you forget a '\"\"\"' somewhere?");
		}

		// An escape sequence was reached.
		if (c == '\\')
		{
			// Handle the empty escape: we simply skip.
			if (peek(lexer) == '|')
			{
				next(lexer);
				continue;
			}
			int scanned = append_esc_string_token(destination, lexer->current, &len);
			if (scanned < 0)
			{
				add_error_token_at(lexer, lexer->current - 1, 2, "Invalid escape in string.");
				// Try to get past the closing '"""'; no second error if that fails.
				scan_consume_end_of_multiline(lexer, false);
				return false;
			}
			lexer->current += scanned;
			continue;
		}
		// Now first we skip any empty space if line has not been reached,
		// i.e. drop the first min_indent characters of each line.
		if (line <= min_indent)
		{
			assert(is_whitespace(c));
			continue;
		}
		destination[len++] = c;
	}
	// Consume the rest up to and including the closing '"""'.
	if (!scan_consume_end_of_multiline(lexer, true)) return false;
	destination[len] = 0;
	add_token(lexer, TOKEN_STRING, destination);
	lexer->latest_token_data->strlen = len;
	return true;
}

// Advance until the read position is on a '"' or on the end of the buffer,
// recording every line end passed on the way. The '"' is not consumed.
static inline void consume_to_end_quote(Lexer *lexer)
{
	for (;;)
	{
		char c = peek(lexer);
		if (c == '\0' || c == '"') return;
		if (c == '\n') lexer_store_line_end(lexer);
		next(lexer);
	}
}

/**
 * Scan a regular "..." string, or hand over to the multi-line scanner
 * when the next two characters are '"' as well.
 *
 * The string is scanned twice: a first pass finds where it ends, which
 * also gives the size of the destination buffer, and a second pass
 * decodes escapes into that buffer. The decoded text is added as a
 * TOKEN_STRING together with its length.
 *
 * @param lexer the lexer
 * @return true if the string token was added, false if an error token was added.
 */
static inline bool scan_string(Lexer *lexer)
{
	if (peek(lexer) == '"' && peek_next(lexer) == '"')
	{
		return scan_multiline_string(lexer);
	}
	char c = 0;
	const char *current = lexer->current;
	// First pass: find the closing quote.
	while ((c = *(current++)) != '"')
	{
		if (c == '\n' || c == '\0')
		{
			// Make sure the second pass reaches this character and reports it.
			current++;
			break;
		}
		if (c == '\\')
		{
			// Skip whatever character is escaped, not only '"': in "a\\"
			// the second backslash must not be taken as escaping the
			// closing quote. A line end or end of file is left for the
			// check above.
			c = *current;
			if (c != '\n' && c != '\0') current++;
			continue;
		}
	}
	const char *end = current - 1;
	char *destination = malloc_arena((size_t)(end - lexer->current + 1));
	size_t len = 0;
	// Second pass: copy characters and decode escapes.
	while (lexer->current < end)
	{
		c = next(lexer);
		if (c == '\0' || (c == '\\' && peek(lexer) == '\0'))
		{
			if (c == '\0') backtrack(lexer);
			add_error_token_at(lexer, lexer->current - 1, 1, "The end of the file was reached "
			                                                 "while parsing the string. "
			                                                 "Did you forget (or accidentally add) a '\"' somewhere?");
			consume_to_end_quote(lexer);
			return false;
		}
		if (c == '\n' || (c == '\\' && peek(lexer) == '\n'))
		{
			add_error_token_at(lexer, lexer->current - 1, 1, "The end of the line was reached "
			                                                 "while parsing the string. "
			                                                 "Did you forget (or accidentally add) a '\"' somewhere?");
			// Step back so that consume_to_end_quote sees and records the line end.
			lexer->current--;
			consume_to_end_quote(lexer);
			return false;
		}
		if (c == '\\')
		{
			int scanned = append_esc_string_token(destination, lexer->current, &len);
			if (scanned < 0)
			{
				add_error_token_at(lexer, lexer->current - 1, 2, "Invalid escape in string.");
				consume_to_end_quote(lexer);
				return false;
			}
			lexer->current += scanned;
			continue;
		}
		destination[len++] = c;
	}
	// Skip the `"`
	next(lexer);
	destination[len] = 0;
	add_token(lexer, TOKEN_STRING, destination);
	lexer->latest_token_data->strlen = len;
	return true;
}

/**
 * Scan a raw string delimited by backticks. Inside the string a doubled
 * backtick stands for a single one; no other escapes are processed.
 * The text is copied to arena memory and added as a TOKEN_STRING
 * together with its length.
 *
 * @return true if the string token was added, false if the end of the
 *         buffer was reached first (an error token is added).
 */
static inline bool scan_raw_string(Lexer *lexer)
{
	// Find the terminating backtick, i.e. one not followed by another.
	for (;;)
	{
		char c = next(lexer);
		if (c == '`' && peek(lexer) != '`') break;
		if (c == '\0')
		{
			backtrack(lexer);
			return add_error_token_at(lexer, lexer->lexing_start , 1, "Reached the end of the file looking for "
																	  "the end of the raw string that starts "
																	  "here. Did you forget a '`' somewhere?");
		}
		// Doubled backtick: consume the second one as well.
		if (c == '`') next(lexer);
	}

	// The content sits between the first character of the token and the
	// terminating backtick that was just consumed.
	const char *src = lexer->lexing_start + 1;
	const char *src_end = lexer->current - 1;
	char *destination = malloc_arena((size_t)(src_end - src) + 1);
	size_t len = 0;
	for (; src < src_end; src++)
	{
		char ch = *src;
		// Collapse a doubled backtick by stepping over its second half.
		if (ch == '`' && src[1] == '`') src++;
		destination[len++] = ch;
	}
	destination[len] = 0;
	add_token(lexer, TOKEN_STRING, destination);
	lexer->latest_token_data->strlen = len;
	return true;
}

/**
 * Scan a hex byte array literal. The read position must be on the opening
 * ' or "; the literal runs to the matching quote character and may contain
 * hex digits and whitespace only. The digit count must be even.
 *
 * On success a TOKEN_BYTES token is added, marked as not base64, with its
 * length set to the number of bytes (two hex digits per byte).
 *
 * @return true if the token was added, false if an error token was added.
 */
static inline bool scan_hex_array(Lexer *lexer)
{
	char start_char = next(lexer); // Step past ' or "
	const char *hexdata = lexer->current;
	char c;
	// Number of hex digits seen.
	uint64_t len = 0;
	while (1)
	{
		c = next(lexer);
		if (c == start_char) break;
		if (c == 0)
		{
			backtrack(lexer);
			lexer->lexing_start = lexer->current - 1;
			return add_error_token(lexer, "The hex string seems to be missing a terminating '%c'", start_char);
		}
		if (is_hex(c))
		{
			len++;
			continue;
		}
		if (!is_whitespace(c))
		{
			// Point the error at the opening quote of the data.
			lexer->lexing_start = hexdata - 1;
			lexer->current = hexdata;
			// Hex digits are a-f and A-F, not the whole alphabet.
			return add_error_token(lexer,
			                       "'%c' isn't a valid hexadecimal digit, all digits should be a-f, A-F and 0-9.",
			                       c);
		}
	}
	if (len % 2)
	{
		return add_error_token(lexer, "The hexadecimal string is not an even length, did you miss a digit somewhere?");
	}
	if (!add_token(lexer, TOKEN_BYTES, lexer->lexing_start)) return false;
	lexer->latest_token_data->is_base64 = false;
	lexer->latest_token_data->len = len / 2;
	return true;
}

/**
 * Scan a base64 byte literal: b64"..." or b64'...'. The leading 'b' has already
 * been consumed and the lexer is positioned at "64". Whitespace inside the
 * literal is ignored and '=' may only appear after the last base64 character.
 *
 * Note that canonical base64 uses at most two '=' of padding; this lexer's
 * stated limit is three, which is what is enforced here.
 *
 * @param lexer the lexer to read from.
 * @return true if a TOKEN_BYTES token was added, otherwise the result of
 *         adding an error token (or false if add_token failed).
 */
static inline bool scan_base64(Lexer *lexer)
{
	next(lexer); // Step past 6
	next(lexer); // Step past 4
	char start_char = next(lexer); // Step past ' or "
	const char *b64data = lexer->current;
	char c;
	unsigned end_len = 0;
	uint64_t len = 0;
	while (1)
	{
		c = next(lexer);
		if (c == start_char) break;
		if (c == 0)
		{
			// Ran into the end of the file: report at the last character.
			backtrack(lexer);
			lexer->lexing_start = lexer->current - 1;
			return add_error_token(lexer, "The base64 string seems to be missing a terminating '%c'", start_char);
		}
		if (is_base64(c))
		{
			// Data is not allowed once the '=' padding has started.
			if (end_len)
			{
				lexer->lexing_start = lexer->current - 1;
				return add_error_token(lexer, "'%c' can't be placed after an ending '='", c);
			}
			len++;
			continue;
		}
		if (c == '=')
		{
			// end_len holds the number of '=' already seen, so with three
			// accepted this one would be the fourth and must be rejected.
			if (end_len >= 3)
			{
				lexer->lexing_start = b64data - 1;
				lexer->current = b64data;
				return add_error_token(lexer, "There cannot be more than 3 '=' at the end of a base64 string.");
			}
			end_len++;
			continue;
		}
		if (!is_whitespace(c))
		{
			// Rewind so the error is placed on the opening quote.
			lexer->lexing_start = b64data - 1;
			lexer->current = b64data;
			return add_error_token(lexer, "'%c' is not a valid base64 character.", c);
		}
	}
	// The subtraction below is unsigned: a literal that has more padding than
	// data (e.g. only '=') would otherwise wrap around to a huge length.
	if (end_len > 3 * len)
	{
		return add_error_token(lexer, "The base64 string has more '=' padding than encoded data.");
	}
	uint64_t decoded_len = (3 * len - end_len) / 4;
	if (!add_token(lexer, TOKEN_BYTES, lexer->lexing_start)) return false;
	lexer->latest_token_data->is_base64 = true;
	lexer->latest_token_data->len = decoded_len;
	return true;
}


// --- Lexer doc lexing

/**
 * Step over a run of '*', stopping early at a '*' that is directly followed
 * by '/' so that a closing '* /' is left untouched.
 * @param lexer the lexer to advance.
 */
static void skip_doc_stars(Lexer *lexer)
{
	for (;;)
	{
		if (peek(lexer) != '*') return;
		// Leave the final star of a closing '* /' in place.
		if (peek_next(lexer) == '/') return;
		next(lexer);
	}
}

/**
 * Look ahead, without consuming anything, for a run of zero or more '*'
 * that is directly followed by '/'.
 * @param lexer the lexer to inspect.
 * @return true if such a sequence starts at the current position.
 */
static bool end_of_docs_found(Lexer *lexer)
{
	const char *cursor = lexer->current;
	// Walk across every '*' in front of us.
	while (*cursor == '*') cursor++;
	// Only a '/' right behind the stars ends the docs.
	return *cursor == '/';
}
/**
 * OPTIONALLY consumes the end of the docs and adds a TOKEN_DOCS_END token.
 * The end is one or more '*' directly followed by '/', so any number of
 * additional '*' may precede the closing '* /'.
 * @param lexer the lexer to read from.
 * @return true if the end was found and consumed, false if nothing was consumed.
 */
static bool parse_add_end_of_docs_if_present(Lexer *lexer)
{
	int stars = 0;
	while (lexer->current[stars] == '*') stars++;
	// We need at least one star, and a '/' right behind the last one.
	if (stars == 0 || lexer->current[stars] != '/') return false;
	// Consume the stars and the '/', then record the end token.
	skip(lexer, stars + 1);
	add_token(lexer, TOKEN_DOCS_END, lexer->lexing_start);
	lexer->lexing_start = lexer->current;
	return true;
}


/**
 * Consume the '\n' that ends a doc line: store the line end, add a
 * TOKEN_DOCS_EOL token and then skip the whitespace and leading stars
 * at the start of the following line.
 * @param lexer the lexer, which must be positioned at a '\n'.
 */
static void parse_add_end_of_doc_line(Lexer *lexer)
{
	assert(*lexer->current == '\n');
	// The line end has to be stored while we are still at the '\n',
	// since the next line is recorded as starting one character further on.
	lexer_store_line_end(lexer);
	skip(lexer, 1);
	add_token(lexer, TOKEN_DOCS_EOL, lexer->lexing_start);
	lexer->lexing_start = lexer->current;
	// Move to the first meaningful character of the next line.
	skip_whitespace(lexer, LEX_DOCS);
	skip_doc_stars(lexer);
}

/**
 * Parse the free text that ends a directive or makes up a plain line, e.g.
 * for "* @param lexer The lexer used." the remainder is "The lexer used."
 * and for "*** Hello world" the remainder is "Hello world".
 * A TOKEN_DOCS_LINE token is added if at least one character was read.
 *
 * @param lexer the lexer to read from.
 * @return DOC_END_EOL at a '\n' (not consumed), DOC_END_EOF at the end of the
 *         file, or DOC_END_LAST if the end of the docs was found and consumed.
 */
static DocEnd parse_doc_remainder(Lexer *lexer)
{
	// The text starts at the first non-whitespace character.
	skip_whitespace(lexer, LEX_DOCS);
	lexer->lexing_start = lexer->current;

	int consumed = 0;
	for (;;)
	{
		char c = peek(lexer);
		bool at_docs_end = c == '*' && end_of_docs_found(lexer);
		if (at_docs_end || c == '\n' || c == '\0')
		{
			// Every terminator first flushes the text gathered so far.
			if (consumed > 0)
			{
				add_token(lexer, TOKEN_DOCS_LINE, 0);
				lexer->lexing_start = lexer->current;
			}
			if (c == '\n') return DOC_END_EOL;
			if (c == '\0') return DOC_END_EOF;
			if (parse_add_end_of_docs_if_present(lexer)) return DOC_END_LAST;
			// Otherwise treat the '*' like any other character.
		}
		consumed++;
		next(lexer);
	}
}

/**
 * Parse the list of an errors directive: type identifiers separated by '|',
 * e.g. "ErrorA | ErrorB". The list ends at the first token that breaks this
 * pattern, and whatever follows is handed to parse_doc_remainder.
 *
 * @param lexer the lexer to read from.
 * @return DOC_END_ERROR if scanning a token failed, otherwise the result of
 *         parsing the remainder of the line.
 */
static DocEnd parse_doc_error_directive(Lexer *lexer)
{
	do
	{
		// The error name.
		skip_whitespace(lexer, LEX_DOCS);
		if (!lexer_scan_token_inner(lexer, LEX_DOCS)) return DOC_END_ERROR;
		if (*lexer->latest_token_type != TOKEN_TYPE_IDENT) break;

		// Without a following '|' the list is complete.
		skip_whitespace(lexer, LEX_DOCS);
		if (peek(lexer) != '|') break;
		if (!lexer_scan_token_inner(lexer, LEX_DOCS)) return DOC_END_ERROR;

		// Keep going only for a plain '|', something like "|=" ends the list.
	} while (*lexer->latest_token_type == TOKEN_BIT_OR);

	return parse_doc_remainder(lexer);
}

/**
 * Contract directives use the style: "@require a > 2, b && c == true : "Must work foo"
 * The rest of the line is lexed as regular tokens, except that "return" is
 * turned into an ordinary identifier.
 *
 * @param lexer the lexer to read from.
 * @return DOC_END_EOL at a '\n' (not consumed), DOC_END_EOF at the end of the
 *         file, DOC_END_LAST if the end of the docs was consumed, or
 *         DOC_END_ERROR if scanning a token failed.
 */
static DocEnd parse_doc_contract_directive(Lexer *lexer)
{
	for (;;)
	{
		skip_whitespace(lexer, LEX_DOCS);

		// Check for anything that ends the directive.
		char c = peek(lexer);
		if (c == '\n') return DOC_END_EOL;
		if (c == '\0') return DOC_END_EOF;
		// A '*' that does not close the docs is lexed as a normal token below.
		if (c == '*' && parse_add_end_of_docs_if_present(lexer)) return DOC_END_LAST;

		if (!lexer_scan_token_inner(lexer, LEX_DOCS)) return DOC_END_ERROR;

		// Inside a contract "return" is an identifier, not a keyword.
		if (*lexer->latest_token_type == TOKEN_RETURN) *lexer->latest_token_type = TOKEN_IDENT;
	}
}

/**
 * Parse what follows a param directive: a single token for the name,
 * then the rest of the line as free text.
 *
 * @param lexer the lexer to read from.
 * @return DOC_END_ERROR if the name could not be scanned, otherwise the
 *         result of parsing the remainder of the line.
 */
static DocEnd parse_doc_param_directive(Lexer *lexer)
{
	skip_whitespace(lexer, LEX_DOCS);
	bool name_scanned = lexer_scan_token_inner(lexer, LEX_DOCS);
	return name_scanned ? parse_doc_remainder(lexer) : DOC_END_ERROR;
}

/**
 * Parse a doc directive. The lexer is positioned at the '@'. A
 * TOKEN_DOCS_DIRECTIVE token is added for the '@', the directive name is
 * scanned as an identifier, and the rest of the line is parsed according
 * to which directive it is.
 *
 * @param lexer the lexer to read from.
 * @return DOC_END_ERROR if no directive name follows the '@' or the name
 *         could not be scanned, otherwise the result of the directive's parser.
 */
static DocEnd parse_doc_directive(Lexer *lexer)
{
	// We expect a directive here.
	if (!is_letter(peek_next(lexer)))
	{
		// The bool from add_error_token must not be returned as a DocEnd:
		// false would be read as DOC_END_EOF, which the caller treats as
		// "keep going", so the same '@' would be parsed over and over.
		add_error_token(lexer, "Expected doc directive here.");
		return DOC_END_ERROR;
	}
	lexer->lexing_start = lexer->current;
	// First parse the '@'
	skip(lexer, 1);
	add_token(lexer, TOKEN_DOCS_DIRECTIVE, "@");
	lexer->lexing_start = lexer->current;

	// Then our keyword
	if (!scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST, TOKEN_TYPE_IDENT, 0)) return DOC_END_ERROR;

	assert(*lexer->latest_token_type == TOKEN_IDENT || *lexer->latest_token_type == TOKEN_RETURN);

	const char *last_token_string = lexer->latest_token_data->string;

	if (*lexer->latest_token_type == TOKEN_RETURN)
	{
		// Backpatch the type: as a directive name "return" is a plain identifier.
		*lexer->latest_token_type = TOKEN_IDENT;
		return parse_doc_remainder(lexer);
	}
	// The directive names are matched by comparing string pointers.
	if (kw_errors == last_token_string)
	{
		return parse_doc_error_directive(lexer);
	}
	if (last_token_string == kw_require || last_token_string == kw_ensure || last_token_string == kw_reqparse)
	{
		return parse_doc_contract_directive(lexer);
	}
	if (last_token_string == kw_param)
	{
		// The variable
		return parse_doc_param_directive(lexer);
	}
	// Any other directive is followed by free text only.
	return parse_doc_remainder(lexer);
}

/**
 * Parse a doc comment, i.e. a comment opened by '/' followed by two '*'.
 * The opening has already been consumed. A TOKEN_DOCS_START token is added
 * and the contents are then handled one line segment at a time.
 *
 * @param lexer the lexer to read from.
 * @return true if the end of the docs was found, false if a segment failed
 *         to parse; at the end of the file the result of adding an error
 *         token is returned.
 **/
static bool parse_doc_comment(Lexer *lexer)
{
	add_token(lexer, TOKEN_DOCS_START, lexer->lexing_start);

	// The opening may be followed by further stars.
	skip_doc_stars(lexer);

	for (;;)
	{
		skip_whitespace(lexer, LEX_DOCS);

		// Running out of input means the docs were never closed.
		if (reached_end(lexer))	return add_error_token(lexer, "Missing '*/' to end the doc comment.");

		// Are we at the closing marker?
		if (parse_add_end_of_docs_if_present(lexer)) return true;

		// Otherwise parse one segment, chosen by its first character.
		char first = peek(lexer);
		DocEnd end;
		if (first == '@')
		{
			end = parse_doc_directive(lexer);
		}
		else if (first == '\n')
		{
			end = DOC_END_EOL;
		}
		else
		{
			end = parse_doc_remainder(lexer);
		}

		// Act on how the segment ended.
		switch (end)
		{
			case DOC_END_EOL:
				// Consume the line end and prepare for the next line.
				parse_add_end_of_doc_line(lexer);
				break;
			case DOC_END_LAST:
				return true;
			case DOC_END_ERROR:
				return false;
			case DOC_END_EOF:
				// Nothing to do, the check at the top of the loop reports it.
				break;
			default:
				UNREACHABLE
		}
	}
}

// --- Lexer public functions


/**
 * Return the token at the lexer's current token index and move the index
 * one step forward.
 */
Token lexer_advance(Lexer *lexer)
{
	Token token = { .id.index = lexer->lexer_index };
	token.type = (TokenType)(*toktypeptr(lexer->lexer_index));
	lexer->lexer_index += 1;
	return token;
}


/**
 * Scan a single token starting at the current position, after skipping
 * whitespace, and add it to the token stream.
 *
 * @param lexer the lexer to read from.
 * @param mode LEX_NORMAL for regular code, LEX_DOCS when lexing inside a doc
 *        comment. In LEX_DOCS mode '/' never starts a comment.
 * @return the result of adding or scanning the token. At the end of the file
 *         a TOKEN_EOF token is added and false is returned.
 */
static bool lexer_scan_token_inner(Lexer *lexer, LexMode mode)
{
	// Now skip the whitespace.
	skip_whitespace(lexer, mode);

	// Point start to the first non-whitespace character.
	lexer->lexing_start = lexer->current;

	if (reached_end(lexer))
	{
		assert(mode == LEX_NORMAL);
		// Always false: the EOF token is added, but the caller must stop.
		return add_token(lexer, TOKEN_EOF, "\n") && false;
	}

	char c = next(lexer);
	switch (c)
	{
		case '@':
			return add_token(lexer, TOKEN_AT, "@");
		case '\'':
			return scan_char(lexer);
		case '`':
			return scan_raw_string(lexer);
		case '"':
			return scan_string(lexer);
		case '#':
			// NOTE(review): '$' is passed as the prefix although the token starts
			// with '#' - confirm that this is what scan_ident expects.
			return scan_ident(lexer, TOKEN_HASH_IDENT, TOKEN_HASH_CONST_IDENT, TOKEN_HASH_TYPE_IDENT, '$');
		case '$':
			if (match(lexer, '{')) return add_token(lexer, TOKEN_PLACEHOLDER, "${");
			if (match(lexer, '$'))
			{
				if (is_letter(peek(lexer)))
				{
					// "$$" is its own token, followed by a regular identifier.
					add_token(lexer, TOKEN_BUILTIN, "$$");
					lexer->lexing_start = lexer->current;
					return scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST_IDENT, TOKEN_TYPE_IDENT, 0);
				}
				return add_error_token(lexer, "Expected a letter after $$.");
			}
			return scan_ident(lexer, TOKEN_CT_IDENT, TOKEN_CT_CONST_IDENT, TOKEN_CT_TYPE_IDENT, '$');
		case ',':
			return add_token(lexer, TOKEN_COMMA, ",");
		case ';':
			return add_token(lexer, TOKEN_EOS, ";");
		case '{':
			return match(lexer, '|') ? add_token(lexer, TOKEN_LBRAPIPE, "{|") : add_token(lexer, TOKEN_LBRACE, "{");
		case '}':
			return add_token(lexer, TOKEN_RBRACE, "}");
		case '(':
			return add_token(lexer, TOKEN_LPAREN, "(");
		case ')':
			return add_token(lexer, TOKEN_RPAREN, ")");
		case '[':
			if (match(lexer, '<')) return add_token(lexer, TOKEN_LVEC, "[<");
			return add_token(lexer, TOKEN_LBRACKET, "[");
		case ']':
			return add_token(lexer, TOKEN_RBRACKET, "]");
		case '.':
			if (match(lexer, '.'))
			{
				if (match(lexer, '.')) return add_token(lexer, TOKEN_ELLIPSIS, "...");
				return add_token(lexer, TOKEN_DOTDOT, "..");
			}
			return add_token(lexer, TOKEN_DOT, ".");
		case '~':
			return add_token(lexer, TOKEN_BIT_NOT, "~");
		case ':':
			return match(lexer, ':') ? add_token(lexer, TOKEN_SCOPE, "::") : add_token(lexer, TOKEN_COLON, ":");
		case '!':
			if (match(lexer, '!')) return add_token(lexer, TOKEN_BANGBANG, "!!");
			return match(lexer, '=') ? add_token(lexer, TOKEN_NOT_EQUAL, "!=") : add_token(lexer, TOKEN_BANG, "!");
		case '/':
			// We can't get any directives comments here.
			if (mode != LEX_DOCS)
			{
				if (match(lexer, '/')) return parse_line_comment(lexer);
				if (match(lexer, '*')) return match(lexer, '*') ? parse_doc_comment(lexer) : parse_multiline_comment(lexer);
			}
			return match(lexer, '=') ? add_token(lexer, TOKEN_DIV_ASSIGN, "/=") : add_token(lexer, TOKEN_DIV, "/");
		case '*':
			return match(lexer, '=') ? add_token(lexer, TOKEN_MULT_ASSIGN, "*=") : add_token(lexer, TOKEN_STAR, "*");
		case '=':
			return match(lexer, '=') ? add_token(lexer, TOKEN_EQEQ, "==") : add_token(lexer, TOKEN_EQ, "=");
		case '^':
			return match(lexer, '=') ? add_token(lexer, TOKEN_BIT_XOR_ASSIGN, "^=") : add_token(lexer, TOKEN_BIT_XOR, "^");
		case '?':
			if (match(lexer, '?')) return add_token(lexer, TOKEN_QUESTQUEST, "??");
			return match(lexer, ':') ? add_token(lexer, TOKEN_ELVIS, "?:") : add_token(lexer, TOKEN_QUESTION, "?");
		case '<':
			if (match(lexer, '<'))
			{
				if (match(lexer, '=')) return add_token(lexer, TOKEN_SHL_ASSIGN, "<<=");
				return add_token(lexer, TOKEN_SHL, "<<");
			}
			return match(lexer, '=') ? add_token(lexer, TOKEN_LESS_EQ, "<=") : add_token(lexer, TOKEN_LESS, "<");
		case '>':
			if (match(lexer, '>'))
			{
				if (match(lexer, '=')) return add_token(lexer, TOKEN_SHR_ASSIGN, ">>=");
				return add_token(lexer, TOKEN_SHR, ">>");
			}
			if (match(lexer, ']')) return add_token(lexer, TOKEN_RVEC, ">]");
			return match(lexer, '=') ? add_token(lexer, TOKEN_GREATER_EQ, ">=") : add_token(lexer, TOKEN_GREATER, ">");
		case '%':
			return match(lexer, '=') ? add_token(lexer, TOKEN_MOD_ASSIGN, "%=") : add_token(lexer, TOKEN_MOD, "%");
		case '&':
			if (match(lexer, '&')) return add_token(lexer, TOKEN_AND, "&&");
			return match(lexer, '=') ? add_token(lexer, TOKEN_BIT_AND_ASSIGN, "&=") : add_token(lexer, TOKEN_AMP, "&");
		case '|':
			if (match(lexer, '}')) return add_token(lexer, TOKEN_RBRAPIPE, "|}");
			if (match(lexer, '|')) return add_token(lexer, TOKEN_OR, "||");
			return match(lexer, '=') ? add_token(lexer, TOKEN_BIT_OR_ASSIGN, "|=") : add_token(lexer, TOKEN_BIT_OR, "|");
		case '+':
			if (match(lexer, '+')) return add_token(lexer, TOKEN_PLUSPLUS, "++");
			if (match(lexer, '=')) return add_token(lexer, TOKEN_PLUS_ASSIGN, "+=");
			return add_token(lexer, TOKEN_PLUS, "+");
		case '-':
			if (match(lexer, '>')) return add_token(lexer, TOKEN_ARROW, "->");
			if (match(lexer, '-')) return add_token(lexer, TOKEN_MINUSMINUS, "--");
			if (match(lexer, '=')) return add_token(lexer, TOKEN_MINUS_ASSIGN, "-=");
			return add_token(lexer, TOKEN_MINUS, "-");
		case 'b':
			// b64"..." or b64'...' is a base64 literal, any other 'b' starts an identifier.
			if (peek(lexer) == '6' && peek_next(lexer) == '4' && (lexer->current[2] == '\'' || lexer->current[2] == '"'))
			{
				return scan_base64(lexer);
			}
			FALLTHROUGH;
		default:
			// x"..." or x'...' is a hex literal.
			if (c == 'x' && (peek(lexer) == '"' || peek(lexer) == '\''))
			{
				return scan_hex_array(lexer);
			}
			if (is_alphanum_(c))
			{
				// The scanners expect to start at the first character.
				backtrack(lexer);
				return is_digit(c) ? scan_digit(lexer) : scan_ident(lexer, TOKEN_IDENT, TOKEN_CONST_IDENT, TOKEN_TYPE_IDENT, 0);
			}
			// Bytes outside of ASCII are reported by value. The test is made on
			// the unsigned value since plain char is unsigned on some platforms,
			// where "c < 0" would never be true.
			if ((unsigned char)c >= 0x80)
			{
				return add_error_token(lexer, "The 0x%x character may not be placed outside of a string or comment, did you perhaps forget a \" somewhere?", (uint8_t)c);
			}
			return add_error_token(lexer, "'%c' may not be placed outside of a string or comment, did you perhaps forget a \" somewhere?", c);

	}
}

// Return the file that the lexer is currently set up with.
File* lexer_current_file(Lexer *lexer)
{
	File *file = lexer->current_file;
	return file;
}

// Index of the token at _ptr, counted in TokenOld elements from lexer->memory.ptr.
// The expansion refers to a variable named `lexer`, which must be in scope where it is used.
#define tokenid(_ptr) ((unsigned)((TokenOld *)(_ptr) - ((TokenOld *)lexer->memory.ptr)))


/**
 * Check that the Unicode bidirectional control characters in a NUL terminated
 * UTF-8 text are balanced. The characters that open an embedding, override or
 * isolate (U+202A, U+202B, U+202D, U+202E, U+2066 - U+2068) must be matched by
 * a later closing U+202C or U+2069, and a closing character may not appear
 * without something to close. The two kinds of closing characters are counted
 * together, they are not matched against a specific kind of opener.
 *
 * @param text the NUL terminated text to check.
 * @return true if the markers are balanced.
 */
static bool lexer_bidi_markers_balanced(const char *text)
{
	const unsigned char *check = (const unsigned char *)text;
	unsigned c;
	int balance = 0;
	while ((c = *(check++)) != '\0')
	{
		// All markers are encoded as three bytes, starting with 0xE2.
		if (c != 0xE2) continue;
		// The third byte is only read once the second is known to be one we
		// want, so that we never read beyond the terminating NUL.
		switch (check[0])
		{
			case 0x80:
			{
				unsigned char type = check[1];
				if (type == 0xAC)
				{
					// U+202C closes an embedding or override.
					balance--;
					if (balance < 0) return false;
				}
				else if (type >= 0xAA && type <= 0xAE)
				{
					// U+202A, U+202B, U+202D and U+202E open one. The "else" matters:
					// 0xAC is inside this range and must not count as an opener too.
					balance++;
				}
				break;
			}
			case 0x81:
			{
				unsigned char type = check[1];
				if (type >= 0xA6 && type <= 0xA8)
				{
					// U+2066 - U+2068 open an isolate.
					balance++;
				}
				else if (type == 0xA9)
				{
					// U+2069 closes an isolate.
					balance--;
					if (balance < 0) return false;
				}
				break;
			}
			default:
				break;
		}
	}
	return balance == 0;
}

/**
 * Set up the lexer to read from the given file and lex the whole file.
 * If the file has unbalanced bidirectional markers, an error token is added
 * and nothing is lexed. When a token fails to scan, the rest of that line is
 * skipped and lexing goes on from the following line.
 *
 * @param lexer the lexer to initialize.
 * @param file the file to lex, its token_start_id is set by this function.
 */
void lexer_init_with_file(Lexer *lexer, File *file)
{
	file->token_start_id = (uint32_t) toktype_arena.allocated;
	lexer->current_file = file;
	lexer->file_begin = lexer->current_file->contents;
	lexer->lexing_start = lexer->file_begin;
	lexer->current = lexer->lexing_start;
	lexer->current_line = 1;
	lexer->line_start = lexer->current;
	lexer->lexer_index = file->token_start_id;

	if (!lexer_bidi_markers_balanced(lexer->current))
	{
		add_error_token(lexer, "Invalid encoding - Unbalanced bidirectional markers.");
		return;
	}
	while (1)
	{
		if (lexer_scan_token_inner(lexer, LEX_NORMAL)) continue;
		// A failure at the end of the file means that we are done.
		if (reached_end(lexer)) break;
		// Otherwise recover by skipping the rest of the line.
		while (!reached_end(lexer) && peek(lexer) != '\n') next(lexer);
		lexer->lexing_start = lexer->current;
	}

}