Lexer cleanup

This commit is contained in:
Christoffer Lerno
2020-03-13 17:11:31 +01:00
parent ebbea2ac42
commit 2e3bbf119c
10 changed files with 94 additions and 123 deletions

View File

@@ -18,8 +18,8 @@ Checks: >
-google-runtime-references,
-misc-non-private-member-variables-in-classes,
-readability-named-parameter,
-readability-magic-numbers,
-readability-braces-around-statements,
-readability-magic-numbers
# Turn all the warnings from the checks above into errors.
WarningsAsErrors: "*"

View File

@@ -21,7 +21,7 @@ static void compiler_lex(BuildTarget *target)
File *file = source_file_load(target->sources[i], &loaded);
if (loaded) continue;
Lexer lexer;
lexer_add_file_for_lexing(&lexer, file);
lexer_init_with_file(&lexer, file);
printf("# %s\n", file->full_path);
while (1)
{

View File

@@ -9,4 +9,5 @@
void compiler_init();
void compile_files(BuildTarget *target);
void build();
void symtab_init(uint32_t max_size);

View File

@@ -1003,10 +1003,9 @@ static inline bool func_has_error_return(FunctionSignature *func_sig)
Token lexer_scan_token(Lexer *lexer);
Token lexer_scan_ident_test(Lexer *lexer, const char *scan);
void lexer_test_setup(Lexer *lexer, const char *text, size_t len);
void lexer_add_file_for_lexing(Lexer *lexer, File *file);
void lexer_init_for_test(Lexer *lexer, const char *text, size_t len);
void lexer_init_with_file(Lexer *lexer, File *file);
File* lexer_current_file(Lexer *lexer);
void lexer_check_init(void);
typedef enum
@@ -1059,7 +1058,6 @@ void *stable_get(STable *table, const char *key);
void *stable_delete(STable *table, const char *key);
void stable_clear(STable *table);
void symtab_init(uint32_t max_size);
const char *symtab_add(const char *symbol, uint32_t len, uint32_t fnv1hash, TokenType *type);
void target_setup();

View File

@@ -4,11 +4,7 @@
#include "compiler_internal.h"
Token next_tok;
Token tok;
// --- Lexing general methods.
#pragma mark --- Lexing general methods.
static inline char peek(Lexer *lexer)
{
@@ -30,17 +26,11 @@ void lexer_store_line_end(Lexer *lexer)
source_file_append_line_end(lexer->current_file, lexer->current_file->start_id + lexer->current - lexer->file_begin);
}
// Look one character ahead of the current position without consuming it.
static inline char peek_next(Lexer *lexer)
{
	return *(lexer->current + 1);
}
// Look two characters ahead of the current position without consuming anything.
static inline char peek_next_next(Lexer *lexer)
{
	return *(lexer->current + 2);
}
static inline char next(Lexer *lexer)
{
return *(lexer->current++);
@@ -56,9 +46,12 @@ static inline bool reached_end(Lexer *lexer)
return *lexer->current == '\0';
}
static inline SourceLoc loc_from_ptr(Lexer *lexer, const char *start)
static inline bool match(Lexer *lexer, char expected)
{
return (SourceLoc) (lexer->current_file->start_id + (start - lexer->file_begin));
if (reached_end(lexer)) return false;
if (*lexer->current != expected) return false;
lexer->current++;
return true;
}
static inline SourceRange range_from_ptr(Lexer *lexer, const char *start, const char *end)
@@ -69,6 +62,8 @@ static inline SourceRange range_from_ptr(Lexer *lexer, const char *start, const
};
}
#pragma mark --- Token creation
static Token error_token(Lexer *lexer, const char *message, ...)
{
Token token = {
@@ -108,27 +103,16 @@ static Token make_string_token(Lexer *lexer, TokenType type, const char* string)
};
}
// Conditionally consume the next character: advances and returns true only
// when the current character equals 'expected' and the end of input has not
// been reached; otherwise leaves the position unchanged and returns false.
static inline bool match(Lexer *lexer, char expected)
{
	if (reached_end(lexer) || *lexer->current != expected) return false;
	++lexer->current;
	return true;
}
// Unconditionally consume the next character, asserting (in debug builds)
// that it is exactly 'expected' and that input has not ended.
static inline void match_assert(Lexer *lexer, char expected)
{
	assert(!reached_end(lexer));
	assert(*lexer->current == expected);
	++lexer->current;
}
#pragma mark --- Comment parsing
static inline Token parse_line_comment(Lexer *lexer)
{
// // style comment
// Skip forward to the end.
/// is a doc line comment.
TokenType comment_type = match(lexer, '/') ? TOKEN_DOC_COMMENT : TOKEN_COMMENT;
while (!reached_end(lexer) && peek(lexer) != '\n')
{
next(lexer);
@@ -207,12 +191,11 @@ static inline Token parse_multiline_comment(Lexer *lexer)
next(lexer);
}
}
/**
* Skip regular comments.
*
* @return the result of the skip (did we enter docs? did we have any errors?)
* Skip regular whitespace.
*/
void skip_whitespace(Lexer *lexer)
static void skip_whitespace(Lexer *lexer)
{
while (1)
{
@@ -235,7 +218,7 @@ void skip_whitespace(Lexer *lexer)
}
// --- Normal scanning methods start here
#pragma mark --- Identifier scanning
static inline Token scan_prefixed_ident(Lexer *lexer, TokenType type, TokenType no_ident_type, bool ends_with_bang, const char *start)
{
@@ -254,12 +237,6 @@ static inline Token scan_prefixed_ident(Lexer *lexer, TokenType type, TokenType
return make_string_token(lexer, type, interned);
}
// Advance past a run of characters accepted by is_alphanum_()
// (by its name, alphanumerics and '_' — defined elsewhere in this file).
static inline void scan_skipped_ident(Lexer *lexer)
{
	for (;;)
	{
		if (!is_alphanum_(peek(lexer))) return;
		next(lexer);
	}
}
// Parses identifiers. Note that this is a bit complicated here since
// we split identifiers into 2 types + find keywords.
@@ -315,8 +292,7 @@ static inline Token scan_ident(Lexer *lexer)
return make_string_token(lexer, type, interned_string);
}
#pragma mark ----- Number scanning
#pragma mark --- Number scanning
static Token scan_oct(Lexer *lexer)
{
@@ -327,42 +303,18 @@ static Token scan_oct(Lexer *lexer)
}
Token scan_binary(Lexer *lexer)
static Token scan_binary(Lexer *lexer)
{
char b = next(lexer); // Skip the b
next(lexer); // Skip the b
if (!is_binary(next(lexer)))
{
return error_token(lexer, "An expression starting with '0%c' would expect a sequence of zeroes and ones, "
"did you try to write a hex value but forgot the '0x'?", b);
return error_token(lexer, "An expression starting with '0b' would expect a sequence of zeroes and ones, "
"did you try to write a hex value but forgot the '0x'?");
}
while (is_binary_or_(peek(lexer))) next(lexer);
return make_token(lexer, TOKEN_INTEGER, lexer->lexing_start);
}
/*
 * Shared scanner body for numeric literals, expanded into scan_dec / scan_hex.
 * Parameters:
 *   is_num                 - predicate for a bare digit in this base
 *   is_num_with_underscore - predicate accepting digits and '_' separators
 *   exp / EXP              - lower/upper-case exponent marker ('e'/'E' or 'p'/'P')
 * The expansion consumes: the integer digits (with '_' allowed between them),
 * an optional fractional part after '.', and an optional signed exponent.
 * It ends with a return statement, producing a TOKEN_FLOAT or TOKEN_INTEGER
 * token starting at lexer->lexing_start, or an error token for:
 *   - '_' immediately after the decimal point,
 *   - a non-digit where the exponent value should start,
 *   - a trailing '_' after the last digit (checked via prev(lexer), which
 *     presumably yields the last consumed character — defined elsewhere).
 */
#define PARSE_SPECIAL_NUMBER(is_num, is_num_with_underscore, exp, EXP) \
while (is_num_with_underscore(peek(lexer))) next(lexer); \
bool is_float = false; \
if (peek(lexer) == '.') \
{ \
is_float = true; \
next(lexer); \
char c = peek(lexer); \
if (c == '_') return error_token(lexer, "Can't parse this as a floating point value due to the '_' directly after decimal point."); \
if (is_num(c)) next(lexer); \
while (is_num_with_underscore(peek(lexer))) next(lexer); \
} \
char c = peek(lexer); \
if (c == (exp) || c == (EXP)) \
{ \
is_float = true; \
next(lexer); \
char c2 = next(lexer); \
if (c2 == '+' || c2 == '-') c2 = next(lexer); \
if (!is_num(c2)) return error_token(lexer, "Parsing the floating point exponent failed, because '%c' is not a number.", c2); \
while (is_num(peek(lexer))) next(lexer); \
} \
if (prev(lexer) == '_') return error_token(lexer, "The number ended with '_', but that character needs to be between, not after, digits."); \
return make_token(lexer, is_float ? TOKEN_FLOAT : TOKEN_INTEGER, lexer->lexing_start)
static inline Token scan_hex(Lexer *lexer)
{
@@ -372,15 +324,58 @@ static inline Token scan_hex(Lexer *lexer)
return error_token(lexer, "'0%c' starts a hexadecimal number, "
"but it was followed by '%c' which is not part of a hexadecimal number.", x, prev(lexer));
}
PARSE_SPECIAL_NUMBER(is_hex, is_hex_or_, 'p', 'P');
while (is_hex_or_(peek(lexer))) next(lexer);
bool is_float = false;
if (peek(lexer) == '.')
{
is_float = true;
next(lexer);
char c = peek(lexer);
if (c == '_') return error_token(lexer, "Can't parse this as a floating point value due to the '_' directly after decimal point.");
if (is_hex(c)) next(lexer);
while (is_hex_or_(peek(lexer))) next(lexer);
}
char c = peek(lexer);
if (c == 'p' || c == 'P')
{
is_float = true;
next(lexer);
char c2 = next(lexer);
if (c2 == '+' || c2 == '-') c2 = next(lexer);
if (!is_hex(c2)) return error_token(lexer, "Parsing the floating point exponent failed, because '%c' is not a number.", c2);
while (is_hex(peek(lexer))) next(lexer);
}
if (prev(lexer) == '_') return error_token(lexer, "The number ended with '_', but that character needs to be between, not after, digits.");
return make_token(lexer, is_float ? TOKEN_FLOAT : TOKEN_INTEGER, lexer->lexing_start);
}
static inline Token scan_dec(Lexer *lexer)
{
PARSE_SPECIAL_NUMBER(is_digit, is_digit_or_, 'e', 'E');
while (is_digit_or_(peek(lexer))) next(lexer);
bool is_float = false;
if (peek(lexer) == '.')
{
is_float = true;
next(lexer);
char c = peek(lexer);
if (c == '_') return error_token(lexer, "Can't parse this as a floating point value due to the '_' directly after decimal point.");
if (is_digit(c)) next(lexer);
while (is_digit_or_(peek(lexer))) next(lexer);
}
char c = peek(lexer);
if (c == 'e' || c == 'E')
{
is_float = true;
next(lexer);
char c2 = next(lexer);
if (c2 == '+' || c2 == '-') c2 = next(lexer);
if (!is_digit(c2)) return error_token(lexer, "Parsing the floating point exponent failed, because '%c' is not a number.", c2);
while (is_digit(peek(lexer))) next(lexer);
}
if (prev(lexer) == '_') return error_token(lexer, "The number ended with '_', but that character needs to be between, not after, digits.");
return make_token(lexer, is_float ? TOKEN_FLOAT : TOKEN_INTEGER, lexer->lexing_start);
}
#undef PARSE_SPECIAL_NUMBER
static inline Token scan_digit(Lexer *lexer)
{
@@ -407,8 +402,7 @@ static inline Token scan_digit(Lexer *lexer)
return scan_dec(lexer);
}
#pragma mark -----
#pragma mark --- Character & string scan
static inline Token scan_char(Lexer *lexer)
{
@@ -467,25 +461,7 @@ static inline Token scan_string(Lexer *lexer)
return make_token(lexer, TOKEN_STRING, lexer->lexing_start);
}
// Skip horizontal whitespace (space, tab, CR, form feed) without consuming
// newlines or any other character — stops at the first non-listed character.
static inline void skip_docs_whitespace(Lexer *lexer)
{
	for (;;)
	{
		char c = peek(lexer);
		if (c != ' ' && c != '\t' && c != '\r' && c != '\f') return;
		next(lexer);
	}
}
#pragma mark --- Lexer public functions
Token lexer_scan_token(Lexer *lexer)
{
@@ -597,27 +573,18 @@ File* lexer_current_file(Lexer *lexer)
return lexer->current_file;
}
void lexer_check_init()
void lexer_init_with_file(Lexer *lexer, File *file)
{
static bool symtab_has_init = false;
if (symtab_has_init) return;
symtab_has_init = true;
symtab_init(build_options.symtab_size);
}
void lexer_add_file_for_lexing(Lexer *lexer, File *file)
{
lexer_check_init();
lexer->current_file = file;
lexer->file_begin = lexer->current_file->contents;
lexer->lexing_start = lexer->file_begin;
lexer->current = lexer->lexing_start;
}
void lexer_test_setup(Lexer *lexer, const char *text, size_t len)
#pragma mark --- Test methods
void lexer_init_for_test(Lexer *lexer, const char *text, size_t len)
{
lexer_check_init();
static File helper;
lexer->lexing_start = text;
lexer->current = text;
@@ -629,8 +596,6 @@ void lexer_test_setup(Lexer *lexer, const char *text, size_t len)
lexer->current_file->name = "Test";
}
Token lexer_scan_ident_test(Lexer *lexer, const char *scan)
{
static File helper;

View File

@@ -2847,7 +2847,7 @@ void parse_current(Context *context)
void parse_file(Context *context)
{
lexer_add_file_for_lexing(&context->lexer, context->file);
lexer_init_with_file(&context->lexer, context->file);
parse_current(context);
}

View File

@@ -22,7 +22,6 @@ static void test_lexer(void)
const char* tokens[TOKEN_EOF];
int len[TOKEN_EOF];
Lexer lexer;
lexer_check_init();
for (int i = 1; i < TOKEN_EOF; i++)
{
const char* token = token_type_to_string((TokenType)i);
@@ -78,7 +77,7 @@ static void test_lexer(void)
size_t test_len = strlen(test_parse);
for (int b = 0; b < BENCH_REPEATS; b++)
{
lexer_test_setup(&lexer, test_parse, test_len);
lexer_init_for_test(&lexer, test_parse, test_len);
Token token;
while (1)
{

View File

@@ -7,8 +7,15 @@
int main(int argc, const char *argv[])
{
init_arena();
// First setup memory
memory_init();
// Parse arguments.
parse_arguments(argc, argv);
// Now we set up the symtab.
symtab_init(build_options.symtab_size);
switch (build_options.command)
{
case COMMAND_INIT:

View File

@@ -13,7 +13,7 @@ void path_get_dir_and_filename_from_full(const char *full_path, char **filename,
void file_find_top_dir();
void file_add_wildcard_files(const char ***files, const char *path, bool recursive);
void init_arena(void);
void memory_init(void);
void *malloc_arena(unsigned long mem);
void free_arena(void);
@@ -74,6 +74,7 @@ static inline bool is_binary(char c)
return c == '0' || c == '1';
}
static inline bool is_binary_or_(char c)
{
switch (c)

View File

@@ -16,7 +16,7 @@ static size_t arena_buckets_array_size;
static size_t current_use;
static void *current_arena;
static int allocations_done;
void init_arena(void)
void memory_init(void)
{
arena_buckets = malloc(STARTING_ARENA_BUCKETS * sizeof(void *));
arena_buckets_used = 1;
@@ -91,9 +91,9 @@ void run_arena_allocator_tests(void)
{
printf("Begin arena allocator testing.\n");
bool was_init = arena_buckets != NULL;
if (!was_init) init_arena();
if (!was_init) memory_init();
free_arena();
init_arena();
memory_init();
ASSERT(malloc_arena(10) != malloc_arena(10), "Expected different values...");
printf("-- Tested basic allocation - OK.\n");
ASSERT(current_use == 32, "Expected allocations rounded to next 16 bytes");
@@ -122,5 +122,5 @@ void run_arena_allocator_tests(void)
free_arena();
ASSERT(arena_buckets_array_size == 0, "Arena not freed?");
printf("-- Test freeing arena - OK.\n");
if (was_init) init_arena();
if (was_init) memory_init();
}