Add a little testing and keyword parsing x2

2026-02-27 03:51:18 +00:00 · 2019-07-25 18:57:35 +02:00
parent e229d19b7c
commit 7439dccc53
11 changed files with 1129 additions and 2 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,6 @@
 cmake_minimum_required(VERSION 3.13)
 project(c3c C)
+set(CMAKE_C_FLAGS_RELEASE "-O3") # project() enables only C, so the C (not CXX) release flags are the ones consulted

 set(CMAKE_C_STANDARD 11)

@@ -14,4 +15,4 @@ add_executable(c3c
        src/utils/errors.c
        src/utils/file_utils.c
        src/utils/string_utils.c
-        )
+        src/compiler/lexer.c src/compiler/lexer.h src/compiler_tests/tests.c src/compiler_tests/tests.h src/compiler_tests/benchmark.c src/compiler_tests/benchmark.h)
--- a/src/build/build_options.c
+++ b/src/build/build_options.c
@@ -128,6 +128,11 @@ static void parse_command(void)
 		build_options.project_name = next_arg();
 		return;
 	}
+	if (arg_match("utest"))
+	{
+		build_options.command = COMMAND_UNIT_TEST;
+		return;
+	}
 	if (arg_match("compile"))
 	{
 		build_options.command = COMMAND_COMPILE;
--- a/src/build/build_options.h
+++ b/src/build/build_options.h
@@ -20,6 +20,7 @@ typedef enum
 	COMMAND_DIST,
 	COMMAND_DOCS,
 	COMMAND_BENCH,
+	COMMAND_UNIT_TEST,
 } CompilerCommand;

 typedef struct
--- a/src/compiler/lexer.c
+++ b/src/compiler/lexer.c
@@ -0,0 +1,806 @@
+// Copyright (c) 2019 Christoffer Lerno. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+#include "lexer.h"
+#include "../utils/string_utils.h"
+#include <string.h>
+#include "../utils/errors.h"
+
+typedef struct // Lexer cursor state; no function in this file reads or writes these fields yet.
+{
+	const char *begin; // NOTE(review): presumably the first byte of the source buffer — confirm once the scanner lands
+	const char *start; // NOTE(review): presumably the first byte of the token being scanned — confirm
+	const char *current; // NOTE(review): presumably the next unread byte — confirm
+	uint16_t source_file; // NOTE(review): looks like an id/index of the file being lexed — confirm
+/*	LexerState lexer_state;
+	File *current_file;
+	Token saved_tok;
+	Token saved_prev_tok;
+	SourceLoc last_in_range;*/
+} Lexer;
+
+
+Lexer lexer; // Single global instance with external linkage; not referenced anywhere else in this file.
+
+// Both macros expand where `start` and `len` are in scope. MATCH_KEYWORD_LEN first requires len == strlen(_keyword).
+#define MATCH_KEYWORD_LEN(_keyword, _type) \
+  ((sizeof(_keyword) != len + 1) ? TOKEN_VAR_IDENT : check_keyword(start, len, _keyword, _type))
+// MATCH_KEYWORD skips the length test: the call site must already know that len equals the keyword's length.
+#define MATCH_KEYWORD(_keyword, _type) check_keyword(start, len, _keyword, _type)
+
+// Yes, this is an ugly hand-written keyword identifier. It should be benchmarked against
+// a table-based state machine.
+
+static inline TokenType check_keyword(const char * restrict start, size_t len, const char * restrict keyword, TokenType type)
+{
+	// Byte 0 is not re-tested here: call sites pick the candidate keyword from start[0]
+	// (directly or through the hash). Only bytes 1..len-1 are compared, so the caller
+	// must pass len >= 1 and a keyword that holds at least len bytes.
+	const int differs = memcmp(start + 1, keyword + 1, len - 1);
+	return differs ? TOKEN_VAR_IDENT : type;
+}
+// Classifies an identifier that begins with "c_" as one of the C-interop type keywords, or returns
+// TOKEN_VAR_IDENT when it is not one. Dispatches on start[3], then verifies the length and bytes 1..len-1.
+// Only the first `len` bytes of `start` are read, so the identifier need not be NUL-terminated.
+static inline TokenType c_ident(const char *restrict start, const int len)
+{
+	if (len < 5) return TOKEN_VAR_IDENT; // shortest keyword is "c_int"; also keeps start[3] inside the identifier
+	switch (start[3])
+	{
+		case 'n':
+			return MATCH_KEYWORD_LEN("c_int", TOKEN_C_INT);
+		case 'i':
+			return MATCH_KEYWORD_LEN("c_uint", TOKEN_C_UINT);
+		case 's':
+			return MATCH_KEYWORD_LEN("c_ushort", TOKEN_C_USHORT);
+		case 'h':
+			return MATCH_KEYWORD_LEN("c_short", TOKEN_C_SHORT);
+		case 'o':
+			switch (len)
+			{
+				case 10:
+					return MATCH_KEYWORD("c_longlong", TOKEN_C_LONGLONG);
+				case 6:
+					return MATCH_KEYWORD("c_long", TOKEN_C_LONG);
+				case 12:
+					return MATCH_KEYWORD("c_longdouble", TOKEN_C_LONGDOUBLE);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'l':
+			if (len == 11) return MATCH_KEYWORD("c_ulonglong", TOKEN_C_ULONGLONG);
+			return MATCH_KEYWORD_LEN("c_ulong", TOKEN_C_ULONG);
+		default:
+			return TOKEN_VAR_IDENT;
+	}
+}
+static inline TokenType ident_type(const char *restrict start, const int len) // Keyword lookup for the identifier (start, len) via a hand-written decision tree; TOKEN_VAR_IDENT when it is not a keyword.
+{
+	char current_value = start[0];
+	if (len < 2) return TOKEN_VAR_IDENT; // no keyword is a single character
+	if (current_value == 'c' && start[1] == '_') return c_ident(start, len); // "c_" names are resolved by c_ident
+	if (len > 8 || !is_lower(current_value)) return TOKEN_VAR_IDENT; // the longest keywords handled below ("continue", "volatile") are 8 bytes
+	switch (current_value)
+	{
+		case 'a':
+			if (len == 2) return MATCH_KEYWORD("as", TOKEN_AS);
+			switch (start[1])
+			{
+				case 's':
+					return MATCH_KEYWORD_LEN("asm", TOKEN_ASM);
+				case 'l':
+					return MATCH_KEYWORD_LEN("alias", TOKEN_ALIAS);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'b':
+			switch (start[1])
+			{
+				case 'o':
+					return MATCH_KEYWORD_LEN("bool", TOKEN_BOOL);
+				case 'y':
+					return MATCH_KEYWORD_LEN("byte", TOKEN_BYTE);
+				case 'r':
+					return MATCH_KEYWORD_LEN("break", TOKEN_BREAK);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'c':
+			if (len < 4) return TOKEN_VAR_IDENT; // guards the start[3] read below
+			if (len == 8) return MATCH_KEYWORD_LEN("continue", TOKEN_CONTINUE);
+			switch (start[3])
+			{
+				case 't':
+					return MATCH_KEYWORD_LEN("cast", TOKEN_CAST);
+				case 'e':
+					return MATCH_KEYWORD_LEN("case", TOKEN_CASE);
+				case 'r':
+					return MATCH_KEYWORD_LEN("char", TOKEN_CHAR);
+				case 's':
+					return MATCH_KEYWORD_LEN("const", TOKEN_CONST);
+				case 'c':
+					return MATCH_KEYWORD_LEN("catch", TOKEN_CATCH);
+				default:
+					return TOKEN_VAR_IDENT;
+
+			}
+		case 'd':
+			if (len < 5) return MATCH_KEYWORD_LEN("do", TOKEN_DO); // also guards the start[3] read below
+			switch (start[3])
+			{
+				case 'e':
+					return MATCH_KEYWORD_LEN("defer", TOKEN_DEFER);
+				case 'a':
+					return MATCH_KEYWORD_LEN("default", TOKEN_DEFAULT);
+				case 'b':
+					return MATCH_KEYWORD_LEN("double", TOKEN_DOUBLE);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'e':
+			switch (start[1])
+			{
+				case 'l':
+					return MATCH_KEYWORD_LEN("else", TOKEN_ELSE);
+				case 'n':
+					return MATCH_KEYWORD_LEN("enum", TOKEN_ENUM);
+				case 'r':
+					return MATCH_KEYWORD_LEN("error", TOKEN_ERROR);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'f':
+			switch (start[2]) // NOTE(review): for len == 2 this reads one byte past the identifier — assumes the buffer extends that far (e.g. a NUL terminator); confirm
+			{
+				case '6':
+					return MATCH_KEYWORD_LEN("f16", TOKEN_F16);
+				case '2':
+					if (len == 4) return MATCH_KEYWORD_LEN("f128", TOKEN_F128);
+					return MATCH_KEYWORD_LEN("f32", TOKEN_F32);
+				case '4':
+					return MATCH_KEYWORD_LEN("f64", TOKEN_F64);
+				case '5':
+					return MATCH_KEYWORD_LEN("f256", TOKEN_F256);
+				case 'r':
+					return MATCH_KEYWORD_LEN("for", TOKEN_FOR);
+				case 'l':
+					return MATCH_KEYWORD_LEN("false", TOKEN_FALSE);
+				case 'o':
+					return MATCH_KEYWORD_LEN("float", TOKEN_FLOAT);
+				case 'n':
+					return MATCH_KEYWORD_LEN("func", TOKEN_FUNC);
+
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'g':
+			switch (start[1])
+			{
+				case 'o':
+					return MATCH_KEYWORD_LEN("goto", TOKEN_GOTO);
+				case 'e':
+					return MATCH_KEYWORD_LEN("generic", TOKEN_GENERIC);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'h':
+			return MATCH_KEYWORD_LEN("half", TOKEN_HALF);
+		case 'i':
+			switch (start[1])
+			{
+				case 'f':
+					return MATCH_KEYWORD_LEN("if", TOKEN_IF);
+				case '8':
+					return MATCH_KEYWORD_LEN("i8", TOKEN_I8);
+				case '6':
+					return MATCH_KEYWORD_LEN("i64", TOKEN_I64);
+				case '2':
+					return MATCH_KEYWORD_LEN("i256", TOKEN_I256);
+				case '3':
+					return MATCH_KEYWORD_LEN("i32", TOKEN_I32);
+				case '1':
+					if (len == 4) return MATCH_KEYWORD_LEN("i128", TOKEN_I128);
+					return MATCH_KEYWORD_LEN("i16", TOKEN_I16);
+				case 'n':
+					return MATCH_KEYWORD_LEN("int", TOKEN_INT);
+				case 'm':
+					return MATCH_KEYWORD_LEN("import", TOKEN_IMPORT);
+				case 's':
+					return MATCH_KEYWORD_LEN("isize", TOKEN_ISIZE);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'l':
+			if (len < 4) return TOKEN_VAR_IDENT;
+			switch (start[2])
+			{
+				case 'n':
+					return MATCH_KEYWORD_LEN("long", TOKEN_LONG);
+				case 'c':
+					return MATCH_KEYWORD_LEN("local", TOKEN_LOCAL);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'm':
+			switch (start[1])
+			{
+				case 'a':
+					return MATCH_KEYWORD_LEN("macro", TOKEN_MACRO);
+				case 'o':
+					return MATCH_KEYWORD_LEN("module", TOKEN_MODULE);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'n':
+			return MATCH_KEYWORD_LEN("nil", TOKEN_NIL);
+		case 'p':
+			return MATCH_KEYWORD_LEN("public", TOKEN_PUBLIC);
+		case 'q':
+			return MATCH_KEYWORD_LEN("quad", TOKEN_QUAD);
+		case 'r':
+			return MATCH_KEYWORD_LEN("return", TOKEN_RETURN);
+		case 's':
+			switch (start[1])
+			{
+				case 'h':
+					return MATCH_KEYWORD_LEN("short", TOKEN_SHORT);
+				case 't':
+					return MATCH_KEYWORD_LEN("struct", TOKEN_STRUCT);
+				case 'w':
+					return MATCH_KEYWORD_LEN("switch", TOKEN_SWITCH);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 't':
+			if (len < 3) return TOKEN_VAR_IDENT; // guards the start[2] read below
+			switch (start[2])
+			{
+				case 'p':
+					if (len == 7) return MATCH_KEYWORD_LEN("typedef", TOKEN_TYPEDEF);
+					return MATCH_KEYWORD_LEN("type", TOKEN_TYPE);
+				case 'u':
+					return MATCH_KEYWORD_LEN("true", TOKEN_TRUE);
+				case 'y':
+					return MATCH_KEYWORD_LEN("try", TOKEN_TRY);
+				case 'r':
+					if (len == 6) return MATCH_KEYWORD_LEN("throws", TOKEN_THROWS);
+					return MATCH_KEYWORD_LEN("throw", TOKEN_THROW);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'u':
+			if (len < 3) return MATCH_KEYWORD_LEN("u8", TOKEN_U8);
+			switch (start[1])
+			{
+				case '1':
+					if (len == 4) return MATCH_KEYWORD("u128", TOKEN_U128);
+					return MATCH_KEYWORD_LEN("u16", TOKEN_U16);
+				case '2':
+					return MATCH_KEYWORD_LEN("u256", TOKEN_U256);
+				case '3':
+					return MATCH_KEYWORD_LEN("u32", TOKEN_U32);
+				case '6':
+					return MATCH_KEYWORD_LEN("u64", TOKEN_U64);
+				case 'i':
+					return MATCH_KEYWORD_LEN("uint", TOKEN_UINT);
+				case 'n':
+					if (start[2] == 't') return MATCH_KEYWORD_LEN("until", TOKEN_UNTIL);
+					return MATCH_KEYWORD_LEN("union", TOKEN_UNION);
+				case 'l':
+					return MATCH_KEYWORD_LEN("ulong", TOKEN_ULONG);
+				case 's':
+					if (len == 5) return MATCH_KEYWORD("usize", TOKEN_USIZE);
+					return MATCH_KEYWORD_LEN("ushort", TOKEN_USHORT);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'v':
+			if (len < 3) return TOKEN_VAR_IDENT;
+			switch (start[2])
+			{
+				case 'r':
+					return MATCH_KEYWORD_LEN("var", TOKEN_VAR);
+				case 'i':
+					return MATCH_KEYWORD_LEN("void", TOKEN_VOID);
+				case 'l':
+					return MATCH_KEYWORD_LEN("volatile", TOKEN_VOLATILE);
+				default:
+					return TOKEN_VAR_IDENT;
+			}
+		case 'w':
+			return MATCH_KEYWORD_LEN("while", TOKEN_WHILE);
+		default:
+			return TOKEN_VAR_IDENT;
+	}
+}
+// Keyword hash: mixes the first character, the length and the second character, keeping the low 9 bits (0..511) as a switch label.
+#define PRIME 0x01000193 // the 32-bit FNV prime
+#define SEED 0x811C9DC5 // the 32-bit FNV offset basis
+
+#define FNV1(a, seed) ((uint32_t)((((unsigned int)(a)) ^ (seed)) * PRIME)) // XOR first, then multiply (the FNV-1a step order, despite the name)
+#define HASH(a, b, c) (FNV1(c, FNV1((a), FNV1(b, SEED))) & 0x1FFu) // evaluation order is b, then a, then c; call sites pass (len, first char, second char)
+// Alternative keyword lookup: one switch on HASH(len, first char, second char); every candidate is still verified by length and memcmp.
+TokenType ident_type_fnv1(const char *restrict start, int len)
+{
+	char current_value = start[0];
+	if (len < 2) return TOKEN_VAR_IDENT;
+	char second = start[1];
+	if (current_value == 'c' && second == '_') return c_ident(start, len);
+	if (len > 8 || !is_lower(current_value)) return TOKEN_VAR_IDENT;
+	switch (HASH(len, current_value, second)) // case labels must be distinct, so two keyword triples sharing a 9-bit hash would fail to compile
+	{
+		case HASH(2, 'a', 's'):
+			return MATCH_KEYWORD_LEN("as", TOKEN_AS);
+		case HASH(3, 'a', 's'):
+			return MATCH_KEYWORD_LEN("asm", TOKEN_ASM);
+		case HASH(5, 'a', 'l'):
+			return MATCH_KEYWORD_LEN("alias", TOKEN_ALIAS);
+		case HASH(4, 'b', 'o'):
+			return MATCH_KEYWORD_LEN("bool", TOKEN_BOOL);
+		case HASH(4, 'b', 'y'):
+			return MATCH_KEYWORD_LEN("byte", TOKEN_BYTE);
+		case HASH(5, 'b', 'r'):
+			return MATCH_KEYWORD_LEN("break", TOKEN_BREAK);
+		case HASH(8, 'c', 'o'):
+			return MATCH_KEYWORD_LEN("continue", TOKEN_CONTINUE);
+		case HASH(4, 'c', 'a'): // "cast" and "case" share (len, first, second); start[3] tells them apart
+			return len > 3 && start[3] == 't' ? MATCH_KEYWORD_LEN("cast", TOKEN_CAST) : MATCH_KEYWORD_LEN("case", TOKEN_CASE);
+		case HASH(5, 'c', '_'): // the four ('c', '_') cases cannot be reached: the c_ident early return above already handled every "c_" identifier
+			return MATCH_KEYWORD_LEN("c_int", TOKEN_C_INT);
+		case HASH(6, 'c', '_'):
+			return len > 3 && start[2] == 'u' ? MATCH_KEYWORD_LEN("c_uint", TOKEN_C_UINT) : MATCH_KEYWORD_LEN("c_long", TOKEN_C_LONG);
+		case HASH(7, 'c', '_'):
+			return MATCH_KEYWORD_LEN("c_short", TOKEN_C_SHORT);
+		case HASH(8, 'c', '_'):
+			return MATCH_KEYWORD_LEN("c_ushort", TOKEN_C_USHORT);
+		case HASH(4, 'c', 'h'):
+			return MATCH_KEYWORD_LEN("char", TOKEN_CHAR);
+		case HASH(5, 'c', 'o'):
+			return MATCH_KEYWORD_LEN("const", TOKEN_CONST);
+		case HASH(5, 'c', 'a'):
+			return MATCH_KEYWORD_LEN("catch", TOKEN_CATCH);
+		case HASH(2, 'd', 'o'):
+			return MATCH_KEYWORD_LEN("do", TOKEN_DO);
+		case HASH(5, 'd', 'e'):
+			return MATCH_KEYWORD_LEN("defer", TOKEN_DEFER);
+		case HASH(7, 'd', 'e'):
+			return MATCH_KEYWORD_LEN("default", TOKEN_DEFAULT);
+		case HASH(6, 'd', 'o'):
+			return MATCH_KEYWORD_LEN("double", TOKEN_DOUBLE);
+		case HASH(4, 'e', 'l'):
+			return MATCH_KEYWORD_LEN("else", TOKEN_ELSE);
+		case HASH(4, 'e', 'n'):
+			return MATCH_KEYWORD_LEN("enum", TOKEN_ENUM);
+		case HASH(5, 'e', 'r'):
+			return MATCH_KEYWORD_LEN("error", TOKEN_ERROR);
+		case HASH(3, 'f', '1'):
+			return MATCH_KEYWORD_LEN("f16", TOKEN_F16);
+		case HASH(4, 'f', '1'):
+			return MATCH_KEYWORD_LEN("f128", TOKEN_F128);
+		case HASH(3, 'f', '3'):
+			return MATCH_KEYWORD_LEN("f32", TOKEN_F32);
+		case HASH(3, 'f', '6'):
+			return MATCH_KEYWORD_LEN("f64", TOKEN_F64);
+		case HASH(4, 'f', '2'):
+			return MATCH_KEYWORD_LEN("f256", TOKEN_F256);
+		case HASH(3, 'f', 'o'):
+			return MATCH_KEYWORD_LEN("for", TOKEN_FOR);
+		case HASH(5, 'f', 'a'):
+			return MATCH_KEYWORD_LEN("false", TOKEN_FALSE);
+		case HASH(5, 'f', 'l'):
+			return MATCH_KEYWORD_LEN("float", TOKEN_FLOAT);
+		case HASH(4, 'f', 'u'):
+			return MATCH_KEYWORD_LEN("func", TOKEN_FUNC);
+		case HASH(4, 'g', 'o'):
+			return MATCH_KEYWORD_LEN("goto", TOKEN_GOTO);
+		case HASH(7, 'g', 'e'):
+			return MATCH_KEYWORD_LEN("generic", TOKEN_GENERIC);
+		case HASH(4, 'h', 'a'):
+			return MATCH_KEYWORD_LEN("half", TOKEN_HALF);
+		case HASH(2, 'i', 'f'):
+			return MATCH_KEYWORD_LEN("if", TOKEN_IF);
+		case HASH(2, 'i', '8'):
+			return MATCH_KEYWORD_LEN("i8", TOKEN_I8);
+		case HASH(3, 'i', '6'):
+			return MATCH_KEYWORD_LEN("i64", TOKEN_I64);
+		case HASH(4, 'i', '2'):
+			return MATCH_KEYWORD_LEN("i256", TOKEN_I256);
+		case HASH(3, 'i', '3'):
+			return MATCH_KEYWORD_LEN("i32", TOKEN_I32);
+		case HASH(4, 'i', '1'):
+			return MATCH_KEYWORD_LEN("i128", TOKEN_I128);
+		case HASH(3, 'i', '1'):
+			return MATCH_KEYWORD_LEN("i16", TOKEN_I16);
+		case HASH(3, 'i', 'n'):
+			return MATCH_KEYWORD_LEN("int", TOKEN_INT);
+		case HASH(6, 'i', 'm'):
+			return MATCH_KEYWORD_LEN("import", TOKEN_IMPORT);
+		case HASH(5, 'i', 's'):
+			return MATCH_KEYWORD_LEN("isize", TOKEN_ISIZE);
+		case HASH(4, 'l', 'o'):
+			return MATCH_KEYWORD_LEN("long", TOKEN_LONG);
+		case HASH(5, 'l', 'o'):
+			return MATCH_KEYWORD_LEN("local", TOKEN_LOCAL);
+		case HASH(5, 'm', 'a'):
+			return MATCH_KEYWORD_LEN("macro", TOKEN_MACRO);
+		case HASH(6, 'm', 'o'):
+			return MATCH_KEYWORD_LEN("module", TOKEN_MODULE);
+		case HASH(3, 'n', 'i'):
+			return MATCH_KEYWORD_LEN("nil", TOKEN_NIL);
+		case HASH(6, 'p', 'u'):
+			return MATCH_KEYWORD_LEN("public", TOKEN_PUBLIC);
+		case HASH(4, 'q', 'u'):
+			return MATCH_KEYWORD_LEN("quad", TOKEN_QUAD);
+		case HASH(6, 'r', 'e'):
+			return MATCH_KEYWORD_LEN("return", TOKEN_RETURN);
+		case HASH(5, 's', 'h'):
+			return MATCH_KEYWORD_LEN("short", TOKEN_SHORT);
+		case HASH(6, 's', 't'):
+			return MATCH_KEYWORD_LEN("struct", TOKEN_STRUCT);
+		case HASH(6, 's', 'w'):
+			return MATCH_KEYWORD_LEN("switch", TOKEN_SWITCH);
+		case HASH(7, 't', 'y'):
+			return MATCH_KEYWORD_LEN("typedef", TOKEN_TYPEDEF);
+		case HASH(4, 't', 'y'):
+			return MATCH_KEYWORD_LEN("type", TOKEN_TYPE);
+		case HASH(4, 't', 'r'):
+			return MATCH_KEYWORD_LEN("true", TOKEN_TRUE);
+		case HASH(3, 't', 'r'):
+			return MATCH_KEYWORD_LEN("try", TOKEN_TRY);
+		case HASH(6, 't', 'h'):
+			return MATCH_KEYWORD_LEN("throws", TOKEN_THROWS);
+		case HASH(5, 't', 'h'):
+			return MATCH_KEYWORD_LEN("throw", TOKEN_THROW);
+		case HASH(2, 'u', '8'):
+			return MATCH_KEYWORD_LEN("u8", TOKEN_U8);
+		case HASH(4, 'u', '1'):
+			return MATCH_KEYWORD_LEN("u128", TOKEN_U128);
+		case HASH(3, 'u', '1'):
+			return MATCH_KEYWORD_LEN("u16", TOKEN_U16);
+		case HASH(4, 'u', '2'):
+			return MATCH_KEYWORD_LEN("u256", TOKEN_U256);
+		case HASH(3, 'u', '3'):
+			return MATCH_KEYWORD_LEN("u32", TOKEN_U32);
+		case HASH(3, 'u', '6'):
+			return MATCH_KEYWORD_LEN("u64", TOKEN_U64);
+		case HASH(4, 'u', 'i'):
+			return MATCH_KEYWORD_LEN("uint", TOKEN_UINT);
+		case HASH(5, 'u', 'n'): // "until" and "union" share (len, first, second); start[2] tells them apart
+			if (start[2] == 't') return MATCH_KEYWORD_LEN("until", TOKEN_UNTIL);
+			return MATCH_KEYWORD_LEN("union", TOKEN_UNION);
+		case HASH(5, 'u', 'l'):
+			return MATCH_KEYWORD_LEN("ulong", TOKEN_ULONG);
+		case HASH(5, 'u', 's'):
+			return MATCH_KEYWORD_LEN("usize", TOKEN_USIZE);
+		case HASH(6, 'u', 's'):
+			return MATCH_KEYWORD_LEN("ushort", TOKEN_USHORT);
+		case HASH(3, 'v', 'a'):
+			return MATCH_KEYWORD_LEN("var", TOKEN_VAR);
+		case HASH(4, 'v', 'o'):
+			return MATCH_KEYWORD_LEN("void", TOKEN_VOID);
+		case HASH(8, 'v', 'o'):
+			return MATCH_KEYWORD_LEN("volatile", TOKEN_VOLATILE);
+		case HASH(5, 'w', 'h'):
+			return MATCH_KEYWORD_LEN("while", TOKEN_WHILE);
+		default:
+			return TOKEN_VAR_IDENT;
+	}
+}
+
+// NOTE(review): of the names below only HASH and MATCH_KEYWORD are defined in this file; the other #undefs are no-ops here.
+#undef HASH4V
+#undef HASH4
+#undef HASH3V
+#undef HASH3
+#undef HASH2V
+#undef HASH2
+#undef HASH
+
+#undef MATCH_KEYWORD
+// Public keyword-lookup entry point (declared in lexer.h); forwards to the switch-based ident_type.
+TokenType identifier_type(const char* restrict start, int len)
+{
+	return ident_type(start, len);
+}
+// Returns a string literal for a token type: its source spelling, or a bracketed placeholder such as "<int>" for token classes.
+const char *token_type_to_string(TokenType type)
+{
+	switch (type)
+	{
+		case TOKEN_LPAREN:
+			return "(";
+		case TOKEN_RPAREN:
+			return ")";
+		case TOKEN_LBRACE:
+			return "{";
+		case TOKEN_RBRACE:
+			return "}";
+		case TOKEN_LBRACKET:
+			return "[";
+		case TOKEN_RBRACKET:
+			return "]";
+		case TOKEN_COMMA:
+			return ",";
+		case TOKEN_DOT:
+			return ".";
+		case TOKEN_EOS:
+			return ";";
+		case TOKEN_PLUS:
+			return "+";
+		case TOKEN_PLUSPLUS:
+			return "++";
+		case TOKEN_PLUS_ASSIGN:
+			return "+=";
+		case TOKEN_BIT_NOT:
+			return "~";
+		case TOKEN_NOT:
+			return "!";
+		case TOKEN_MINUS:
+			return "-";
+		case TOKEN_MINUSMINUS:
+			return "--";
+		case TOKEN_MINUS_ASSIGN:
+			return "-=";
+		case TOKEN_STAR:
+			return "*";
+		case TOKEN_MULT_ASSIGN:
+			return "*=";
+		case TOKEN_MOD:
+			return "%";
+		case TOKEN_MOD_ASSIGN:
+			return "%=";
+		case TOKEN_DIV:
+			return "/";
+		case TOKEN_DIV_ASSIGN:
+			return "/=";
+		case TOKEN_NOT_EQUAL:
+			return "!=";
+		case TOKEN_EQ:
+			return "=";
+		case TOKEN_EQEQ:
+			return "==";
+		case TOKEN_COLON:
+			return ":";
+		case TOKEN_COLCOLON:
+			return "::";
+		case TOKEN_DOTDOT:
+			return "..";
+		case TOKEN_ELIPSIS:
+			return "...";
+		case TOKEN_GREATER:
+			return ">";
+		case TOKEN_GREATER_EQ:
+			return ">=";
+		case TOKEN_RIGHT_SHIFT:
+			return ">>";
+		case TOKEN_RIGHT_SHIFT_ASSIGN:
+			return ">>=";
+		case TOKEN_LESS:
+			return "<";
+		case TOKEN_LESS_EQ:
+			return "<=";
+		case TOKEN_LEFT_SHIFT:
+			return "<<";
+		case TOKEN_LEFT_SHIFT_ASSIGN:
+			return "<<=";
+		case TOKEN_ARROW:
+			return "->";
+		case TOKEN_AND:
+			return "&&";
+		case TOKEN_AND_ASSIGN:
+			return "&&=";
+		case TOKEN_AMP:
+			return "&";
+		case TOKEN_BIT_AND_ASSIGN:
+			return "&=";
+		case TOKEN_OR:
+			return "||";
+		case TOKEN_OR_ASSIGN:
+			return "||=";
+		case TOKEN_BIT_OR:
+			return "|";
+		case TOKEN_BIT_OR_ASSIGN:
+			return "|=";
+		case TOKEN_BIT_XOR:
+			return "^";
+		case TOKEN_BIT_XOR_ASSIGN:
+			return "^=";
+		case TOKEN_VAR_IDENT:
+			return "<varIdent>";
+		case TOKEN_TYPE_IDENT:
+			return "<TypeIdent>";
+		case TOKEN_STRING:
+			return "<string>";
+		case TOKEN_INTEGER:
+			return "<int>";
+		case TOKEN_REAL:
+			return "<float>";
+		case TOKEN_QUESTION:
+			return "?";
+		case TOKEN_ELVIS:
+			return "?:";
+		case TOKEN_VOID:
+			return "void";
+		case TOKEN_ALIAS:
+			return "alias";
+		case TOKEN_CONST:
+			return "const";
+		case TOKEN_VOLATILE:
+			return "volatile";
+		case TOKEN_ELSE:
+			return "else";
+		case TOKEN_FALSE:
+			return "false";
+		case TOKEN_CONTINUE:
+			return "continue";
+		case TOKEN_FUNC:
+			return "func";
+		case TOKEN_FOR:
+			return "for";
+		case TOKEN_IMPORT:
+			return "import";
+		case TOKEN_MODULE:
+			return "module";
+		case TOKEN_IF:
+			return "if";
+		case TOKEN_NIL:
+			return "nil";
+		case TOKEN_RETURN:
+			return "return";
+		case TOKEN_GOTO:
+			return "goto";
+		case TOKEN_DEFER:
+			return "defer";
+		case TOKEN_TRUE:
+			return "true";
+		case TOKEN_WHILE:
+			return "while";
+		case TOKEN_CASE:
+			return "case";
+		case TOKEN_ASM:
+			return "asm";
+		case TOKEN_DEFAULT:
+			return "default";
+		case TOKEN_SWITCH:
+			return "switch";
+		case TOKEN_UNTIL:
+			return "until";
+		case TOKEN_BREAK:
+			return "break";
+		case TOKEN_TYPE:
+			return "type";
+		case TOKEN_DO:
+			return "do";
+		case TOKEN_PUBLIC:
+			return "public";
+		case TOKEN_LOCAL:
+			return "local";
+		case TOKEN_STRUCT:
+			return "struct";
+		case TOKEN_UNION:
+			return "union";
+		case TOKEN_ENUM:
+			return "enum";
+		case TOKEN_AT:
+			return "@";
+		case TOKEN_AS:
+			return "as";
+		case TOKEN_ERROR:
+			return "<error>"; // NOTE(review): ident_type maps the keyword "error" to TOKEN_ERROR, yet this is a placeholder, not "error" — confirm which is intended
+		case TOKEN_EOF:
+			return "<eof>";
+		case TOKEN_CAST:
+			return "cast";
+		case TOKEN_C_LONGDOUBLE:
+			return "c_longdouble";
+		case TOKEN_C_USHORT:
+			return "c_ushort";
+		case TOKEN_C_UINT:
+			return "c_uint";
+		case TOKEN_C_ULONG:
+			return "c_ulong";
+		case TOKEN_C_ULONGLONG:
+			return "c_ulonglong";
+		case TOKEN_C_SHORT:
+			return "c_ishort"; // NOTE(review): probably meant "c_short" (see c_ident); tests.c EXPECTED_TOKENS = 81 relies on this NOT lexing as a keyword — change both together
+		case TOKEN_C_INT:
+			return "c_int";
+		case TOKEN_C_LONG:
+			return "c_long";
+		case TOKEN_C_LONGLONG:
+			return "c_longlong";
+		case TOKEN_MACRO:
+			return "macro";
+		case TOKEN_F256:
+			return "f256";
+		case TOKEN_I256:
+			return "i256";
+		case TOKEN_U256:
+			return "u256";
+		case TOKEN_F128:
+			return "f128";
+		case TOKEN_I128:
+			return "i128";
+		case TOKEN_U128:
+			return "u128";
+		case TOKEN_F64:
+			return "f64";
+		case TOKEN_I64:
+			return "i64";
+		case TOKEN_U64:
+			return "u64";
+		case TOKEN_F32:
+			return "f32";
+		case TOKEN_I32:
+			return "i32";
+		case TOKEN_U32:
+			return "u32";
+		case TOKEN_F16:
+			return "f16";
+		case TOKEN_I16:
+			return "i16";
+		case TOKEN_U16:
+			return "u16";
+		case TOKEN_I8:
+			return "i8";
+		case TOKEN_U8:
+			return "u8";
+		case TOKEN_BOOL:
+			return "bool";
+		case TOKEN_QUAD:
+			return "quad";
+		case TOKEN_DOUBLE:
+			return "double";
+		case TOKEN_FLOAT:
+			return "float";
+		case TOKEN_LONG:
+			return "long";
+		case TOKEN_ULONG:
+			return "ulong";
+		case TOKEN_INT:
+			return "int";
+		case TOKEN_UINT:
+			return "uint";
+		case TOKEN_SHORT:
+			return "short";
+		case TOKEN_USHORT:
+			return "ushort";
+		case TOKEN_BYTE:
+			return "byte";
+		case TOKEN_CHAR:
+			return "char";
+		case TOKEN_ISIZE:
+			return "isize";
+		case TOKEN_USIZE:
+			return "usize";
+		case TOKEN_CAPS_IDENT:
+			return "<CAPS_IDENT>";
+		case TOKEN_AT_IDENT:
+			return "<@ident>";
+		case TOKEN_HASH_IDENT:
+			return "<#ident>";
+		case TOKEN_DOLLAR_IDENT:
+			return "<$ident>";
+		case TOKEN_CATCH:
+			return "catch";
+		case TOKEN_GENERIC:
+			return "generic";
+		case TOKEN_THROW:
+			return "throw";
+		case TOKEN_THROWS:
+			return "throws";
+		case TOKEN_TRY:
+			return "try";
+		case TOKEN_TYPEDEF:
+			return "typedef";
+		case TOKEN_VAR:
+			return "var";
+		case TOKEN_HALF:
+			return "half";
+		case INVALID_TOKEN:
+			return "<\?\?\?>";
+	}
+	UNREACHABLE // reached only when `type` matches no case above; the macro (errors.h) carries its own trailing ';'
+}
--- a/src/compiler/lexer.h
+++ b/src/compiler/lexer.h
@@ -0,0 +1,182 @@
+#pragma once
+
+// Copyright (c) 2019 Christoffer Lerno. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Every token kind the lexer can produce. tests.c iterates 0..INVALID_TOKEN-1, so the position of INVALID_TOKEN matters.
+typedef enum _TokenType // NOTE(review): a tag starting with '_' + uppercase is a reserved identifier in C (C11 7.1.3)
+{
+	// Single-character tokens.
+	TOKEN_LPAREN,
+	TOKEN_RPAREN,
+	TOKEN_LBRACE,
+	TOKEN_RBRACE,
+	TOKEN_LBRACKET,
+	TOKEN_RBRACKET,
+	TOKEN_COMMA,
+	TOKEN_DOT,
+	TOKEN_EOS,
+	TOKEN_AT,
+
+	// One or two character tokens.
+	TOKEN_PLUS,
+	TOKEN_PLUSPLUS,
+	TOKEN_PLUS_ASSIGN,
+	TOKEN_BIT_NOT,
+	TOKEN_NOT,
+	TOKEN_MINUS,
+	TOKEN_MINUSMINUS,
+	TOKEN_MINUS_ASSIGN,
+	TOKEN_STAR,
+	TOKEN_MULT_ASSIGN,
+	TOKEN_DIV,
+	TOKEN_DIV_ASSIGN,
+	TOKEN_MOD,
+	TOKEN_MOD_ASSIGN,
+	TOKEN_NOT_EQUAL,
+	TOKEN_EQ,
+	TOKEN_EQEQ,
+	TOKEN_COLON,
+	TOKEN_COLCOLON, // Not used but reserved
+	TOKEN_DOTDOT,
+	TOKEN_QUESTION,
+
+	// One to three character tokens.
+	TOKEN_ELIPSIS,
+	TOKEN_GREATER,
+	TOKEN_GREATER_EQ,
+	TOKEN_RIGHT_SHIFT,
+	TOKEN_RIGHT_SHIFT_ASSIGN,
+	TOKEN_LESS,
+	TOKEN_LESS_EQ,
+	TOKEN_LEFT_SHIFT,
+	TOKEN_LEFT_SHIFT_ASSIGN,
+	TOKEN_ARROW, // Not used but reserved
+	TOKEN_AND,
+	TOKEN_AND_ASSIGN,
+	TOKEN_AMP,
+	TOKEN_BIT_AND_ASSIGN,
+	TOKEN_OR,
+	TOKEN_OR_ASSIGN,
+	TOKEN_BIT_OR,
+	TOKEN_BIT_OR_ASSIGN,
+	TOKEN_BIT_XOR,
+	TOKEN_BIT_XOR_ASSIGN,
+	TOKEN_ELVIS,
+	// Built-in type keywords.
+	TOKEN_F256,
+	TOKEN_I256,
+	TOKEN_U256,
+	TOKEN_F128,
+	TOKEN_I128,
+	TOKEN_U128,
+	TOKEN_F64,
+	TOKEN_I64,
+	TOKEN_U64,
+	TOKEN_F32,
+	TOKEN_I32,
+	TOKEN_U32,
+	TOKEN_F16,
+	TOKEN_I16,
+	TOKEN_U16,
+	TOKEN_I8,
+	TOKEN_U8,
+	TOKEN_QUAD,
+	TOKEN_DOUBLE,
+	TOKEN_FLOAT,
+	TOKEN_HALF,
+	TOKEN_LONG,
+	TOKEN_ULONG,
+	TOKEN_INT,
+	TOKEN_UINT,
+	TOKEN_SHORT,
+	TOKEN_USHORT,
+	TOKEN_BYTE,
+	TOKEN_CHAR,
+	TOKEN_BOOL,
+	TOKEN_ISIZE,
+	TOKEN_USIZE,
+
+	// Identifiers and literals.
+
+	// In order to make the grammar
+	// non ambiguous, we split tokens at the
+	// lexer level
+	TOKEN_TYPE_IDENT,
+	TOKEN_CAPS_IDENT,
+	TOKEN_VAR_IDENT,
+
+	// We want to parse @foo / #foo / $foo separately.
+	// Otherwise we allow things like "@ foo" which would be pretty bad.
+	TOKEN_AT_IDENT,
+	TOKEN_HASH_IDENT,
+	TOKEN_DOLLAR_IDENT,
+
+	TOKEN_STRING,
+	TOKEN_INTEGER,
+	TOKEN_REAL,
+
+	// Keywords.
+	TOKEN_ALIAS, // Reserved
+	TOKEN_AS,
+	TOKEN_ASM,
+	TOKEN_BREAK,
+	TOKEN_CASE,
+	TOKEN_CAST,
+	TOKEN_CATCH,
+	TOKEN_CONST,
+	TOKEN_CONTINUE,
+	TOKEN_DEFAULT,
+	TOKEN_DEFER,
+	TOKEN_DO,
+	TOKEN_ELSE,
+	TOKEN_ENUM,
+	TOKEN_ERROR,
+	TOKEN_FALSE,
+	TOKEN_FOR,
+	TOKEN_FUNC,
+	TOKEN_GENERIC,
+	TOKEN_GOTO,
+	TOKEN_IF,
+	TOKEN_IMPORT,
+	TOKEN_LOCAL,
+	TOKEN_MACRO,
+	TOKEN_MODULE,
+	TOKEN_NIL,
+	TOKEN_PUBLIC,
+	TOKEN_RETURN,
+	TOKEN_STRUCT,
+	TOKEN_SWITCH,
+	TOKEN_THROW,
+	TOKEN_THROWS,
+	TOKEN_TRUE,
+	TOKEN_TRY,
+	TOKEN_TYPE, // Reserved
+	TOKEN_TYPEDEF,
+	TOKEN_UNION,
+	TOKEN_UNTIL,
+	TOKEN_VAR, // Reserved
+	TOKEN_VOID,
+	TOKEN_VOLATILE,
+	TOKEN_WHILE,
+
+	// C interop type keywords (the "c_" names resolved by c_ident in lexer.c).
+	TOKEN_C_USHORT,
+	TOKEN_C_SHORT,
+	TOKEN_C_INT,
+	TOKEN_C_UINT,
+	TOKEN_C_LONG,
+	TOKEN_C_ULONG,
+	TOKEN_C_LONGLONG,
+	TOKEN_C_ULONGLONG,
+	TOKEN_C_LONGDOUBLE,
+
+	INVALID_TOKEN, // used by tests.c as the exclusive upper bound and array size when walking all token types
+	TOKEN_EOF, // placed after INVALID_TOKEN, so that walk never visits it
+
+} TokenType;
+// Public lexer helpers, implemented in lexer.c.
+const char *token_type_to_string(TokenType type); // spelling or "<placeholder>" string literal for a token type
+TokenType identifier_type(const char* restrict start, int len); // keyword lookup for (start, len) via the switch-based matcher
+TokenType ident_type_fnv1(const char *restrict start, int len); // same lookup via the hash-switch matcher
--- a/src/compiler_tests/benchmark.c
+++ b/src/compiler_tests/benchmark.c
@@ -0,0 +1,17 @@
+// Copyright (c) 2019 Christoffer Lerno. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "benchmark.h"
+#include <time.h>
+
+static clock_t begin = 0; // processor-time stamp of the last bench_begin(); clock_t avoids truncating clock()'s result
+// Records the current processor time as the start of a measurement.
+void bench_begin(void)
+{
+	begin = clock();
+}
+double bench_mark(void) // Returns the processor time, in seconds, consumed since the last bench_begin().
+{
+	return (double)(clock() - begin) / (double)CLOCKS_PER_SEC;
+}
--- a/src/compiler_tests/benchmark.h
+++ b/src/compiler_tests/benchmark.h
@@ -0,0 +1,9 @@
+#pragma once
+
+// Copyright (c) 2019 Christoffer Lerno. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Minimal stopwatch built on clock(); a single shared start time, so measurements cannot be nested.
+void bench_begin(void); // stores the current clock() value as the start time
+double bench_mark(void); // seconds of processor time since the last bench_begin()
--- a/src/compiler_tests/tests.c
+++ b/src/compiler_tests/tests.c
@@ -0,0 +1,82 @@
+// Copyright (c) 2019 Christoffer Lerno. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <compiler/lexer.h>
+#include "tests.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "benchmark.h"
+// When cond is false: prints "TEST FAILED: <text>" to stdout and exits with status -1. `text` must be a string literal; ##__VA_ARGS__ is a GNU extension.
+#define TEST_ASSERT(cond, text, ...) do { if (!(cond)) { printf("\nTEST FAILED: " text "\n", ##__VA_ARGS__); exit(-1); } } while (0)
+static void test_lexer(void) // Checks both keyword matchers against every token spelling, then times each; always ends the process via exit().
+{
+#ifdef __OPTIMIZE__
+	printf("--- RUNNING OPTIMIZED ---\n");
+#endif
+	printf("Begin lexer testing.\n");
+	printf("1. Check number of keywords...");
+	int tokens_found = 0;
+	const int EXPECTED_TOKENS = 81; // NOTE(review): 81 appears to depend on token_type_to_string returning "<error>" and "c_ishort", neither of which lexes as a keyword — confirm
+	const char* tokens[INVALID_TOKEN];
+	int len[INVALID_TOKEN];
+	for (int i = 0; i < INVALID_TOKEN; i++)
+	{
+		const char* token = token_type_to_string((TokenType)i);
+		tokens[i] = token;
+		len[i] = strlen(token); // size_t narrowed to int
+		TokenType type = identifier_type(token, len[i]);
+		TokenType type2 = ident_type_fnv1(token, len[i]);
+
+		if (type != TOKEN_VAR_IDENT)
+		{
+			tokens_found++;
+			TEST_ASSERT(type == i, "Mismatch on token %s", token);
+			if (type2 != type) // a disagreement between the two matchers is only reported, it does not fail the test
+			{
+				printf("\n(fnv1) Test mismatch on token %s, generated %s\n", token, token_type_to_string(type2));
+			}
+		}
+		tokens[i] = "byte"; // NOTE(review): overwrites the entry stored above, so the timing loops below lex "byte" for every slot — confirm this is intended
+		len[i] = 4;
+	}
+	printf(" %d found.\n", tokens_found);
+	TEST_ASSERT(ident_type_fnv1("alias ", 6) == TOKEN_VAR_IDENT, "Error in fnv1 ident"); // trailing space included: must not match
+	TEST_ASSERT(identifier_type("alias ", 6) == TOKEN_VAR_IDENT, "Error in switch ident");
+	TEST_ASSERT(ident_type_fnv1("alias ", 5) != TOKEN_VAR_IDENT, "Error in fnv1 ident2"); // only "alias" passed: must match
+	TEST_ASSERT(identifier_type("alias ", 5) != TOKEN_VAR_IDENT, "Error in switch ident2");
+	TEST_ASSERT(tokens_found == EXPECTED_TOKENS, "Unexpected number of identifiers! Expected %d.", EXPECTED_TOKENS);
+
+	const int BENCH_REPEATS = 10000000;
+
+	printf("2. Test keyword lexing speed (switch)... ");
+	bench_begin();
+	for (int b = 0; b < BENCH_REPEATS; b++)
+	{
+		for (int i = 0; i < INVALID_TOKEN; i++)
+		{
+			identifier_type(tokens[i], len[i]); // result deliberately unused; only the call is timed
+		}
+	}
+	printf("complete in %fs\n", bench_mark());
+
+	printf("3. Test keyword lexing speed (fnv1)... ");
+	bench_begin();
+	for (int b = 0; b < BENCH_REPEATS; b++)
+	{
+		for (int i = 0; i < INVALID_TOKEN; i++)
+		{
+			ident_type_fnv1(tokens[i], len[i]);
+		}
+	}
+	printf("complete in %fs\n", bench_mark());
+
+
+	exit(0); // terminates the process here, so control never returns to compiler_tests() or main()
+}
+// Entry point for the "utest" command (called from main.c); does not return because test_lexer() calls exit().
+void compiler_tests(void)
+{
+	test_lexer();
+}
--- a/src/compiler_tests/tests.h
+++ b/src/compiler_tests/tests.h
@@ -0,0 +1,8 @@
+#pragma once
+
+// Copyright (c) 2019 Christoffer Lerno. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Runs the built-in compiler self-tests; the current implementation exits the process when finished.
+void compiler_tests(void);
--- a/src/main.c
+++ b/src/main.c
@@ -2,7 +2,7 @@
 #include "build/build_options.h"
 #include "build/project_creation.h"
 #include "utils/errors.h"
-
+#include "compiler_tests/tests.h"

 int main(int argc, const char *argv[])
 {
@@ -12,6 +12,8 @@ int main(int argc, const char *argv[])
 		case COMMAND_INIT:
 			create_project();
 			break;
+		case COMMAND_UNIT_TEST:
+			compiler_tests();
 		case COMMAND_COMPILE:
 		case COMMAND_COMPILE_RUN:
 		case COMMAND_MISSING:
--- a/src/utils/errors.h
+++ b/src/utils/errors.h
@@ -4,4 +4,18 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

+#include <stdio.h>
+#include <stdlib.h>
+
 #define error_exit(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); exit(EXIT_FAILURE); } while(0)
+// Prints "FATAL ERROR at <function>:<line>: <message>" to stderr and exits with EXIT_FAILURE, matching error_exit above. _string must be a string literal.
+#define FATAL_ERROR(_string, ...) do { fprintf(stderr, "FATAL ERROR at %s:%d: " _string, __func__, __LINE__, ##__VA_ARGS__); fprintf(stderr, "\n"); exit(EXIT_FAILURE); } while(0)
+// Both expand to FATAL_ERROR plus a trailing ';', so call sites may omit their own semicolon (lexer.c uses UNREACHABLE that way).
+#define UNREACHABLE FATAL_ERROR("Cannot reach %s:%d", __func__, __LINE__);
+#define TODO FATAL_ERROR("Not done yet %s:%d", __func__, __LINE__);
+// Calls FATAL_ERROR when _condition is false. NOTE(review): tests.c defines its own TEST_ASSERT; a file including both would redefine the macro.
+#define TEST_ASSERT(_condition, _string, ...) while (!(_condition)) { FATAL_ERROR(_string, ##__VA_ARGS__); }
+// Evaluates _value and _expected once each as long long and fails through TEST_ASSERT when they differ; _string must be a string literal.
+#define EXPECT(_string, _value, _expected) \
+ do { long long __tempval1 = _value; long long __tempval2 = _expected; \
+    TEST_ASSERT(__tempval1 == __tempval2, "Checking " _string ": expected %lld but was %lld.", __tempval2, __tempval1); } while(0);