From 7439dccc532bfb5655f10532d4c8b1bf7a49819c Mon Sep 17 00:00:00 2001 From: Christoffer Lerno Date: Thu, 25 Jul 2019 18:57:35 +0200 Subject: [PATCH] Add a little testing and keyword parsing x2 --- CMakeLists.txt | 3 +- src/build/build_options.c | 5 + src/build/build_options.h | 1 + src/compiler/lexer.c | 806 +++++++++++++++++++++++++++++++++ src/compiler/lexer.h | 182 ++++++++ src/compiler_tests/benchmark.c | 17 + src/compiler_tests/benchmark.h | 9 + src/compiler_tests/tests.c | 82 ++++ src/compiler_tests/tests.h | 8 + src/main.c | 4 +- src/utils/errors.h | 14 + 11 files changed, 1129 insertions(+), 2 deletions(-) create mode 100644 src/compiler/lexer.c create mode 100644 src/compiler/lexer.h create mode 100644 src/compiler_tests/benchmark.c create mode 100644 src/compiler_tests/benchmark.h create mode 100644 src/compiler_tests/tests.c create mode 100644 src/compiler_tests/tests.h diff --git a/CMakeLists.txt b/CMakeLists.txt index b0f16fc05..7b2ccf434 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,6 @@ cmake_minimum_required(VERSION 3.13) project(c3c C) +set(CMAKE_CXX_FLAGS_RELEASE "-O3") set(CMAKE_C_STANDARD 11) @@ -14,4 +15,4 @@ add_executable(c3c src/utils/errors.c src/utils/file_utils.c src/utils/string_utils.c - ) \ No newline at end of file + src/compiler/lexer.c src/compiler/lexer.h src/compiler_tests/tests.c src/compiler_tests/tests.h src/compiler_tests/benchmark.c src/compiler_tests/benchmark.h) \ No newline at end of file diff --git a/src/build/build_options.c b/src/build/build_options.c index 17509d642..bc0cab9bf 100644 --- a/src/build/build_options.c +++ b/src/build/build_options.c @@ -128,6 +128,11 @@ static void parse_command(void) build_options.project_name = next_arg(); return; } + if (arg_match("utest")) + { + build_options.command = COMMAND_UNIT_TEST; + return; + } if (arg_match("compile")) { build_options.command = COMMAND_COMPILE; diff --git a/src/build/build_options.h b/src/build/build_options.h index ab8fcf500..bfbb07874 100644 --- a/src/build/build_options.h +++ b/src/build/build_options.h @@ -20,6 +20,7 @@ typedef enum COMMAND_DIST, COMMAND_DOCS, COMMAND_BENCH, + COMMAND_UNIT_TEST, } CompilerCommand; typedef struct diff --git a/src/compiler/lexer.c b/src/compiler/lexer.c new file mode 100644 index 000000000..ecef6258e --- /dev/null +++ b/src/compiler/lexer.c @@ -0,0 +1,806 @@ +// Copyright (c) 2019 Christoffer Lerno. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include "lexer.h" +#include "../utils/string_utils.h" +#include +#include "../utils/errors.h" + +typedef struct +{ + const char *begin; + const char *start; + const char *current; + uint16_t source_file; +/* LexerState lexer_state; + File *current_file; + Token saved_tok; + Token saved_prev_tok; + SourceLoc last_in_range;*/ +} Lexer; + + +Lexer lexer; + + +#define MATCH_KEYWORD_LEN(_keyword, _type) \ + ((sizeof(_keyword) != len + 1) ? TOKEN_VAR_IDENT : check_keyword(start, len, _keyword, _type)) + +#define MATCH_KEYWORD(_keyword, _type) check_keyword(start, len, _keyword, _type) + +// Yes this is an ugly hand written keyword identifier. It should be benchmarked against +// an table based state machine. + +static inline TokenType check_keyword(const char * restrict start, size_t len, const char * restrict keyword, TokenType type) +{ + if (memcmp(start + 1, keyword + 1, len - 1) == 0) + { + return type; + } + return TOKEN_VAR_IDENT; +} + + + +static inline TokenType c_ident(const char *restrict start, const int len) +{ + switch (start[3]) + { + case 'n': + return MATCH_KEYWORD_LEN("c_int", TOKEN_C_INT); + case 'i': + return MATCH_KEYWORD_LEN("c_uint", TOKEN_C_UINT); + case 's': + return MATCH_KEYWORD_LEN("c_ushort", TOKEN_C_USHORT); + case 'h': + return MATCH_KEYWORD_LEN("c_short", TOKEN_C_SHORT); + case 'o': + switch (len) + { + case 10: + return MATCH_KEYWORD("c_longlong", TOKEN_C_LONGLONG); + case 6: + return MATCH_KEYWORD("c_long", TOKEN_C_LONG); + case 12: + return MATCH_KEYWORD("c_longdouble", TOKEN_C_LONGDOUBLE); + default: + return TOKEN_VAR_IDENT; + } + case 'l': + return len == 11 + ? MATCH_KEYWORD("c_ulonglong", TOKEN_C_ULONGLONG) + : MATCH_KEYWORD_LEN("c_ulong", TOKEN_C_ULONG); + default: + return TOKEN_VAR_IDENT; + } +} +static inline TokenType ident_type(const char *restrict start, const int len) +{ + char current_value = start[0]; + if (len < 2) return TOKEN_VAR_IDENT; + if (current_value == 'c' && start[1] == '_') return c_ident(start, len); + if (len > 8 || !is_lower(current_value)) return TOKEN_VAR_IDENT; + switch (current_value) + { + case 'a': + if (len == 2) return MATCH_KEYWORD("as", TOKEN_AS); + switch (start[1]) + { + case 's': + return MATCH_KEYWORD_LEN("asm", TOKEN_ASM); + case 'l': + return MATCH_KEYWORD_LEN("alias", TOKEN_ALIAS); + default: + return TOKEN_VAR_IDENT; + } + case 'b': + switch (start[1]) + { + case 'o': + return MATCH_KEYWORD_LEN("bool", TOKEN_BOOL); + case 'y': + return MATCH_KEYWORD_LEN("byte", TOKEN_BYTE); + case 'r': + return MATCH_KEYWORD_LEN("break", TOKEN_BREAK); + default: + return TOKEN_VAR_IDENT; + } + case 'c': + if (len < 4) return TOKEN_VAR_IDENT; + if (len == 8) return MATCH_KEYWORD_LEN("continue", TOKEN_CONTINUE); + switch (start[3]) + { + case 't': + return MATCH_KEYWORD_LEN("cast", TOKEN_CAST); + case 'e': + return MATCH_KEYWORD_LEN("case", TOKEN_CASE); + case 'r': + return MATCH_KEYWORD_LEN("char", TOKEN_CHAR); + case 's': + return MATCH_KEYWORD_LEN("const", TOKEN_CONST); + case 'c': + return MATCH_KEYWORD_LEN("catch", TOKEN_CATCH); + default: + return TOKEN_VAR_IDENT; + + } + case 'd': + if (len < 5) return MATCH_KEYWORD_LEN("do", TOKEN_DO); + switch (start[3]) + { + case 'e': + return MATCH_KEYWORD_LEN("defer", TOKEN_DEFER); + case 'a': + return MATCH_KEYWORD_LEN("default", TOKEN_DEFAULT); + case 'b': + return MATCH_KEYWORD_LEN("double", TOKEN_DOUBLE); + default: + return TOKEN_VAR_IDENT; + } + case 'e': + switch (start[1]) + { + case 'l': + return MATCH_KEYWORD_LEN("else", TOKEN_ELSE); + case 'n': + return MATCH_KEYWORD_LEN("enum", TOKEN_ENUM); + case 'r': + return MATCH_KEYWORD_LEN("error", TOKEN_ERROR); + default: + return TOKEN_VAR_IDENT; + } + case 'f': + switch (start[2]) + { + case '6': + return MATCH_KEYWORD_LEN("f16", TOKEN_F16); + case '2': + if (len == 4) return MATCH_KEYWORD_LEN("f128", TOKEN_F128); + return MATCH_KEYWORD_LEN("f32", TOKEN_F32); + case '4': + return MATCH_KEYWORD_LEN("f64", TOKEN_F64); + case '5': + return MATCH_KEYWORD_LEN("f256", TOKEN_F256); + case 'r': + return MATCH_KEYWORD_LEN("for", TOKEN_FOR); + case 'l': + return MATCH_KEYWORD_LEN("false", TOKEN_FALSE); + case 'o': + return MATCH_KEYWORD_LEN("float", TOKEN_FLOAT); + case 'n': + return MATCH_KEYWORD_LEN("func", TOKEN_FUNC); + + default: + return TOKEN_VAR_IDENT; + } + case 'g': + switch (start[1]) + { + case 'o': + return MATCH_KEYWORD_LEN("goto", TOKEN_GOTO); + case 'e': + return MATCH_KEYWORD_LEN("generic", TOKEN_GENERIC); + default: + return TOKEN_VAR_IDENT; + } + case 'h': + return MATCH_KEYWORD_LEN("half", TOKEN_HALF); + case 'i': + switch (start[1]) + { + case 'f': + return MATCH_KEYWORD_LEN("if", TOKEN_IF); + case '8': + return MATCH_KEYWORD_LEN("i8", TOKEN_I8); + case '6': + return MATCH_KEYWORD_LEN("i64", TOKEN_I64); + case '2': + return MATCH_KEYWORD_LEN("i256", TOKEN_I256); + case '3': + return MATCH_KEYWORD_LEN("i32", TOKEN_I32); + case '1': + if (len == 4) return MATCH_KEYWORD_LEN("i128", TOKEN_I128); + return MATCH_KEYWORD_LEN("i16", TOKEN_I16); + case 'n': + return MATCH_KEYWORD_LEN("int", TOKEN_INT); + case 'm': + return MATCH_KEYWORD_LEN("import", TOKEN_IMPORT); + case 's': + return MATCH_KEYWORD_LEN("isize", TOKEN_ISIZE); + default: + return TOKEN_VAR_IDENT; + } + case 'l': + if (len < 4) return TOKEN_VAR_IDENT; + switch (start[2]) + { + case 'n': + return MATCH_KEYWORD_LEN("long", TOKEN_LONG); + case 'c': + return MATCH_KEYWORD_LEN("local", TOKEN_LOCAL); + default: + return TOKEN_VAR_IDENT; + } + case 'm': + switch (start[1]) + { + case 'a': + return MATCH_KEYWORD_LEN("macro", TOKEN_MACRO); + case 'o': + return MATCH_KEYWORD_LEN("module", TOKEN_MODULE); + default: + return TOKEN_VAR_IDENT; + } + case 'n': + return MATCH_KEYWORD_LEN("nil", TOKEN_NIL); + case 'p': + return MATCH_KEYWORD_LEN("public", TOKEN_PUBLIC); + case 'q': + return MATCH_KEYWORD_LEN("quad", TOKEN_QUAD); + case 'r': + return MATCH_KEYWORD_LEN("return", TOKEN_RETURN); + case 's': + switch (start[1]) + { + case 'h': + return MATCH_KEYWORD_LEN("short", TOKEN_SHORT); + case 't': + return MATCH_KEYWORD_LEN("struct", TOKEN_STRUCT); + case 'w': + return MATCH_KEYWORD_LEN("switch", TOKEN_SWITCH); + default: + return TOKEN_VAR_IDENT; + } + case 't': + if (len < 3) return TOKEN_VAR_IDENT; + switch (start[2]) + { + case 'p': + if (len == 7) return MATCH_KEYWORD_LEN("typedef", TOKEN_TYPEDEF); + return MATCH_KEYWORD_LEN("type", TOKEN_TYPE); + case 'u': + return MATCH_KEYWORD_LEN("true", TOKEN_TRUE); + case 'y': + return MATCH_KEYWORD_LEN("try", TOKEN_TRY); + case 'r': + if (len == 6) return MATCH_KEYWORD_LEN("throws", TOKEN_THROWS); + return MATCH_KEYWORD_LEN("throw", TOKEN_THROW); + default: + return TOKEN_VAR_IDENT; + } + case 'u': + if (len < 3) return MATCH_KEYWORD_LEN("u8", TOKEN_U8); + switch (start[1]) + { + case '1': + if (len == 4) return MATCH_KEYWORD("u128", TOKEN_U128); + return MATCH_KEYWORD_LEN("u16", TOKEN_U16); + case '2': + return MATCH_KEYWORD_LEN("u256", TOKEN_U256); + case '3': + return MATCH_KEYWORD_LEN("u32", TOKEN_U32); + case '6': + return MATCH_KEYWORD_LEN("u64", TOKEN_U64); + case 'i': + return MATCH_KEYWORD_LEN("uint", TOKEN_UINT); + case 'n': + if (start[2] == 't') return MATCH_KEYWORD_LEN("until", TOKEN_UNTIL); + return MATCH_KEYWORD_LEN("union", TOKEN_UNION); + case 'l': + return MATCH_KEYWORD_LEN("ulong", TOKEN_ULONG); + case 's': + if (len == 5) return MATCH_KEYWORD("usize", TOKEN_USIZE); + return MATCH_KEYWORD_LEN("ushort", TOKEN_USHORT); + default: + return TOKEN_VAR_IDENT; + } + case 'v': + if (len < 3) return TOKEN_VAR_IDENT; + switch (start[2]) + { + case 'r': + return MATCH_KEYWORD_LEN("var", TOKEN_VAR); + case 'i': + return MATCH_KEYWORD_LEN("void", TOKEN_VOID); + case 'l': + return MATCH_KEYWORD_LEN("volatile", TOKEN_VOLATILE); + default: + return TOKEN_VAR_IDENT; + } + case 'w': + return MATCH_KEYWORD_LEN("while", TOKEN_WHILE); + default: + return TOKEN_VAR_IDENT; + } +} + +#define PRIME 0x01000193 +#define SEED 0x811C9DC5 + +#define FNV1(a, seed) ((uint32_t)((((unsigned int)(a)) ^ (seed)) * PRIME)) +#define HASH(a, b, c) (FNV1(c, FNV1((a), FNV1(b, SEED))) & 0x1FFu) + +TokenType ident_type_fnv1(const char *restrict start, int len) +{ + char current_value = start[0]; + if (len < 2) return TOKEN_VAR_IDENT; + char second = start[1]; + if (current_value == 'c' && second == '_') return c_ident(start, len); + if (len > 8 || !is_lower(current_value)) return TOKEN_VAR_IDENT; + switch (HASH(len, current_value, second)) + { + case HASH(2, 'a', 's'): + return MATCH_KEYWORD_LEN("as", TOKEN_AS); + case HASH(3, 'a', 's'): + return MATCH_KEYWORD_LEN("asm", TOKEN_ASM); + case HASH(5, 'a', 'l'): + return MATCH_KEYWORD_LEN("alias", TOKEN_ALIAS); + case HASH(4, 'b', 'o'): + return MATCH_KEYWORD_LEN("bool", TOKEN_BOOL); + case HASH(4, 'b', 'y'): + return MATCH_KEYWORD_LEN("byte", TOKEN_BYTE); + case HASH(5, 'b', 'r'): + return MATCH_KEYWORD_LEN("break", TOKEN_BREAK); + case HASH(8, 'c', 'o'): + return MATCH_KEYWORD_LEN("continue", TOKEN_CONTINUE); + case HASH(4, 'c', 'a'): + return len > 3 && start[3] == 't' ? MATCH_KEYWORD_LEN("cast", TOKEN_CAST) : MATCH_KEYWORD_LEN("case", TOKEN_CASE); + case HASH(5, 'c', '_'): + return MATCH_KEYWORD_LEN("c_int", TOKEN_C_INT); + case HASH(6, 'c', '_'): + return len > 3 && start[2] == 'u' ? MATCH_KEYWORD_LEN("c_uint", TOKEN_C_UINT) : MATCH_KEYWORD_LEN("c_long", TOKEN_C_LONG); + case HASH(7, 'c', '_'): + return MATCH_KEYWORD_LEN("c_short", TOKEN_C_SHORT); + case HASH(8, 'c', '_'): + return MATCH_KEYWORD_LEN("c_ushort", TOKEN_C_USHORT); + case HASH(4, 'c', 'h'): + return MATCH_KEYWORD_LEN("char", TOKEN_CHAR); + case HASH(5, 'c', 'o'): + return MATCH_KEYWORD_LEN("const", TOKEN_CONST); + case HASH(5, 'c', 'a'): + return MATCH_KEYWORD_LEN("catch", TOKEN_CATCH); + case HASH(2, 'd', 'o'): + return MATCH_KEYWORD_LEN("do", TOKEN_DO); + case HASH(5, 'd', 'e'): + return MATCH_KEYWORD_LEN("defer", TOKEN_DEFER); + case HASH(7, 'd', 'e'): + return MATCH_KEYWORD_LEN("default", TOKEN_DEFAULT); + case HASH(6, 'd', 'o'): + return MATCH_KEYWORD_LEN("double", TOKEN_DOUBLE); + case HASH(4, 'e', 'l'): + return MATCH_KEYWORD_LEN("else", TOKEN_ELSE); + case HASH(4, 'e', 'n'): + return MATCH_KEYWORD_LEN("enum", TOKEN_ENUM); + case HASH(5, 'e', 'r'): + return MATCH_KEYWORD_LEN("error", TOKEN_ERROR); + case HASH(3, 'f', '1'): + return MATCH_KEYWORD_LEN("f16", TOKEN_F16); + case HASH(4, 'f', '1'): + return MATCH_KEYWORD_LEN("f128", TOKEN_F128); + case HASH(3, 'f', '3'): + return MATCH_KEYWORD_LEN("f32", TOKEN_F32); + case HASH(3, 'f', '6'): + return MATCH_KEYWORD_LEN("f64", TOKEN_F64); + case HASH(4, 'f', '2'): + return MATCH_KEYWORD_LEN("f256", TOKEN_F256); + case HASH(3, 'f', 'o'): + return MATCH_KEYWORD_LEN("for", TOKEN_FOR); + case HASH(5, 'f', 'a'): + return MATCH_KEYWORD_LEN("false", TOKEN_FALSE); + case HASH(5, 'f', 'l'): + return MATCH_KEYWORD_LEN("float", TOKEN_FLOAT); + case HASH(4, 'f', 'u'): + return MATCH_KEYWORD_LEN("func", TOKEN_FUNC); + case HASH(4, 'g', 'o'): + return MATCH_KEYWORD_LEN("goto", TOKEN_GOTO); + case HASH(7, 'g', 'e'): + return MATCH_KEYWORD_LEN("generic", TOKEN_GENERIC); + case HASH(4, 'h', 'a'): + return MATCH_KEYWORD_LEN("half", TOKEN_HALF); + case HASH(2, 'i', 'f'): + return MATCH_KEYWORD_LEN("if", TOKEN_IF); + case HASH(2, 'i', '8'): + return MATCH_KEYWORD_LEN("i8", TOKEN_I8); + case HASH(3, 'i', '6'): + return MATCH_KEYWORD_LEN("i64", TOKEN_I64); + case HASH(4, 'i', '2'): + return MATCH_KEYWORD_LEN("i256", TOKEN_I256); + case HASH(3, 'i', '3'): + return MATCH_KEYWORD_LEN("i32", TOKEN_I32); + case HASH(4, 'i', '1'): + return MATCH_KEYWORD_LEN("i128", TOKEN_I128); + case HASH(3, 'i', '1'): + return MATCH_KEYWORD_LEN("i16", TOKEN_I16); + case HASH(3, 'i', 'n'): + return MATCH_KEYWORD_LEN("int", TOKEN_INT); + case HASH(6, 'i', 'm'): + return MATCH_KEYWORD_LEN("import", TOKEN_IMPORT); + case HASH(5, 'i', 's'): + return MATCH_KEYWORD_LEN("isize", TOKEN_ISIZE); + case HASH(4, 'l', 'o'): + return MATCH_KEYWORD_LEN("long", TOKEN_LONG); + case HASH(5, 'l', 'o'): + return MATCH_KEYWORD_LEN("local", TOKEN_LOCAL); + case HASH(5, 'm', 'a'): + return MATCH_KEYWORD_LEN("macro", TOKEN_MACRO); + case HASH(6, 'm', 'o'): + return MATCH_KEYWORD_LEN("module", TOKEN_MODULE); + case HASH(3, 'n', 'i'): + return MATCH_KEYWORD_LEN("nil", TOKEN_NIL); + case HASH(6, 'p', 'u'): + return MATCH_KEYWORD_LEN("public", TOKEN_PUBLIC); + case HASH(4, 'q', 'u'): + return MATCH_KEYWORD_LEN("quad", TOKEN_QUAD); + case HASH(6, 'r', 'e'): + return MATCH_KEYWORD_LEN("return", TOKEN_RETURN); + case HASH(5, 's', 'h'): + return MATCH_KEYWORD_LEN("short", TOKEN_SHORT); + case HASH(6, 's', 't'): + return MATCH_KEYWORD_LEN("struct", TOKEN_STRUCT); + case HASH(6, 's', 'w'): + return MATCH_KEYWORD_LEN("switch", TOKEN_SWITCH); + case HASH(7, 't', 'y'): + return MATCH_KEYWORD_LEN("typedef", TOKEN_TYPEDEF); + case HASH(4, 't', 'y'): + return MATCH_KEYWORD_LEN("type", TOKEN_TYPE); + case HASH(4, 't', 'r'): + return MATCH_KEYWORD_LEN("true", TOKEN_TRUE); + case HASH(3, 't', 'r'): + return MATCH_KEYWORD_LEN("try", TOKEN_TRY); + case HASH(6, 't', 'h'): + return MATCH_KEYWORD_LEN("throws", TOKEN_THROWS); + case HASH(5, 't', 'h'): + return MATCH_KEYWORD_LEN("throw", TOKEN_THROW); + case HASH(2, 'u', '8'): + return MATCH_KEYWORD_LEN("u8", TOKEN_U8); + case HASH(4, 'u', '1'): + return MATCH_KEYWORD_LEN("u128", TOKEN_U128); + case HASH(3, 'u', '1'): + return MATCH_KEYWORD_LEN("u16", TOKEN_U16); + case HASH(4, 'u', '2'): + return MATCH_KEYWORD_LEN("u256", TOKEN_U256); + case HASH(3, 'u', '3'): + return MATCH_KEYWORD_LEN("u32", TOKEN_U32); + case HASH(3, 'u', '6'): + return MATCH_KEYWORD_LEN("u64", TOKEN_U64); + case HASH(4, 'u', 'i'): + return MATCH_KEYWORD_LEN("uint", TOKEN_UINT); + case HASH(5, 'u', 'n'): + if (start[2] == 't') return MATCH_KEYWORD_LEN("until", TOKEN_UNTIL); + return MATCH_KEYWORD_LEN("union", TOKEN_UNION); + case HASH(5, 'u', 'l'): + return MATCH_KEYWORD_LEN("ulong", TOKEN_ULONG); + case HASH(5, 'u', 's'): + return MATCH_KEYWORD_LEN("usize", TOKEN_USIZE); + case HASH(6, 'u', 's'): + return MATCH_KEYWORD_LEN("ushort", TOKEN_USHORT); + case HASH(3, 'v', 'a'): + return MATCH_KEYWORD_LEN("var", TOKEN_VAR); + case HASH(4, 'v', 'o'): + return MATCH_KEYWORD_LEN("void", TOKEN_VOID); + case HASH(8, 'v', 'o'): + return MATCH_KEYWORD_LEN("volatile", TOKEN_VOLATILE); + case HASH(5, 'w', 'h'): + return MATCH_KEYWORD_LEN("while", TOKEN_WHILE); + default: + return TOKEN_VAR_IDENT; + } +} + + +#undef HASH4V +#undef HASH4 +#undef HASH3V +#undef HASH3 +#undef HASH2V +#undef HASH2 +#undef HASH + +#undef MATCH_KEYWORD + +TokenType identifier_type(const char* restrict start, int len) +{ + return ident_type(start, len); +} + +const char *token_type_to_string(TokenType type) +{ + switch (type) + { + case TOKEN_LPAREN: + return "("; + case TOKEN_RPAREN: + return ")"; + case TOKEN_LBRACE: + return "{"; + case TOKEN_RBRACE: + return "}"; + case TOKEN_LBRACKET: + return "["; + case TOKEN_RBRACKET: + return "]"; + case TOKEN_COMMA: + return ","; + case TOKEN_DOT: + return "."; + case TOKEN_EOS: + return ";"; + case TOKEN_PLUS: + return "+"; + case TOKEN_PLUSPLUS: + return "++"; + case TOKEN_PLUS_ASSIGN: + return "+="; + case TOKEN_BIT_NOT: + return "~"; + case TOKEN_NOT: + return "!"; + case TOKEN_MINUS: + return "-"; + case TOKEN_MINUSMINUS: + return "--"; + case TOKEN_MINUS_ASSIGN: + return "-="; + case TOKEN_STAR: + return "*"; + case TOKEN_MULT_ASSIGN: + return "*="; + case TOKEN_MOD: + return "%"; + case TOKEN_MOD_ASSIGN: + return "%="; + case TOKEN_DIV: + return "/"; + case TOKEN_DIV_ASSIGN: + return "/="; + case TOKEN_NOT_EQUAL: + return "!="; + case TOKEN_EQ: + return "="; + case TOKEN_EQEQ: + return "=="; + case TOKEN_COLON: + return ":"; + case TOKEN_COLCOLON: + return "::"; + case TOKEN_DOTDOT: + return ".."; + case TOKEN_ELIPSIS: + return "..."; + case TOKEN_GREATER: + return ">"; + case TOKEN_GREATER_EQ: + return ">="; + case TOKEN_RIGHT_SHIFT: + return ">>"; + case TOKEN_RIGHT_SHIFT_ASSIGN: + return ">>="; + case TOKEN_LESS: + return "<"; + case TOKEN_LESS_EQ: + return "<="; + case TOKEN_LEFT_SHIFT: + return "<<"; + case TOKEN_LEFT_SHIFT_ASSIGN: + return "<<="; + case TOKEN_ARROW: + return "->"; + case TOKEN_AND: + return "&&"; + case TOKEN_AND_ASSIGN: + return "&&="; + case TOKEN_AMP: + return "&"; + case TOKEN_BIT_AND_ASSIGN: + return "&="; + case TOKEN_OR: + return "||"; + case TOKEN_OR_ASSIGN: + return "||="; + case TOKEN_BIT_OR: + return "|"; + case TOKEN_BIT_OR_ASSIGN: + return "|="; + case TOKEN_BIT_XOR: + return "^"; + case TOKEN_BIT_XOR_ASSIGN: + return "^="; + case TOKEN_VAR_IDENT: + return ""; + case TOKEN_TYPE_IDENT: + return ""; + case TOKEN_STRING: + return ""; + case TOKEN_INTEGER: + return ""; + case TOKEN_REAL: + return ""; + case TOKEN_QUESTION: + return "?"; + case TOKEN_ELVIS: + return "?:"; + case TOKEN_VOID: + return "void"; + case TOKEN_ALIAS: + return "alias"; + case TOKEN_CONST: + return "const"; + case TOKEN_VOLATILE: + return "volatile"; + case TOKEN_ELSE: + return "else"; + case TOKEN_FALSE: + return "false"; + case TOKEN_CONTINUE: + return "continue"; + case TOKEN_FUNC: + return "func"; + case TOKEN_FOR: + return "for"; + case TOKEN_IMPORT: + return "import"; + case TOKEN_MODULE: + return "module"; + case TOKEN_IF: + return "if"; + case TOKEN_NIL: + return "nil"; + case TOKEN_RETURN: + return "return"; + case TOKEN_GOTO: + return "goto"; + case TOKEN_DEFER: + return "defer"; + case TOKEN_TRUE: + return "true"; + case TOKEN_WHILE: + return "while"; + case TOKEN_CASE: + return "case"; + case TOKEN_ASM: + return "asm"; + case TOKEN_DEFAULT: + return "default"; + case TOKEN_SWITCH: + return "switch"; + case TOKEN_UNTIL: + return "until"; + case TOKEN_BREAK: + return "break"; + case TOKEN_TYPE: + return "type"; + case TOKEN_DO: + return "do"; + case TOKEN_PUBLIC: + return "public"; + case TOKEN_LOCAL: + return "local"; + case TOKEN_STRUCT: + return "struct"; + case TOKEN_UNION: + return "union"; + case TOKEN_ENUM: + return "enum"; + case TOKEN_AT: + return "@"; + case TOKEN_AS: + return "as"; + case TOKEN_ERROR: + return ""; + case TOKEN_EOF: + return ""; + case TOKEN_CAST: + return "cast"; + case TOKEN_C_LONGDOUBLE: + return "c_longdouble"; + case TOKEN_C_USHORT: + return "c_ushort"; + case TOKEN_C_UINT: + return "c_uint"; + case TOKEN_C_ULONG: + return "c_ulong"; + case TOKEN_C_ULONGLONG: + return "c_ulonglong"; + case TOKEN_C_SHORT: + return "c_ishort"; + case TOKEN_C_INT: + return "c_int"; + case TOKEN_C_LONG: + return "c_long"; + case TOKEN_C_LONGLONG: + return "c_longlong"; + case TOKEN_MACRO: + return "macro"; + case TOKEN_F256: + return "f256"; + case TOKEN_I256: + return "i256"; + case TOKEN_U256: + return "u256"; + case TOKEN_F128: + return "f128"; + case TOKEN_I128: + return "i128"; + case TOKEN_U128: + return "u128"; + case TOKEN_F64: + return "f64"; + case TOKEN_I64: + return "i64"; + case TOKEN_U64: + return "u64"; + case TOKEN_F32: + return "f32"; + case TOKEN_I32: + return "i32"; + case TOKEN_U32: + return "u32"; + case TOKEN_F16: + return "f16"; + case TOKEN_I16: + return "i16"; + case TOKEN_U16: + return "u16"; + case TOKEN_I8: + return "i8"; + case TOKEN_U8: + return "u8"; + case TOKEN_BOOL: + return "bool"; + case TOKEN_QUAD: + return "quad"; + case TOKEN_DOUBLE: + return "double"; + case TOKEN_FLOAT: + return "float"; + case TOKEN_LONG: + return "long"; + case TOKEN_ULONG: + return "ulong"; + case TOKEN_INT: + return "int"; + case TOKEN_UINT: + return "uint"; + case TOKEN_SHORT: + return "short"; + case TOKEN_USHORT: + return "ushort"; + case TOKEN_BYTE: + return "byte"; + case TOKEN_CHAR: + return "char"; + case TOKEN_ISIZE: + return "isize"; + case TOKEN_USIZE: + return "usize"; + case TOKEN_CAPS_IDENT: + return ""; + case TOKEN_AT_IDENT: + return "<@ident>"; + case TOKEN_HASH_IDENT: + return "<#ident>"; + case TOKEN_DOLLAR_IDENT: + return "<$ident>"; + case TOKEN_CATCH: + return "catch"; + case TOKEN_GENERIC: + return "generic"; + case TOKEN_THROW: + return "throw"; + case TOKEN_THROWS: + return "throws"; + case TOKEN_TRY: + return "try"; + case TOKEN_TYPEDEF: + return "typedef"; + case TOKEN_VAR: + return "var"; + case TOKEN_HALF: + return "half"; + case INVALID_TOKEN: + return "<\?\?\?>"; + } + UNREACHABLE +} \ No newline at end of file diff --git a/src/compiler/lexer.h b/src/compiler/lexer.h new file mode 100644 index 000000000..e243e89b5 --- /dev/null +++ b/src/compiler/lexer.h @@ -0,0 +1,182 @@ +#pragma once + +// Copyright (c) 2019 Christoffer Lerno. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + + +typedef enum _TokenType +{ + // Single-character tokens. + TOKEN_LPAREN, + TOKEN_RPAREN, + TOKEN_LBRACE, + TOKEN_RBRACE, + TOKEN_LBRACKET, + TOKEN_RBRACKET, + TOKEN_COMMA, + TOKEN_DOT, + TOKEN_EOS, + TOKEN_AT, + + // One or two character tokens. + TOKEN_PLUS, + TOKEN_PLUSPLUS, + TOKEN_PLUS_ASSIGN, + TOKEN_BIT_NOT, + TOKEN_NOT, + TOKEN_MINUS, + TOKEN_MINUSMINUS, + TOKEN_MINUS_ASSIGN, + TOKEN_STAR, + TOKEN_MULT_ASSIGN, + TOKEN_DIV, + TOKEN_DIV_ASSIGN, + TOKEN_MOD, + TOKEN_MOD_ASSIGN, + TOKEN_NOT_EQUAL, + TOKEN_EQ, + TOKEN_EQEQ, + TOKEN_COLON, + TOKEN_COLCOLON, // Not used but reserved + TOKEN_DOTDOT, + TOKEN_QUESTION, + + // Three or more + TOKEN_ELIPSIS, + TOKEN_GREATER, + TOKEN_GREATER_EQ, + TOKEN_RIGHT_SHIFT, + TOKEN_RIGHT_SHIFT_ASSIGN, + TOKEN_LESS, + TOKEN_LESS_EQ, + TOKEN_LEFT_SHIFT, + TOKEN_LEFT_SHIFT_ASSIGN, + TOKEN_ARROW, // Not used but reserved + TOKEN_AND, + TOKEN_AND_ASSIGN, + TOKEN_AMP, + TOKEN_BIT_AND_ASSIGN, + TOKEN_OR, + TOKEN_OR_ASSIGN, + TOKEN_BIT_OR, + TOKEN_BIT_OR_ASSIGN, + TOKEN_BIT_XOR, + TOKEN_BIT_XOR_ASSIGN, + TOKEN_ELVIS, + + TOKEN_F256, + TOKEN_I256, + TOKEN_U256, + TOKEN_F128, + TOKEN_I128, + TOKEN_U128, + TOKEN_F64, + TOKEN_I64, + TOKEN_U64, + TOKEN_F32, + TOKEN_I32, + TOKEN_U32, + TOKEN_F16, + TOKEN_I16, + TOKEN_U16, + TOKEN_I8, + TOKEN_U8, + TOKEN_QUAD, + TOKEN_DOUBLE, + TOKEN_FLOAT, + TOKEN_HALF, + TOKEN_LONG, + TOKEN_ULONG, + TOKEN_INT, + TOKEN_UINT, + TOKEN_SHORT, + TOKEN_USHORT, + TOKEN_BYTE, + TOKEN_CHAR, + TOKEN_BOOL, + TOKEN_ISIZE, + TOKEN_USIZE, + + // Literals. + + // In order to make the grammar + // non ambiguous, we split tokens at the + // lexer level + TOKEN_TYPE_IDENT, + TOKEN_CAPS_IDENT, + TOKEN_VAR_IDENT, + + // We want to parse @foo / #foo / $foo separately. + // Otherwise we allow things like "@ foo" which would be pretty bad. + TOKEN_AT_IDENT, + TOKEN_HASH_IDENT, + TOKEN_DOLLAR_IDENT, + + TOKEN_STRING, + TOKEN_INTEGER, + TOKEN_REAL, + + // Keywords. + TOKEN_ALIAS, // Reserved + TOKEN_AS, + TOKEN_ASM, + TOKEN_BREAK, + TOKEN_CASE, + TOKEN_CAST, + TOKEN_CATCH, + TOKEN_CONST, + TOKEN_CONTINUE, + TOKEN_DEFAULT, + TOKEN_DEFER, + TOKEN_DO, + TOKEN_ELSE, + TOKEN_ENUM, + TOKEN_ERROR, + TOKEN_FALSE, + TOKEN_FOR, + TOKEN_FUNC, + TOKEN_GENERIC, + TOKEN_GOTO, + TOKEN_IF, + TOKEN_IMPORT, + TOKEN_LOCAL, + TOKEN_MACRO, + TOKEN_MODULE, + TOKEN_NIL, + TOKEN_PUBLIC, + TOKEN_RETURN, + TOKEN_STRUCT, + TOKEN_SWITCH, + TOKEN_THROW, + TOKEN_THROWS, + TOKEN_TRUE, + TOKEN_TRY, + TOKEN_TYPE, // Reserved + TOKEN_TYPEDEF, + TOKEN_UNION, + TOKEN_UNTIL, + TOKEN_VAR, // Reserved + TOKEN_VOID, + TOKEN_VOLATILE, + TOKEN_WHILE, + + + TOKEN_C_USHORT, + TOKEN_C_SHORT, + TOKEN_C_INT, + TOKEN_C_UINT, + TOKEN_C_LONG, + TOKEN_C_ULONG, + TOKEN_C_LONGLONG, + TOKEN_C_ULONGLONG, + TOKEN_C_LONGDOUBLE, + + INVALID_TOKEN, + TOKEN_EOF, + +} TokenType; + +const char *token_type_to_string(TokenType type); +TokenType identifier_type(const char* restrict start, int len); +TokenType ident_type_fnv1(const char *restrict start, int len); diff --git a/src/compiler_tests/benchmark.c b/src/compiler_tests/benchmark.c new file mode 100644 index 000000000..7b551edba --- /dev/null +++ b/src/compiler_tests/benchmark.c @@ -0,0 +1,17 @@ +// Copyright (c) 2019 Christoffer Lerno. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "benchmark.h" +#include + +static int begin = 0; + +void bench_begin(void) +{ + begin = clock(); +} +double bench_mark(void) +{ + return (clock() - begin) / (double)CLOCKS_PER_SEC; +} \ No newline at end of file diff --git a/src/compiler_tests/benchmark.h b/src/compiler_tests/benchmark.h new file mode 100644 index 000000000..6165a53fc --- /dev/null +++ b/src/compiler_tests/benchmark.h @@ -0,0 +1,9 @@ +#pragma once + +// Copyright (c) 2019 Christoffer Lerno. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + + +void bench_begin(void); +double bench_mark(void); \ No newline at end of file diff --git a/src/compiler_tests/tests.c b/src/compiler_tests/tests.c new file mode 100644 index 000000000..91f767c22 --- /dev/null +++ b/src/compiler_tests/tests.c @@ -0,0 +1,82 @@ +// Copyright (c) 2019 Christoffer Lerno. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include "tests.h" +#include +#include +#include +#include "benchmark.h" + +#define TEST_ASSERT(cond, text, ...) do { if (!(cond)) { printf("\nTEST FAILED: " text "\n", ##__VA_ARGS__); exit(-1); } } while (0) +static void test_lexer(void) +{ +#ifdef __OPTIMIZE__ + printf("--- RUNNING OPTIMIZED ---\n"); +#endif + printf("Begin lexer testing.\n"); + printf("1. Check number of keywords..."); + int tokens_found = 0; + const int EXPECTED_TOKENS = 81; + const char* tokens[INVALID_TOKEN]; + int len[INVALID_TOKEN]; + for (int i = 0; i < INVALID_TOKEN; i++) + { + const char* token = token_type_to_string((TokenType)i); + tokens[i] = token; + len[i] = strlen(token); + TokenType type = identifier_type(token, len[i]); + TokenType type2 = ident_type_fnv1(token, len[i]); + + if (type != TOKEN_VAR_IDENT) + { + tokens_found++; + TEST_ASSERT(type == i, "Mismatch on token %s", token); + if (type2 != type) + { + printf("\n(fnv1) Test mismatch on token %s, generated %s\n", token, token_type_to_string(type2)); + } + } + tokens[i] = "byte"; + len[i] = 4; + } + printf(" %d found.\n", tokens_found); + TEST_ASSERT(ident_type_fnv1("alias ", 6) == TOKEN_VAR_IDENT, "Error in fnv1 ident"); + TEST_ASSERT(identifier_type("alias ", 6) == TOKEN_VAR_IDENT, "Error in switch ident"); + TEST_ASSERT(ident_type_fnv1("alias ", 5) != TOKEN_VAR_IDENT, "Error in fnv1 ident2"); + TEST_ASSERT(identifier_type("alias ", 5) != TOKEN_VAR_IDENT, "Error in switch ident2"); + TEST_ASSERT(tokens_found == EXPECTED_TOKENS, "Unexpected number of identifiers! Expected %d.", EXPECTED_TOKENS); + + const int BENCH_REPEATS = 10000000; + + printf("2. Test keyword lexing speed (switch)... "); + bench_begin(); + for (int b = 0; b < BENCH_REPEATS; b++) + { + for (int i = 0; i < INVALID_TOKEN; i++) + { + identifier_type(tokens[i], len[i]); + } + } + printf("complete in %fs\n", bench_mark()); + + printf("3. Test keyword lexing speed (fnv1)... "); + bench_begin(); + for (int b = 0; b < BENCH_REPEATS; b++) + { + for (int i = 0; i < INVALID_TOKEN; i++) + { + ident_type_fnv1(tokens[i], len[i]); + } + } + printf("complete in %fs\n", bench_mark()); + + + exit(0); +} + +void compiler_tests(void) +{ + test_lexer(); +} \ No newline at end of file diff --git a/src/compiler_tests/tests.h b/src/compiler_tests/tests.h new file mode 100644 index 000000000..7171423c9 --- /dev/null +++ b/src/compiler_tests/tests.h @@ -0,0 +1,8 @@ +#pragma once + +// Copyright (c) 2019 Christoffer Lerno. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + + +void compiler_tests(void); \ No newline at end of file diff --git a/src/main.c b/src/main.c index a3b1a4146..624881b59 100644 --- a/src/main.c +++ b/src/main.c @@ -2,7 +2,7 @@ #include "build/build_options.h" #include "build/project_creation.h" #include "utils/errors.h" - +#include "compiler_tests/tests.h" int main(int argc, const char *argv[]) { @@ -12,6 +12,8 @@ int main(int argc, const char *argv[]) case COMMAND_INIT: create_project(); break; + case COMMAND_UNIT_TEST: + compiler_tests(); case COMMAND_COMPILE: case COMMAND_COMPILE_RUN: case COMMAND_MISSING: diff --git a/src/utils/errors.h b/src/utils/errors.h index 812989395..4b8f9625b 100644 --- a/src/utils/errors.h +++ b/src/utils/errors.h @@ -4,4 +4,18 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#include +#include + #define error_exit(...) do { fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); exit(EXIT_FAILURE); } while(0) + +#define FATAL_ERROR(_string, ...) do { printf("FATAL ERROR at %s:%d: " _string, __func__, __LINE__, ##__VA_ARGS__); printf("\n"); exit(-1); } while(0) + +#define UNREACHABLE FATAL_ERROR("Cannot reach %s:%d", __func__, __LINE__); +#define TODO FATAL_ERROR("Not done yet %s:%d", __func__, __LINE__); + +#define TEST_ASSERT(_condition, _string, ...) while (!(_condition)) { FATAL_ERROR(_string, ##__VA_ARGS__); } + +#define EXPECT(_string, _value, _expected) \ + do { long long __tempval1 = _value; long long __tempval2 = _expected; \ + TEST_ASSERT(__tempval1 == __tempval2, "Checking " _string ": expected %lld but was %lld.", __tempval2, __tempval1); } while(0);