From 3b718335ecffa24004ca7afc345fae467225f152 Mon Sep 17 00:00:00 2001 From: Christoffer Lerno Date: Sat, 1 Nov 2025 00:12:32 +0100 Subject: [PATCH] - Improve multiline string parser inside compiler #2552 --- releasenotes.md | 2 + src/compiler/compiler_internal.h | 1 - src/compiler/parse_expr.c | 72 +++++++++++++++++++++++--------- src/utils/lib.h | 5 ++- src/utils/stringutils.c | 16 ++++++- 5 files changed, 73 insertions(+), 23 deletions(-) diff --git a/releasenotes.md b/releasenotes.md index 5bb46e94b..beb10c1d7 100644 --- a/releasenotes.md +++ b/releasenotes.md @@ -1,7 +1,9 @@ # C3C Release Notes ## 0.7.8 Change list + ### Changes / improvements +- Improve multiline string parser inside compiler #2552 ### Fixes - `Foo.is_eq` would return false if the type was a `typedef` and had an overload, but the underlying type was not comparable. diff --git a/src/compiler/compiler_internal.h b/src/compiler/compiler_internal.h index 5eebb1d04..0c0d03a3e 100644 --- a/src/compiler/compiler_internal.h +++ b/src/compiler/compiler_internal.h @@ -38,7 +38,6 @@ typedef uint16_t FileId; #define MAX_HASH_SIZE (512 * 1024 * 1024) #define INVALID_SPAN ((SourceSpan){ .row = 0 }) #define MAX_SCOPE_DEPTH 0x100 -#define MAX_STRING_BUFFER 0x10000 #define INITIAL_SYMBOL_MAP 0x10000 #define INITIAL_GENERIC_SYMBOL_MAP 0x1000 #define MAX_INCLUDE_DIRECTIVES 2048 diff --git a/src/compiler/parse_expr.c b/src/compiler/parse_expr.c index 7557217b2..272ba60c4 100644 --- a/src/compiler/parse_expr.c +++ b/src/compiler/parse_expr.c @@ -1347,7 +1347,7 @@ static Expr *parse_ct_arg(ParseContext *c, Expr *left, SourceSpan lhs_start) * identifier ::= CONST_IDENT | IDENT * Note: if the identifier is "return" (only possible in doc lexing "mode"), create an EXPR_RETVAL instead. */ -static Expr *parse_identifier(ParseContext *c, Expr *left, SourceSpan lhs_start) +static Expr *parse_identifier(ParseContext *c, Expr *left, SourceSpan lhs_start UNUSED) { ASSERT(!left && "Unexpected left hand side"); if (symstr(c) == kw_return) @@ -1773,7 +1773,7 @@ EXIT: return expr_int; } -Expr *parse_integer(ParseContext *c, Expr *left, SourceSpan lhs_start) +Expr *parse_integer(ParseContext *c, Expr *left UNUSED, SourceSpan lhs_start UNUSED) { return parse_integer_expr(c, false); } @@ -1984,37 +1984,61 @@ static Expr *parse_double(ParseContext *c, Expr *left, SourceSpan lhs_start) bool parse_joined_strings(ParseContext *c, const char **str_ref, size_t *len_ref) { + if (str_ref) *str_ref = NULL; const char *str = symstr(c); size_t len = c->data.strlen; advance_and_verify(c, TOKEN_STRING); - if (!str_ref) scratch_buffer_append(str); - // This is wasteful for adding many tokens together - // and can be optimized. + // Simple string optimization. + if (str_ref && c->tok != TOKEN_STRING) + { + *str_ref = str; + *len_ref = len; + return true; + } + // Now handle multiple strings + if (str_ref) + { + scratch_buffer_clear(); + } + scratch_buffer_append_len(str, len); + + // Skip EOL for contracts if (tok_is(c, TOKEN_DOCS_EOL) && peek(c) == TOKEN_STRING) advance(c); while (tok_is(c, TOKEN_STRING)) { // Grab the token. size_t next_len = c->data.strlen; + len += next_len; if (!next_len) { // Zero length so just continue. advance_and_verify(c, TOKEN_STRING); continue; } - if (!str_ref) + str = symstr(c); + // We might overrun the buffer with this, so then we need to do a copy. + if (!scratch_buffer_may_append(next_len)) { - scratch_buffer_append(symstr(c)); - } - else - { - // Create new string and copy. - char *buffer = malloc_string(len + next_len + 1); - memcpy(buffer, str, len); - memcpy(buffer + len, symstr(c), next_len); - len += next_len; - buffer[len] = '\0'; - str = buffer; + // If it is not imperative we keep it, we skip here. + if (!str_ref) goto ADVANCE; + if (!*str_ref) + { + *str_ref = scratch_buffer_copy(); + } + else + { + *str_ref = str_cat_len(*str_ref, len - scratch_buffer.len - next_len, scratch_buffer.str, scratch_buffer.len); + } + scratch_buffer_clear(); + // It might still overrun if it's too big: + if (!scratch_buffer_may_append(next_len)) + { + *str_ref = str_cat_len(*str_ref, len - next_len, str, next_len); + goto ADVANCE; + } } + scratch_buffer_append_len(str, next_len); +ADVANCE:; advance_and_verify(c, TOKEN_STRING); if (tok_is(c, TOKEN_DOCS_EOL) && peek(c) == TOKEN_STRING) advance(c); } @@ -2023,9 +2047,17 @@ bool parse_joined_strings(ParseContext *c, const char **str_ref, size_t *len_ref PRINT_ERROR_HERE("String exceeded max size."); return false; } + // If we don't keep it, we're done. if (!str_ref) return true; - ASSERT(str); - *str_ref = str; + + if (*str_ref) + { + *str_ref = str_cat_len(*str_ref, len - scratch_buffer.len, scratch_buffer.str, scratch_buffer.len); + } + else + { + *str_ref = scratch_buffer_copy(); + } *len_ref = len; return true; } @@ -2064,7 +2096,7 @@ static Expr *parse_bool(ParseContext *c, Expr *left, SourceSpan lhs_start) /** * Parse 'null', creating a const void* with zero address. */ -static Expr *parse_null(ParseContext *c, Expr *left, SourceSpan lhs_start) +static Expr *parse_null(ParseContext *c, Expr *left, SourceSpan lhs_start UNUSED) { ASSERT(!left && "Had left hand side"); Expr *number = EXPR_NEW_TOKEN(EXPR_CONST); diff --git a/src/utils/lib.h b/src/utils/lib.h index ec3243aa1..80add0754 100644 --- a/src/utils/lib.h +++ b/src/utils/lib.h @@ -44,7 +44,8 @@ typedef struct { char* cl_include_env; } WindowsSDK; -#define MAX_STRING_BUFFER 0x10000 +// Keep a 4 MB text buffer +#define MAX_STRING_BUFFER (1024 * 1024 * 4) #define COMPILER_SUCCESS_EXIT -1000 NORETURN void exit_compiler(int exit_value); extern jmp_buf on_err_jump; @@ -144,6 +145,7 @@ char *str_trim(char *str); const char *str_trim_start(const char *str); void str_trim_end(char *str); char *str_cat(const char *a, const char *b); +char *str_cat_len(const char *a, size_t a_len, const char *b, size_t b_len); // Search a list of strings and return the matching element or -1 if none found. int str_findlist(const char *value, unsigned count, const char** elements); // Sprintf style, saved to an arena allocated string @@ -172,6 +174,7 @@ void slice_trim(StringSlice *slice); void scratch_buffer_clear(void); void scratch_buffer_append(const char *string); void scratch_buffer_append_len(const char *string, size_t len); +bool scratch_buffer_may_append(size_t len); void scratch_buffer_append_char(char c); void scratch_buffer_append_in_quote(const char *string); void scratch_buffer_append_char_repeat(char c, size_t count); diff --git a/src/utils/stringutils.c b/src/utils/stringutils.c index 11eaf8894..09dbada47 100644 --- a/src/utils/stringutils.c +++ b/src/utils/stringutils.c @@ -340,6 +340,15 @@ char *str_cat(const char *a, const char *b) return buffer; } +char *str_cat_len(const char *a, size_t a_len, const char *b, size_t b_len) +{ + char *buffer = malloc_string(a_len + b_len + 1); + memcpy(buffer, a, a_len); + memcpy(buffer + a_len, b, b_len); + buffer[a_len + b_len] = '\0'; + return buffer; +} + char *str_dup(const char *str) { return str_copy(str, strlen(str)); @@ -358,9 +367,14 @@ void scratch_buffer_clear(void) scratch_buffer.len = 0; } +bool scratch_buffer_may_append(size_t len) +{ + return len + scratch_buffer.len < MAX_STRING_BUFFER - 1; +} + void scratch_buffer_append_len(const char *string, size_t len) { - if (len + scratch_buffer.len > MAX_STRING_BUFFER - 1) + if (!scratch_buffer_may_append(len)) { error_exit("Scratch buffer size (%d chars) exceeded", MAX_STRING_BUFFER - 1); }