Files
c3c/resources/examples/acornvm/lexer.c3

692 lines
18 KiB
Plaintext

module acorn::lex;
/** Lexer for Acorn compiler
*
* @file
*
* This source file is part of avm - Acorn Virtual Machine.
* See Copyright Notice in avm.h
*/
/**
* Rough test for whether a character counts as a Unicode letter:
* every code point above 0xA0 is accepted outright, anything else is
* left to isalpha() to decide.
*/
func bool isualpha(Auchar c) @inline
{
	// Checked first so that isalpha() is only consulted for values up to 0xA0
	bool aboveLatinSpace = c > 0xA0;
	return aboveLatinSpace || isalpha(c);
}
/**
* Test whether a character is one of the ASCII decimal digits '0' through '9'
*/
func bool isudigit(Auchar c) @inline
{
	if (c < '0') return false;
	return c <= '9';
}
/**
* Build a LexInfo, the lexer context for one source program.
* th is the value handed to the allocator and stored as lex.th, src is the
* program text and url is stored as lex.url.
* The new lexer is written through dest and is also the return value.
*/
func Value new(Value th, Value *dest, Value src, Value url)
{
	// Allocate the lexer object
	LexInfo *lex = mem::new(th, LexEnc, sizeof(LexInfo));

	// Values the lexer holds on to; each stored value is passed to mem::markChk
	lex.th = th;
	lex.token = aNull;
	lex.source = src;
	mem::markChk(th, lex, src);
	lex.url = url;
	mem::markChk(th, lex, url);

	// Starting position: begin after an initial UTF-8 byte-order mark (EF BB BF) if there is one
	// TODO
	lex.linebeg = getSize(src) >= 3 && 0 == strncmp("\xEF\xBB\xBF", toStr(src), 3) ? 3 : 0;
	lex.bytepos = lex.linebeg;
	lex.linenbr = 1;

	// Indentation state
	lex.newindent = 0;
	lex.curindent = 0;
	lex.newprogram = true;
	lex.newline = false;
	lex.insertSemi = false;
	lex.undentcont = false;

	lex.optype = 0;

	*dest = cast(lex, Value);
	return *dest;
}
/** Return the current unicode character whose UTF-8 bytes start at lex->bytepos.
* The position itself is not changed.
*
* The lead byte selects the sequence length and supplies the top bits:
* 1111xxxx -> 4 bytes (low 3 bits kept), 111xxxxx -> 3 bytes (low 4 bits),
* 11xxxxxx -> 2 bytes (low 5 bits), 0xxxxxxx -> 1 byte (low 7 bits).
* Any other lead byte (a stray continuation byte) is an error and yields 0.
* Following bytes contribute 6 bits each, but only when they have the
* continuation form 10xxxxxx; a following byte of any other form is skipped
* without contributing. */
func Auchar LexInfo.thischar(LexInfo* lex)
{
	byte *src = &toStr(lex.source)[lex.bytepos];
	int nbytes;
	Auchar chr;
	// Get info from first UTF-8 byte
	if ((*src&0xF0) == 0xF0) { nbytes=4; chr = *src&0x07;}
	else if ((*src&0xE0) == 0xE0) {nbytes=3; chr = *src&0x0F;}
	else if ((*src&0xC0) == 0xC0) {nbytes=2; chr = *src&0x1F;}
	else if ((*src&0x80) == 0x00) {nbytes=1; chr = *src&0x7F;}
	else {nbytes=1; chr = 0;} // error
	// Obtain remaining bytes
	while (--nbytes)
	{
		src++;
		// Grouping is spelled out rather than left to operator precedence:
		// with C-style precedence the bare form would read as
		// `*src & (0xC0 == 0x80)` and `(chr << (6 + *src)) & 0x3F`.
		// This is the same form nextchar uses.
		if ((*src & 0xC0) == 0x80) chr = (chr << 6) + (*src & 0x3F);
	}
	return chr;
}
/** Return the unicode character that FOLLOWS the one whose UTF-8 bytes start at lex->bytepos.
* This is a one-character look-ahead: lex->bytepos is left untouched.
* A malformed lead byte is treated as a one-byte character when stepping over
* the current character, and decodes to 0 when it begins the following one. */
func Auchar LexInfo.nextchar(LexInfo* lex)
{
	const char *p = &toStr(lex->source)[lex->bytepos];

	// Step over the current character; its lead byte gives its width
	int width = 1; // plain ASCII, or a malformed lead byte
	if ((*p&0xF0) == 0xF0) width = 4;
	else if ((*p&0xE0) == 0xE0) width = 3;
	else if ((*p&0xC0) == 0xC0) width = 2;
	p += width;

	// Decode the lead byte of the following character
	Auchar result;
	int trailing; // number of bytes expected after the lead byte
	if ((*p&0xF0) == 0xF0) {
		trailing = 3;
		result = *p&0x07;
	}
	else if ((*p&0xE0) == 0xE0) {
		trailing = 2;
		result = *p&0x0F;
	}
	else if ((*p&0xC0) == 0xC0) {
		trailing = 1;
		result = *p&0x1F;
	}
	else if ((*p&0x80) == 0x00) {
		trailing = 0;
		result = *p&0x7F;
	}
	else {
		// error: not a valid lead byte
		trailing = 0;
		result = 0;
	}

	// Fold in 6 bits from every following byte that has the 10xxxxxx form
	for (; trailing > 0; trailing--) {
		p++;
		if ((*p&0xC0)==0x80)
			result = (result<<6) + (*p&0x3F);
	}
	return result;
}
/** Advance lex->bytepos over the unicode character whose UTF-8 bytes start there.
* Does nothing when the current byte is the terminating NUL, so the position
* never moves off the end-of-source marker. */
func void LexInfo.skipchar(LexInfo* lex)
{
	const char *lead = &toStr(lex->source)[lex->bytepos];
	if (*lead=='\0')
		return;

	// The lead byte tells how many bytes the character occupies
	int width = 1; // plain ASCII, or a malformed lead byte
	if ((*lead&0xF0) == 0xF0) width = 4;
	else if ((*lead&0xE0) == 0xE0) width = 3;
	else if ((*lead&0xC0) == 0xC0) width = 2;

	lex->bytepos += width;
}
/** Return true if at end of source, i.e. when the character reported by
* lex_thischar for the current position is the NUL terminator */
#define lex_isEOF(lex) (lex_thischar(lex) == '\0')
/** Scan past non-tokenized white space.
* Handle line indentation and continuation.
*
* Skips spaces, tabs, carriage returns, new lines, '#' comments (to end of
* line) and '###' ... '###' multi-line comments, tracking the line number,
* line start and the count of line-leading tabs as it goes.
*
* Returns true when it has produced an implied token in lex.token/lex.toktype
* (a ';' '{' or '}' derived from new lines and indentation changes). Returns
* false when the next character starts a real token; in that case tokbeg,
* toklinepos and tokline describe where that token begins. */
func bool LexInfo.scanWhite(LexInfo *lex)
{
	Value th = lex.th; // for vmlit
	// Insert semicolon as a token, if requested by implied closing brace
	if (lex.insertSemi)
	{
		lex.insertSemi = false;
		lex.toktype=Res_Token;
		lex.token=vmlit(SYM_SEMICOLON);
		return true;
	}
	// Ignore all forms of white space
	Auchar chr;
	bool lookForWhiteSpace = true;
	while (lookForWhiteSpace) {
		switch (chr=lex_thischar(lex)) {
		// Skip past spaces and tabs
		case ' ':
		case '\t':
		case '\r':
			lex_skipchar(lex);
			break;
		// Skip past new line
		case '\n':
			lex->linenbr++;
			// linebeg records the position of the '\n' itself
			lex->linebeg = lex->bytepos;
			lex->newline = true;
			lex_skipchar(lex);
			// Count line-leading tabs
			lex->newindent = 0;
			while (lex_thischar(lex)=='\t') {
				lex->newindent++;
				lex_skipchar(lex);
			}
			// Handle continuation.
			if (lex_thischar(lex)=='\\') {
				// Undenting requires we spawn some semi-colons and right braces
				if (lex->newindent < lex->curindent)
					lex->undentcont = true;
				else {
					lex->newline = false;
					// Pretend indent did not change for extra-indented continuation
					if (lex->newindent > lex->curindent)
						lex->newindent = lex->curindent;
				}
				lex_skipchar(lex);
			}
			break;
		// Skip comment starting with '#' until end of line
		case '#':
			{
				const char *scanp = &toStr(lex->source)[lex->bytepos];
				if (strncmp("###", scanp, 3)) {
					// Inline comment skips to end of line
					while (!lex_isEOF(lex) && lex_thischar(lex)!='\n')
						lex_skipchar(lex);
					break;
				}
				// Multi-line comment goes until next '###'
				scanp+=3;
				while (*scanp && 0!=strncmp("###", scanp, 3)) {
					if (*scanp=='\n') {
						lex->linenbr++;
						// Move the line start along with the line number (to the
						// position of this '\n', as the new-line case above does),
						// so toklinepos stays relative to the current line for a
						// token that follows the closing '###' on the same line.
						lex->linebeg = lex->bytepos + (scanp - &toStr(lex->source)[lex->bytepos]);
					}
					scanp++;
				}
				// Step over the closing '###' unless the source ended first
				if (*scanp)
					scanp+=3;
				lex->bytepos += scanp - &toStr(lex->source)[lex->bytepos];
			}
			break;
		default:
			lookForWhiteSpace = false;
			break;
		}
	}
	// Mark start of a real token
	lex->tokbeg = lex->bytepos;
	lex->toklinepos = lex->tokbeg - lex->linebeg;
	lex->tokline = lex->linenbr;
	// We now know the next character starts a real token
	// But first, we must handle insertion of ; { and } characters
	// depending on the indentation changes and newline flag
	// Handle increasing indentation
	if (lex->newindent > lex->curindent) {
		lex->toktype=Res_Token;
		lex->token=vmlit(SymLBrace);
		// One level per call: a jump of several levels yields one '{' per call
		lex->curindent++;
		lex->newline = false;
		return true;
	}
	// Do not generate leading ';'
	if (lex->newprogram)
		lex->newprogram = lex->newline = false;
	// End previous line's statement with a ';'
	if (lex->newline) {
		lex->toktype=Res_Token;
		lex->token=vmlit(SymSemicolon);
		lex->newline = false;
		return true;
	}
	// Ensure end-of-file flushes all indent levels to 0
	if (lex_isEOF(lex))
		lex->newindent = 0;
	// Handle decreasing indentation
	if (lex->newindent < lex->curindent) {
		lex->toktype=Res_Token;
		lex->token=vmlit(SymRBrace);
		lex->curindent--;
		if (lex->undentcont && lex->newindent==lex->curindent)
			lex->undentcont = false; // Continued line at right indent now. No semi-colon.
		else
			lex->insertSemi = true; // Insert semi-colon after implied closing brace
		return true;
	}
	return false;
}
/** Treat the end of the source program as a token of its own.
* Returns true (with toktype set to Eof_Token) only when positioned at end of source. */
bool lexScanEof(LexInfo *lex) {
	if (lex_isEOF(lex)) {
		lex->toktype = Eof_Token;
		return true;
	}
	return false;
}
/** Tokenize an integer or floating point number.
*
* Returns false (consuming nothing) unless the current character is 0-9.
* Otherwise scans the literal, leaves lex->bytepos just past it, sets
* lex->toktype to Lit_Token and lex->token to the integer or float value,
* and returns true.
*
* A leading "0x"/"0X" selects base 16; a leading "0." or a '.' after decimal
* digits switches to floating point. The floating point value itself comes
* from atof() applied to the source text starting at lex->tokbeg. */
bool lexScanNumber(LexInfo *lex) {
	// A number token's first character is always 0-9
	// We cannot handle negative sign here, as it might be a subtraction
	if (!isudigit(lex_thischar(lex)))
		return false;
	int base = 10; // 10 or 16 while scanning an integer, -1 once it is known to be a float
	bool exp = false; // true once an exponent marker has been consumed
	int digval = 0;
	long nbrval = 0;
	// A leading zero may indicate a non-base 10 number
	if (lex_thischar(lex)=='0') {
		lex_skipchar(lex);
		if (toupper(lex_thischar(lex))=='X') {base = 16; lex_skipchar(lex);}
		// else if (toupper(lex_thischar(lex))=='B') {base = 2; lex_skipchar(lex);}
		else if (toupper(lex_thischar(lex))=='.') {base = -1; lex_skipchar(lex);}
		// else base = 8;
	}
	// Validate and process remaining numeric digits
	while (1) {
		// Handle characters in a suspected integer
		if (base>0) {
			// Decimal point means it is floating point after all
			if (base==10 && lex_thischar(lex)=='.') {
				// If next character is a symbol/range, treat '.' as method operator instead
				Auchar nchr = lex_nextchar(lex);
				if (isualpha(nchr) || nchr=='_' || nchr=='$' || nchr=='(' || nchr=='\'' || nchr=='.')
					break;
				lex_skipchar(lex);
				base = -1;
				continue;
			}
			// Extract a number digit value from the character
			if (isudigit(lex_thischar(lex)))
				digval = lex_thischar(lex)-'0';
			else if (isalpha(lex_thischar(lex)))
				digval = toupper(lex_thischar(lex))-'A'+10;
			else
				break;
			// Ensure digit is within base, then process
			// (in base 10 every letter maps to 10 or more and so ends the number here)
			if (digval>=base)
				break;
			nbrval = nbrval*base + digval;
			lex_skipchar(lex);
		}
		// Validate characters in a floating point number
		else {
			// Only one exponent allowed
			if (!exp && toupper(lex_thischar(lex))=='E') {
				exp = true;
				lex_skipchar(lex);
				// Optional exponent sign. '+' must be consumed as well as '-':
				// atof() below reads a signed exponent straight from the source,
				// so leaving the '+' behind would have "1.0e+5" produce 100000
				// and then lex "+5" a second time as separate tokens.
				if (lex_thischar(lex)=='-' || lex_thischar(lex)=='+')
					lex_skipchar(lex);
				continue;
			}
			if (!isudigit(lex_thischar(lex)))
				break;
			lex_skipchar(lex);
		}
	}
	// Set value and type
	if (base>=0) {
		lex->token = anInt(nbrval);
		lex->toktype = Lit_Token;
	}
	else {
		lex->token = aFloat((Afloat) atof(&toStr(lex->source)[lex->tokbeg]));
		lex->toktype = Lit_Token;
	}
	return true;
}
/** List of all reserved names (excluding literals).
* lexScanName compares each scanned name against every entry (via vmlit) and
* reports a match as Res_Token. The literal names null, false and true are
* checked separately there, which is why they are not listed here. */
static VmLiterals ReservedNames[] = {
SymAnd,
SymAsync,
SymBaseurl,
SymBreak,
SymContext,
SymContinue,
SymDo,
SymEach,
SymElse,
SymElif,
SymIf,
SymIn,
SymInto,
SymLocal,
SymMatch,
SymNot,
SymOr,
SymReturn,
SymSelf,
SymSelfMeth,
SymThis,
SymUsing,
SymWait,
SymWhile,
SymWith,
SymYield
};
/** Tokenize a name.
* Returns false (consuming nothing) when the current character cannot start a name.
* Otherwise the token type becomes one of:
* - Lit_Token for null, false and true (token holds the literal value),
* - Res_Token for a name found in ReservedNames,
* - Name_Token for anything else (e.g., a variable). */
bool lexScanName(LexInfo *lex) {
	// A name starts with a letter (per isualpha), '_' or '$'
	Auchar chr = lex_thischar(lex);
	bool startsName = isualpha(chr) || chr=='_' || chr=='$';
	if (!startsName)
		return false;

	// Consume the body of the name: letters, digits, '_' and '$'
	lex_skipchar(lex);
	for (;;) {
		chr = lex_thischar(lex);
		if (!(chr=='_' || chr=='$' || isudigit(chr) || isualpha(chr)))
			break;
		lex_skipchar(lex);
	}
	// A single trailing '?' belongs to the name too
	if (chr=='?')
		lex_skipchar(lex);

	// Turn the scanned text into a symbol, which becomes the token
	newSym(lex->th, &lex->token, &toStr(lex->source)[lex->tokbeg], lex->bytepos - lex->tokbeg);
	mem_markChk(lex->th, lex, lex->token);

	// Names of literals are replaced by the literal value itself
	Value th = lex->th;
	lex->toktype = Lit_Token;
	if (lex->token == vmlit(SymNull)) {
		lex->token = aNull;
		return true;
	}
	if (lex->token == vmlit(SymFalse)) {
		lex->token = aFalse;
		return true;
	}
	if (lex->token == vmlit(SymTrue)) {
		lex->token = aTrue;
		return true;
	}

	// Look the symbol up in the reserved-name table
	VmLiterals *entry = ReservedNames;
	VmLiterals *tableEnd = &ReservedNames[sizeof(ReservedNames)/sizeof(VmLiterals)];
	while (entry<tableEnd) {
		if (lex->token == vmlit(*entry)) {
			lex->toktype = Res_Token;
			return true;
		}
		entry++;
	}

	// Neither literal nor reserved: an ordinary name
	lex->toktype = Name_Token;
	return true;
}
/** Tokenize a string (double quotes) or symbol (single quotes)
* Handle escape sequences. Ignore line-end and leading tabs for multi-line. */
bool lexScanString(LexInfo *lex) {
// String token's first character should be a quote mark
Auchar quotemark = lex_thischar(lex);
if (!(quotemark=='"' || quotemark=='\'' ))
return false;
lex_skipchar(lex);
// Create a string value to place the contents into
const char *begp = &toStr(lex->source)[lex->bytepos];
const char *scanp = strchr(begp, quotemark); // An estimate, as it may not be the end
Value buildstr = pushStringl(lex->th, aNull, NULL, scanp==NULL? strlen(begp) : scanp-begp);
// Repetitively scan source looking for various delimiters
scanp = begp;
while (*scanp && *scanp!=quotemark) {
// Process any escape sequences within the string
if (*scanp=='\\') {
// Copy over string segment up to the escape sequence
if (scanp-begp > 0)
strAppend(lex->th, buildstr, begp, scanp-begp);
// Process escape sequence
switch (*++scanp) {
case 'n': strAppend(lex->th, buildstr, "\n", 1); scanp++; break;
case 'r': strAppend(lex->th, buildstr, "\r", 1); scanp++; break;
case 't': strAppend(lex->th, buildstr, "\t", 1); scanp++; break;
case 'u': case 'U':
{
// Convert a hexadecimal string of cnt digits to a unicode character
Auchar unichar=0;
int cnt = *scanp=='u'? 4 :8;
if (*(scanp+1)=='+')
scanp++;
while (*++scanp && cnt--) {
if (isudigit(*scanp))
unichar = unichar*16 + *scanp -'0';
if (isalpha(*scanp) && toupper(*scanp)<='F')
unichar = unichar*16 + toupper(*scanp)-'A'+10;
}
// Encode an unicode character into UTF-8 bytes
char utf8str[8];
char *utfp=&utf8str[sizeof(utf8str)-1];
*utfp-- = '\0'; // make it a sizeable string
if (unichar < 0x7f) {
*utfp = (char)unichar;
strAppend(lex->th, buildstr, utfp, 1);
}
else {
// multi-byte encoding, byte by byte backwards
int cnt=0;
while (unichar) {
cnt++;
char byt = unichar & 0x3f;
unichar = unichar >> 6;
// Put appropriate flags if it is the first byte
if (unichar==0) {
switch (cnt) {
case 2: *utfp = byt | 0xC0; break;
case 3: *utfp = byt | 0xE0; break;
case 4: *utfp = byt | 0xF0; break;
case 5: *utfp = byt | 0xF8; break;
case 6: *utfp = byt | 0xFC; break;
}
}
else
*utfp-- = byt | 0x80;
}
strAppend(lex->th, buildstr, utfp, cnt);
}
}
break;
default: strAppend(lex->th, buildstr, scanp, 1); scanp++; break;
}
begp=scanp;
}
// Ignore line end and line leading tabs
else if (*scanp=='\r' || *scanp=='\n') {
// Copy over string segment up to the escape sequence
if (scanp-begp > 0)
strAppend(lex->th, buildstr, begp, scanp-begp);
// Ignore line end and leading tabs
while (*scanp=='\r' || *scanp=='\n' || *scanp=='\t') {
if (*scanp=='\n')
lex->linenbr++;
scanp++;
}
begp=scanp;
}
// Otherwise process rest of string
else
scanp++;
}
// Copy over rest of string segment
if (scanp-begp > 0)
strAppend(lex->th, buildstr, begp, scanp-begp);
// Update lex position
if (*scanp==quotemark)
*scanp++;
lex->bytepos += scanp - &toStr(lex->source)[lex->bytepos];
// Create string (or symbol)
lex->toktype = Lit_Token;
if (quotemark=='"')
lex->token = buildstr;
else
newSym(lex->th, &lex->token, toStr(buildstr), getSize(buildstr));
mem_markChk(lex->th, lex, lex->token);
popValue(lex->th); // buildstr
return true;
}
/** Tokenize a token that begins with '@'.
* Returns false (consuming nothing) when the current character is not '@'.
* When the '@' is followed by a quote mark, '(' or a character at or below
* space, the token is the reserved '@' symbol on its own. Otherwise the text
* up to the next such low character is taken as a literal url, a value is
* built from it with a call involving SymNew and TypeResc, and the result
* becomes a Url_Token. */
bool lexScanResource(LexInfo *lex) {
	if (lex_thischar(lex)!='@')
		return false;
	Value th = lex->th;
	lex_skipchar(lex);

	// What follows decides between the bare '@' and a url literal
	Auchar delim = lex_thischar(lex);
	bool bareAt = delim<=' ' || delim=='(' || delim=='"' || delim=='\'';
	if (bareAt) {
		lex->toktype = Res_Token;
		lex->token = vmlit(SymAt);
		return true;
	}

	// The url runs from here to the next space, tab, cr, lf, eof, etc.
	const char *urlbeg = &toStr(lex->source)[lex->bytepos];
	const char *urlend = urlbeg;
	do {
		urlend++;
	} while ((unsigned char)(*urlend)>' ');
	lex->bytepos += urlend - urlbeg;

	// Create +Resource from literal url, and return it as token
	pushValue(th, vmlit(SymNew));
	pushValue(th, vmlit(TypeResc));
	pushStringl(th, aNull, urlbeg, urlend-urlbeg);
	pushValue(th, lex->url);
	getCall(th, 3, 1);
	lex->token = getFromTop(th, 0);
	mem_markChk(lex->th, lex, lex->token);
	popValue(th);
	lex->toktype = Url_Token;
	return true;
}
/** Tokenize a punctuation-oriented operator symbol.
* Always consumes at least one character and always returns true; a second
* or third character is consumed when together they form a recognized
* multi-character operator. The token is a symbol made from the consumed
* text, typed as Res_Token. */
bool lexScanOp(LexInfo *lex) {
	// Two-character operators that never extend to a third character
	static const char *const pairOps[] = {
		">=", "!=", "~~", "<<", ">>", "+=", "-=", "*=", "/=",
		".:", "::", ":=", "&&", "||", "**", ".&", "+[", "*["
	};

	const char *begp = &toStr(lex->source)[lex->bytepos];
	Auchar ch1 = lex_thischar(lex);
	lex_skipchar(lex);
	Auchar ch2 = lex_thischar(lex);

	// "..", "==" and "<=" may grow into "...", "===" and "<=>"
	if ((ch1=='.' && ch2=='.') || (ch1=='=' && ch2=='=') || (ch1=='<' && ch2=='=')) {
		Auchar third = ch1=='<' ? '>' : ch1;
		if (third==lex_nextchar(lex))
			lex_skipchar(lex);
		lex_skipchar(lex);
	}
	else {
		// Otherwise take the second character only for a known pair
		for (unsigned int i = 0; i < sizeof(pairOps)/sizeof(pairOps[0]); i++) {
			if (ch1==(Auchar)pairOps[i][0] && ch2==(Auchar)pairOps[i][1]) {
				lex_skipchar(lex);
				break;
			}
		}
	}

	newSym(lex->th, &lex->token, begp, &toStr(lex->source)[lex->bytepos]-begp);
	mem_markChk(lex->th, lex, lex->token);
	lex->toktype = Res_Token;
	return true;
}
/* Get the next token.
 * Tries the scanners in a fixed order - white space/indentation, end of file,
 * number, name, string, resource, operator - and stops at the first one that
 * returns true. The result is read back from the lexer state (the logging
 * below reads lex->toktype and lex->token).
 * When COMPILERLOG is defined, literal, url, name and reserved tokens are
 * logged in serialized form; other token types are not logged. */
func void LexInfo.getNextToken(LexInfo *lex)
{
// Scan until we find a token
// (the && chain short-circuits, so a scanner only runs if every earlier one returned false)
// NOTE(review): the scanners are called as methods (lex.scanEof() etc.) while the
// definitions visible in this file are still named lexScanEof etc. - confirm the mapping
(!lex.scanWhite()
&& !lex.scanEof()
&& !lex.scanNumber()
&& !lex.scanName()
&& !lex.scanString()
&& !lex.scanResource()
&& !lex.scanOp());
#ifdef COMPILERLOG
// Each case pushes a serialized form of the token, logs it, then pops it again
switch (lex->toktype) {
case Lit_Token: {
pushSerialized(lex->th, lex->token);
vmLog("Literal token: %s", toStr(getFromTop(lex->th, 0)));
popValue(lex->th);
} break;
case Url_Token: {
pushSerialized(lex->th, lex->token);
vmLog("Literal url token: %s", toStr(getFromTop(lex->th, 0)));
popValue(lex->th);
} break;
case Name_Token: {
pushSerialized(lex->th, lex->token);
vmLog("Name token: %s", toStr(getFromTop(lex->th, 0)));
popValue(lex->th);
} break;
case Res_Token: {
pushSerialized(lex->th, lex->token);
vmLog("Reserved token: %s", toStr(getFromTop(lex->th, 0)));
popValue(lex->th);
} break;
}
#endif
}
/* Test whether the current token is the reserved symbol spelled sym.
 * The lexer position is not changed. */
bool lexMatch(LexInfo *lex, const char *sym) {
	// Only reserved tokens can match; the text is compared only for those
	if (lex->toktype!=Res_Token)
		return false;
	return strcmp(sym, toStr(lex->token))==0;
}
/* Test whether the current token is the reserved symbol spelled sym.
 * On a match, move on to the next token and return true;
 * otherwise leave the lexer where it is and return false. */
bool lexMatchNext(LexInfo *lex, const char *sym) {
	if (lex->toktype!=Res_Token)
		return false;
	if (strcmp(sym, toStr(lex->token))!=0)
		return false;
	lexGetNextToken(lex);
	return true;
}
/* Log a compiler message.
 * The message is prefixed with the url being compiled and the line and
 * in-line position recorded for the current token (tokline, toklinepos). */
void lexLog(LexInfo *lex, const char *msg) {
vmLog("While compiling %s(%d:%d): %s", toStr(lex->url), lex->tokline, lex->toklinepos, msg);
}
#ifdef __cplusplus
} // extern "C"
} // namespace avm
#endif