Pimped Auto-Generated Code
Since Session 21 the source code for strTokenKind() and the enum constants for enum TokenKind are auto-generated. This approach can be extended. For parsing left associative binary operators auxiliary functions tokenKindPrec() and makeBinaryExprKind() were implemented manually in the last session. These functions also can be auto-generated.
But this is not the end of the story. Also code for recognizing tokens can be auto-generated. Tools like Flex can generate a complete lexer. Our pimped generator will not go that far. But it will generate code to recognize punctuators (e.g. '=', '==', etc.) and keywords (e.g. 'for', 'while', etc.).
Updated Makefile, tokenkind.txt and xgen_tokenkind.c
Here are the files that need to be replaced:
CPPFLAGS += -Wall -Wcast-qual
LDFLAGS += -lm
#
# patch: If user has not defined CC and default value does not exist use gcc
#
ifeq ($(origin CC),default)
cc_check := $(shell $(CC) -v > /dev/null 2>&1 && echo "sane")
ifneq ($(strip $(cc_check)),sane)
CC := gcc
endif
endif
#
# List of files that need to be generated before compilation and rules to
# generate them
#
generated_files := \
gen_makebinaryexprkind.c \
gen_strtokenkind.c \
gen_tokenkind.h \
gen_tokenkindprec.c \
gen_parsepunctuator.c \
gen_parsekeyword.c
# NOTE(review): a rule with several targets runs its recipe once per
# out-of-date target, so xgen_tokenkind may be invoked multiple times
# (notably under "make -j"). GNU Make 4.3 grouped targets ("&:") would
# run it exactly once -- confirm the required make version before changing.
$(generated_files) : tokenkind.txt xgen_tokenkind
	./xgen_tokenkind $<
#
# Define list of source files, object files, targets, etc
#
# all source files
src :=\
$(filter-out gen_%,\
$(wildcard *.c))
# all object files
obj :=\
$(patsubst %.c,%.o,\
$(src))
# all targets (test programs)
target :=\
$(filter xtest%,\
$(patsubst %.c,%,\
$(src)))
# all generators for source files
generator :=\
$(filter xgen%,\
$(patsubst %.c,%,\
$(src)))
# objects that are required by the targets
lib.o :=\
$(filter-out xtest% xgen%,\
$(obj))
# dependency file that will be generated by compiler
deps :=\
$(patsubst %,%.d,\
$(src))
# dependency file leftovers of gone source files
obsolete.deps:=\
$(filter-out $(deps),\
$(wildcard *.c.d))
#
# Build rules
#
.PHONY: all
.DEFAULT_GOAL := all
all: $(target) $(obj) $(generator)
# rule for removing obsolete dependency files
.PHONY: $(obsolete.deps)
$(obsolete.deps) :
	$(RM) $(obsolete.deps)
# delete implicit rule for building an executable directly from its source file
% : %.c
# rule for source file generators
xgen% : xgen%.c
	$(CC) -o $@ $^ $(LDFLAGS)
# our rule: to build target link its object file against library object files
%: %.o $(lib.o) | $(obsolete.deps)
	$(CC) -o $@ $^ $(LDFLAGS)
# our rule to build objects: also generate a dependency file
# NOTE(review): $(generated_files) is order-only (after "|"), so objects are
# not rebuilt when generated files change; the compiler-written .d files are
# what track that dependency after the first build.
%.o: %.c | $(obsolete.deps) $(generated_files)
	$(CC) -c $(CPPFLAGS) $(CFLAGS) -MT $@ -MMD -MP -MF $<.d $<
.PHONY: clean
clean:
	$(RM) $(target) $(generator) $(obj) $(deps) $(obsolete.deps)
	$(RM) $(generated_files)
#
# Include dependencies (if already generated)
#
-include $(deps)
#include <assert.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Print a short usage hint to stderr and terminate with failure status. */
void
usage(const char *prg)
{
    fprintf(stderr, "usage: %s tokenkind.txt\n", prg);
    exit(1);
}
/*
 * Similar to Perl: chops off the trailing newline (actually '\n' on Unix and
 * '\r\n' on DOS) by overwriting it with a null byte. A line without a
 * trailing newline (e.g. the last line of a file) is left untouched.
 */
void
chopNl(char *line, size_t len)
{
    // BUG FIX: the old code read line[len - 2] unconditionally (out of
    // bounds for len < 2) and clobbered the last real character of a line
    // that had no trailing newline at all.
    if (len == 0 || line[len - 1] != '\n') {
        return;
    }
    if (len >= 2 && line[len - 2] == '\r') {
        line[len - 2] = 0; // DOS line ending "\r\n"
    } else {
        line[len - 1] = 0; // Unix line ending "\n"
    }
}
/* True for horizontal white space; newline is deliberately excluded. */
bool
isSpace(int ch)
{
    switch (ch) {
    case ' ':
    case '\r':
    case '\f':
    case '\v':
    case '\t':
        return true;
    default:
        return false;
    }
}
/* True for characters that may start an identifier or keyword. */
static bool
isLetter(int ch)
{
    bool lower = ch >= 'a' && ch <= 'z';
    bool upper = ch >= 'A' && ch <= 'Z';
    return lower || upper || ch == '_';
}
/*
 * Split white space separated columns with format
 * [<tokenKind> [<tokenKindVal> [<prec> <exprKind>]]]
 * Columns are terminated in place with null bytes. Missing optional
 * columns are reported as null pointers (prec as 0). Inconsistent lines
 * (e.g. <prec> without <exprKind>) terminate the program.
 */
void
split(char *line, char **tokenKind, char **tokenKindVal, int *prec,
      char **exprKind)
{
    *tokenKind = line;
    *tokenKindVal = 0;
    *prec = 0;
    *exprKind = 0; // FIX: never leave this output dangling for the caller
    for (; *line && !isSpace(*line); ++line) {
    }
    if (!isSpace(*line)) {
        return; // line exhausted: only <tokenKind> present
    }
    *line++ = 0; // terminate <tokenKind>
    // advance to next column
    while (isSpace(*line)) {
        ++line;
    }
    if (!*line) {
        fprintf(stderr, "%s: <tokenKindVal> field expected\n", *tokenKind);
        exit(1);
    }
    *tokenKindVal = line;
    for (; *line && !isSpace(*line); ++line) {
    }
    if (!isSpace(*line)) {
        return; // only <tokenKind> and <tokenKindVal> present
    }
    *line++ = 0; // terminate <tokenKindVal>
    // advance to next column
    while (isSpace(*line)) {
        ++line;
    }
    if (sscanf(line, "%d", prec) != 1) {
        fprintf(stderr, "%s: prec field expected\n", *tokenKind);
        exit(1);
    }
    // skip over the <prec> column
    for (; *line && !isSpace(*line); ++line) {
    }
    // FIX: the old "while (isSpace(*++line))" incremented past the
    // terminating null byte when <exprKind> was missing, reading out of
    // bounds; test *line before stepping instead.
    while (isSpace(*line)) {
        ++line;
    }
    if (!*line) {
        fprintf(stderr, "%s: expression kind expected\n", *tokenKind);
        exit(1);
    }
    *exprKind = line;
}
/* Write indentLevel levels of indentation (four spaces each) to out. */
void
printIndent(FILE *out, int indentLevel)
{
    assert(indentLevel >= 0);
    // "%*s" with an empty string argument pads with exactly 'width' spaces
    fprintf(out, "%*s", 4 * indentLevel, "");
}
/*
 * printf-style output preceded by indentation; a null fmt emits only the
 * indentation. See https://cplusplus.com/reference/cstdarg/va_list/ for
 * details on how variable arguments can be handled platform independent.
 */
void
printCode(FILE *out, int indentLevel, const char *fmt, ...)
{
    printIndent(out, indentLevel);
    if (fmt) {
        va_list argp;
        va_start(argp, fmt);
        vfprintf(out, fmt, argp);
        va_end(argp);
    }
}
// Selector for the generated output files; used to index outFilename/outFile.
enum OutSelect {
    OUT_TOKEN_KIND,
    OUT_STR_TOKEN_KIND,
    OUT_TOKEN_KIND_PREC,
    OUT_MAKE_BINARY_EXPR_KIND,
    OUT_PARSE_PUNCTUATOR,
    OUT_PARSE_KEYWORD,
    NUM_OUT,
};
// Output file names, parallel to enum OutSelect (order must match).
const char *outFilename[NUM_OUT] = {
    "gen_tokenkind.h",
    "gen_strtokenkind.c",
    "gen_tokenkindprec.c",
    "gen_makebinaryexprkind.c",
    "gen_parsepunctuator.c",
    "gen_parsekeyword.c",
};
// Open handles, parallel to outFilename; null until openOutFiles() ran.
FILE *outFile[NUM_OUT];
/*
 * Close every output file that is currently open. Handles are reset to
 * null after closing, so the function is idempotent and a later call
 * (e.g. from an error path) cannot fclose() a handle twice.
 */
void
closeOutFiles(void)
{
    for (int i = 0; i < NUM_OUT; ++i) {
        if (outFile[i]) {
            fclose(outFile[i]);
            outFile[i] = NULL; // make a second call harmless
        }
    }
}
/*
 * Open every generated output file for writing. On failure report the
 * offending file, close whatever was opened so far and terminate.
 */
void
openOutFiles(void)
{
    for (int i = 0; i < NUM_OUT; ++i) {
        FILE *f = fopen(outFilename[i], "w");
        if (!f) {
            fprintf(stderr, "can not open output file %s\n", outFilename[i]);
            closeOutFiles();
            exit(1);
        }
        outFile[i] = f;
    }
}
/*
 * Write the fixed leading part of the selected output file: the opening of
 * the enum definition, or a function head up to and including its switch
 * statement. The punctuator and keyword files consist only of generated
 * fragments (spliced into getToken() via #include) and get no header.
 */
void
printHeader(enum OutSelect outSelect)
{
    FILE *out = outFile[outSelect];
    switch (outSelect) {
    case OUT_TOKEN_KIND:
        printCode(out, 0, "enum TokenKind\n");
        printCode(out, 0, "{\n");
        return;
    case OUT_STR_TOKEN_KIND:
        printCode(out, 0, "const char *\n");
        printCode(out, 0, "strTokenKind(enum TokenKind tokenKind)\n");
        printCode(out, 0, "{\n");
        printCode(out, 1, "switch (tokenKind) {\n");
        return;
    case OUT_TOKEN_KIND_PREC:
        printCode(out, 0, "static int\n");
        printCode(out, 0, "tokenKindPrec(enum TokenKind tokenKind)\n");
        printCode(out, 0, "{\n");
        printCode(out, 1, "switch (tokenKind) {\n");
        return;
    case OUT_MAKE_BINARY_EXPR_KIND:
        printCode(out, 0, "static enum ExprKind\n");
        printCode(out, 0, "makeBinaryExprKind(enum TokenKind tokenKind)\n");
        printCode(out, 0, "{\n");
        printCode(out, 1, "switch (tokenKind) {\n");
        return;
    case OUT_PARSE_PUNCTUATOR:
    case OUT_PARSE_KEYWORD:
        // plain code fragments: no header needed
        return;
    default:
        assert(0);
    }
}
/*
 * Write the fixed trailing part of the selected output file: the closing
 * brace of the enum, or the default case plus closing braces of the
 * generated switch functions. Note the doubled "%%d" below: it must
 * survive printCode()'s printf formatting as a literal "%d" in the
 * generated code. The fragment files get no footer.
 */
void
printFooter(enum OutSelect outSelect)
{
    FILE *out = outFile[outSelect];
    switch (outSelect) {
    case OUT_TOKEN_KIND:
        printCode(out, 0, "};\n");
        return;
    case OUT_STR_TOKEN_KIND:
        // default case: report an out-of-range enum value and bail out
        printCode(out, 1, "default:\n");
        printCode(out, 2, "fprintf(stderr, \"internal error in strTokenKind: "
                  "tokenKind = %%d\",\n");
        printCode(out, 3, "tokenKind);\n");
        printCode(out, 2, "finalizeExit(1);\n");
        printCode(out, 2, "return \"\";\n");
        printCode(out, 1, "}\n");
        printCode(out, 0, "}\n");
        return;
    case OUT_TOKEN_KIND_PREC:
        // tokens without precedence entry are simply "not binary operators"
        printCode(out, 1, "default:\n");
        printCode(out, 2, "return 0;\n");
        printCode(out, 1, "}\n");
        printCode(out, 0, "}\n");
        return;
    case OUT_MAKE_BINARY_EXPR_KIND:
        printCode(out, 1, "default:\n");
        printCode(out, 2, "fprintf(stderr, \"internal error in "
                  "makeBinaryExprKind (tokenKind = %%d)\",\n");
        printCode(out, 3, "tokenKind);\n");
        printCode(out, 2, "finalizeExit(1);\n");
        printCode(out, 2, "return 0;\n");
        printCode(out, 1, "}\n");
        printCode(out, 0, "}\n");
        return;
    case OUT_PARSE_PUNCTUATOR:
    case OUT_PARSE_KEYWORD:
        // plain code fragments: no footer needed
        return;
    default:
        assert(0);
    }
}
/* Write the fixed header part of every output file. */
void
printHeaders(void)
{
    int sel = 0;
    while (sel < NUM_OUT) {
        printHeader(sel++);
    }
}
/* Write the fixed footer part of every output file. */
void
printFooters(void)
{
    for (int sel = 0; sel < NUM_OUT; ++sel) {
        printFooter(sel);
    }
}
/* Emit one enumerator of 'enum TokenKind'. */
void
printTokenKind(const char *tk)
{
    printCode(outFile[OUT_TOKEN_KIND], 1, "%s,\n", tk);
}
/* Emit one case of strTokenKind(): map the enumerator to its name. */
void
printStrTokenKind(const char *tk)
{
    printCode(outFile[OUT_STR_TOKEN_KIND], 1, "case %s:\n", tk);
    printCode(outFile[OUT_STR_TOKEN_KIND], 2, "return \"%s\";\n", tk);
}
/* Emit one case of tokenKindPrec(): map an operator token to its precedence. */
void
printTokenKindPrec(const char *tk, int prec)
{
    printCode(outFile[OUT_TOKEN_KIND_PREC], 1, "case %s:\n", tk);
    printCode(outFile[OUT_TOKEN_KIND_PREC], 2, "return %d;\n", prec);
}
/* Emit one case of makeBinaryExprKind(): map a token kind to its expr kind. */
void
printMakeBinaryExprKind(const char *tk, const char *exprKind)
{
    printCode(outFile[OUT_MAKE_BINARY_EXPR_KIND], 1, "case %s:\n", tk);
    printCode(outFile[OUT_MAKE_BINARY_EXPR_KIND], 2, "return %s;\n", exprKind);
}
//------------------------------------------------------------------------------
/*
 * Node of a character trie ("lex tree"): next[] is indexed by the following
 * input character; tokenKind is non-null iff the path from the root to this
 * node spells a complete token value. Two static roots: one trie for
 * punctuators, one for keywords.
 */
struct CharNode
{
    struct CharNode *next[256];
    char *tokenKind; // strdup'ed token kind identifier, or null
} lexTreePunctuator, lexTreeKeyword;
/*
 * Insert the token value tkVal (a punctuator or keyword spelling) into the
 * matching lex tree and store tk (the token kind identifier) at the final
 * node. Terminates the program on allocation failure.
 */
void
lexTreeAdd(const char *tk, const char *tkVal)
{
    struct CharNode *n = isLetter(*tkVal)
        ? &lexTreeKeyword
        : &lexTreePunctuator;
    for (const char *s = tkVal; *s; ++s) {
        // BUG FIX: go through unsigned char -- with the old "size_t c = *s"
        // a plain char > 127 would be sign-extended and index next[]
        // far out of bounds.
        size_t c = (unsigned char) *s;
        if (!n->next[c]) {
            n->next[c] = calloc(1, sizeof(*n->next[c]));
            if (!n->next[c]) {
                fprintf(stderr, "lexTreeAdd: out of memory\n");
                exit(1);
            }
        }
        n = n->next[c];
    }
    free(n->tokenKind); // tolerate duplicate entries without leaking
    n->tokenKind = strdup(tk);
    if (!n->tokenKind) {
        fprintf(stderr, "lexTreeAdd: out of memory\n");
        exit(1);
    }
}
/*
 * Recursively release all nodes below n, and n itself unless it is the
 * statically allocated root. Pointers are cleared so the static roots
 * end up in a clean, reusable state.
 */
void
lexTreeDestroy_(struct CharNode *n, struct CharNode *root)
{
    // size_t loop index: matches the unsigned sizeof expression (the old
    // int index triggered a signed/unsigned comparison warning under -Wall)
    for (size_t i = 0; i < sizeof(n->next) / sizeof(n->next[0]); ++i) {
        if (n->next[i]) {
            lexTreeDestroy_(n->next[i], root);
            n->next[i] = 0; // no dangling pointers left in the roots
        }
    }
    free(n->tokenKind); // free(0) is a no-op, no guard needed
    n->tokenKind = 0;
    if (n != root) {
        free(n);
    }
}
void
lexTreeDestroy(void)
{
lexTreeDestroy_(&lexTreePunctuator, &lexTreePunctuator);
lexTreeDestroy_(&lexTreeKeyword, &lexTreeKeyword);
}
/*
 * Recursively write the punctuator recognition code for trie node n.
 * For each child an "if (ch == ...)" block is emitted that consumes the
 * character; a node with a tokenKind emits the matching return as fallback
 * after all longer alternatives. At level 0 each branch is written as
 * "} else if" (and left unclosed) so the fragment splices into the
 * existing if/else chain of getToken().
 */
void
printParsePunctuator_(FILE *out, const struct CharNode *n, int level)
{
    int identLevel = level + 1;
    for (size_t i = 0; i < sizeof(n->next) / sizeof(n->next[0]); ++i) {
        if (!n->next[i]) {
            continue;
        }
        // level 0: "} else if ..." continues getToken()'s chain and is not
        // closed here; deeper levels emit a normal nested "if ... }"
        printCode(out, identLevel, "%sif (ch == '%c') {\n",
            level ? "" : "} else ", (char)i);
        printCode(out, identLevel + 1, "appendCharToStr(&token.val, ch);\n");
        printCode(out, identLevel + 1, "nextCh();\n");
        printParsePunctuator_(out, n->next[i], level + 1);
        if (level) {
            printCode(out, identLevel, "}\n");
        }
    }
    if (n->tokenKind) {
        // reached when no longer punctuator matched: return this one
        printCode(out, identLevel, "return token.kind = %s;\n", n->tokenKind);
    }
}
void
printParsePunctuator(void)
{
printParsePunctuator_(outFile[OUT_PARSE_PUNCTUATOR], &lexTreePunctuator, 0);
}
/*
 * Recursively write the keyword recognition code for trie node n, analogous
 * to printParsePunctuator_(). A keyword only counts as matched if the
 * character following it cannot extend an identifier.
 */
void
printParseKeyword_(FILE *out, const struct CharNode *n, int level)
{
    int indentLevel = level + 1;
    for (size_t i = 0; i < sizeof(n->next) / sizeof(n->next[0]); ++i) {
        if (!n->next[i]) {
            continue;
        }
        printCode(out, indentLevel, "%sif (ch == '%c') {\n",
            level ? "" : "} else ", (char)i);
        printCode(out, indentLevel + 1, "appendCharToStr(&token.val, ch);\n");
        printCode(out, indentLevel + 1, "nextCh();\n");
        printParseKeyword_(out, n->next[i], level + 1);
        if (level) {
            printCode(out, indentLevel, "}\n");
        }
    }
    if (n->tokenKind) {
        // BUG FIX: the generated condition was
        //     "!isLetter(ch) || isDecDigit(ch)"
        // which is true for a following digit, so e.g. "for2" was lexed as
        // the keyword FOR plus the literal 2 instead of the identifier
        // "for2". The keyword is complete only if the next character
        // neither is a letter nor a digit.
        printCode(out, indentLevel, "if (!isLetter(ch) && !isDecDigit(ch)) {\n");
        printCode(out, indentLevel + 1, "return token.kind = %s;\n",
            n->tokenKind);
        printCode(out, indentLevel, "}\n");
    }
}
void
printParseKeyword(void)
{
printParseKeyword_(outFile[OUT_PARSE_KEYWORD], &lexTreeKeyword, 1);
}
//------------------------------------------------------------------------------
/*
 * Read the token description file given as argv[1] and generate all output
 * files: enum constants, strTokenKind(), precedence/expr-kind mappings and
 * the punctuator/keyword recognition fragments.
 */
int
main(int argc, char *argv[])
{
    if (argc != 2) {
        usage(argv[0]);
    }
    FILE *in = fopen(argv[1], "r");
    if (!in) {
        fprintf(stderr, "can not open input file '%s'\n", argv[1]);
        // BUG FIX: previously execution fell through after the message and
        // passed the null stream to getline()
        exit(1);
    }
    openOutFiles();
    printHeaders();
    char *line = 0;
    size_t capacity = 0;
    ssize_t len;
    while ((len = getline(&line, &capacity, in)) > 0) {
        chopNl(line, len);
        char *tokenKind, *tokenKindVal, *exprKind;
        int prec;
        split(line, &tokenKind, &tokenKindVal, &prec, &exprKind);
        if (!*line) {
            continue; // ignore empty lines
        }
        // tokenKind always points at the line start here, so no null check
        // is needed (the old "if (tokenKind)" was always true)
        printTokenKind(tokenKind);
        printStrTokenKind(tokenKind);
        if (tokenKindVal) {
            lexTreeAdd(tokenKind, tokenKindVal);
        }
        if (prec) {
            // split() guarantees exprKind is set whenever prec != 0
            printTokenKindPrec(tokenKind, prec);
            printMakeBinaryExprKind(tokenKind, exprKind);
        }
    }
    free(line);
    printParsePunctuator();
    printParseKeyword();
    printFooters();
    fclose(in);
    closeOutFiles();
    lexTreeDestroy();
}
EOI
BAD_TOKEN
DEC_LITERAL
HEX_LITERAL
OCT_LITERAL
IDENTIFIER
AMPERSAND &
AMPERSAND2 &&
ASTERISK * 13 EK_MUL
CARET ^
DOLLAR $
EQUAL =
EQUAL2 == 9 EK_EQUAL
NOT !
NOT_EQUAL != 9 EK_NOT_EQUAL
GREATER > 10 EK_GREATER
GREATER_EQUAL >= 10 EK_GREATER_EQUAL
LBRACE {
LESS < 10 EK_LESS
LESS_EQUAL <= 10 EK_LESS_EQUAL
LPAREN (
MINUS - 12 EK_SUB
PERCENT % 13 EK_MOD
PLUS + 12 EK_ADD
RBRACE }
RPAREN )
SEMICOLON ;
SLASH / 13 EK_DIV
TILDE ~
VBAR |
VBAR2 ||
FOR for
WHILE while
DO do
IF if
ELSE else
In addition to gen_tokenkind.h and gen_strtokenkind.c, the files gen_parsepunctuator.c, gen_parsekeyword.c, gen_makebinaryexprkind.c and gen_tokenkindprec.c will also be generated.
Format of tokenkind.txt
The description in file tokenkind.txt now can have empty lines. These lines will be ignored. The field format can be described by
[<tokenkind> [<tokenkindval> [<prec> <exprkind>]]]
where brackets indicate that its content is optional.
Field <tokenkind>
As before the first field of a line contains the identifier of a token. From this field the enum constants in gen_tokenkind.h
enum TokenKind
{
EOI,
BAD_TOKEN,
DEC_LITERAL,
HEX_LITERAL,
OCT_LITERAL,
IDENTIFIER,
AMPERSAND,
AMPERSAND2,
ASTERISK,
CARET,
DOLLAR,
EQUAL,
EQUAL2,
NOT,
NOT_EQUAL,
GREATER,
GREATER_EQUAL,
LBRACE,
LESS,
LESS_EQUAL,
LPAREN,
MINUS,
PERCENT,
PLUS,
RBRACE,
RPAREN,
SEMICOLON,
SLASH,
TILDE,
VBAR,
VBAR2,
FOR,
WHILE,
DO,
IF,
ELSE,
};
and the implementation of strTokenKind() is generated
const char *
strTokenKind(enum TokenKind tokenKind)
{
switch (tokenKind) {
case EOI:
return "EOI";
case BAD_TOKEN:
return "BAD_TOKEN";
case DEC_LITERAL:
return "DEC_LITERAL";
case HEX_LITERAL:
return "HEX_LITERAL";
case OCT_LITERAL:
return "OCT_LITERAL";
case IDENTIFIER:
return "IDENTIFIER";
case AMPERSAND:
return "AMPERSAND";
case AMPERSAND2:
return "AMPERSAND2";
case ASTERISK:
return "ASTERISK";
case CARET:
return "CARET";
case DOLLAR:
return "DOLLAR";
case EQUAL:
return "EQUAL";
case EQUAL2:
return "EQUAL2";
case NOT:
return "NOT";
case NOT_EQUAL:
return "NOT_EQUAL";
case GREATER:
return "GREATER";
case GREATER_EQUAL:
return "GREATER_EQUAL";
case LBRACE:
return "LBRACE";
case LESS:
return "LESS";
case LESS_EQUAL:
return "LESS_EQUAL";
case LPAREN:
return "LPAREN";
case MINUS:
return "MINUS";
case PERCENT:
return "PERCENT";
case PLUS:
return "PLUS";
case RBRACE:
return "RBRACE";
case RPAREN:
return "RPAREN";
case SEMICOLON:
return "SEMICOLON";
case SLASH:
return "SLASH";
case TILDE:
return "TILDE";
case VBAR:
return "VBAR";
case VBAR2:
return "VBAR2";
case FOR:
return "FOR";
case WHILE:
return "WHILE";
case DO:
return "DO";
case IF:
return "IF";
case ELSE:
return "ELSE";
default:
fprintf(stderr, "internal error in strTokenKind: tokenKind = %d",
tokenKind);
finalizeExit(1);
return "";
}
}
Field <tokenkindval>
From this field code for detecting punctuator and keyword tokens gets generated.
If the first character is not a letter (i.e. 'a', ..., 'z', 'A', ..., 'Z' or '_') it is considered as a punctuator. For the lexer it generates the following code fragment for parsing these punctuators (this should look familiar):
} else if (ch == '!') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == '=') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = NOT_EQUAL;
}
return token.kind = NOT;
} else if (ch == '$') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = DOLLAR;
} else if (ch == '%') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = PERCENT;
} else if (ch == '&') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == '&') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = AMPERSAND2;
}
return token.kind = AMPERSAND;
} else if (ch == '(') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = LPAREN;
} else if (ch == ')') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = RPAREN;
} else if (ch == '*') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = ASTERISK;
} else if (ch == '+') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = PLUS;
} else if (ch == '-') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = MINUS;
} else if (ch == '/') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = SLASH;
} else if (ch == ';') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = SEMICOLON;
} else if (ch == '<') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == '=') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = LESS_EQUAL;
}
return token.kind = LESS;
} else if (ch == '=') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == '=') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = EQUAL2;
}
return token.kind = EQUAL;
} else if (ch == '>') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == '=') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = GREATER_EQUAL;
}
return token.kind = GREATER;
} else if (ch == '^') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = CARET;
} else if (ch == '{') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = LBRACE;
} else if (ch == '|') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == '|') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = VBAR2;
}
return token.kind = VBAR;
} else if (ch == '}') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = RBRACE;
} else if (ch == '~') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = TILDE;
Otherwise the field describes a keyword. In the last session parsed identifiers were compared against a list of reserved strings to check if they are actually keywords. For handwritten code this is a reasonable approach. With generated code this can be done more efficiently. For the lexer the following code gets generated to detect keywords:
if (ch == 'd') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'o') {
appendCharToStr(&token.val, ch);
nextCh();
if (!isLetter(ch) || isDecDigit(ch)) {
return token.kind = DO;
}
}
}
if (ch == 'e') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'l') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 's') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'e') {
appendCharToStr(&token.val, ch);
nextCh();
if (!isLetter(ch) || isDecDigit(ch)) {
return token.kind = ELSE;
}
}
}
}
}
if (ch == 'f') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'o') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'r') {
appendCharToStr(&token.val, ch);
nextCh();
if (!isLetter(ch) || isDecDigit(ch)) {
return token.kind = FOR;
}
}
}
}
if (ch == 'i') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'f') {
appendCharToStr(&token.val, ch);
nextCh();
if (!isLetter(ch) || isDecDigit(ch)) {
return token.kind = IF;
}
}
}
if (ch == 'w') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'h') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'i') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'l') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'e') {
appendCharToStr(&token.val, ch);
nextCh();
if (!isLetter(ch) || isDecDigit(ch)) {
return token.kind = WHILE;
}
}
}
}
}
}
Fields <prec> and <exprkind>
Either both fields have to be present or none. From these fields tokenKindPrec() in gen_tokenkindprec.c and makeBinaryExprKind() in gen_makebinaryexprkind.c get generated:
static int
tokenKindPrec(enum TokenKind tokenKind)
{
switch (tokenKind) {
case ASTERISK:
return 13;
case EQUAL2:
return 9;
case NOT_EQUAL:
return 9;
case GREATER:
return 10;
case GREATER_EQUAL:
return 10;
case LESS:
return 10;
case LESS_EQUAL:
return 10;
case MINUS:
return 12;
case PERCENT:
return 13;
case PLUS:
return 12;
case SLASH:
return 13;
default:
return 0;
}
}
static enum ExprKind
makeBinaryExprKind(enum TokenKind tokenKind)
{
switch (tokenKind) {
case ASTERISK:
return EK_MUL;
case EQUAL2:
return EK_EQUAL;
case NOT_EQUAL:
return EK_NOT_EQUAL;
case GREATER:
return EK_GREATER;
case GREATER_EQUAL:
return EK_GREATER_EQUAL;
case LESS:
return EK_LESS;
case LESS_EQUAL:
return EK_LESS_EQUAL;
case MINUS:
return EK_SUB;
case PERCENT:
return EK_MOD;
case PLUS:
return EK_ADD;
case SLASH:
return EK_DIV;
default:
fprintf(stderr, "internal error in makeBinaryExprKind (tokenKind = %d)",
tokenKind);
finalizeExit(1);
return 0;
}
}
Updating the Parser
In the parser the implementation of tokenKindPrec() and makeBinaryExprKind() now simply gets included:
/*
* static int tokenKindPrec(enum TokenKind kind);
*
* Returns 0 if kind is not a left associative binary operator.
* Otherwise returns a precedence > 0
*/
#include "gen_tokenkindprec.c"
/*
* enum ExprKind makeBinaryExprKind(enum TokenKind kind);
*
* For left associative binary operators translates 'enum TokenKind' into
* 'enum ExprKind'
*/
#include "gen_makebinaryexprkind.c"
const struct Expr *
parseLeftAssocBinaryExpr(int prec)
{
/* ... as before ... */
}
|
Updating the Lexer
Function getToken(void) has the following structure:
enum TokenKind
getToken(void)
{
/* ... */
if (ch == EOF) {
return token.kind = EOI;
} else if (isDecDigit(ch)) {
// parse literal
/* ... */
// parsing punctuators
} else if (ch == '&') {
/* ... */
// parsing keywords and identifiers
} else if (isLetter(ch)) {
do {
appendCharToStr(&token.val, ch);
nextCh();
} while (isLetter(ch) || isDecDigit(ch));
return token.kind = checkForKeyword(token.val.cstr);
}
nextCh();
return token.kind = BAD_TOKEN;
}
|
The part for parsing the punctuators can simply be included. For detecting keywords function checkForKeyword() is no longer needed. Instead, first the code in gen_parsekeyword.c is used to detect keywords. If a keyword is found, function getToken() returns. Hence, only if the code afterwards is reached is there an identifier in the input stream. With a while loop the remaining part of the identifier gets collected:
enum TokenKind
getToken(void)
{
/* ... */
if (ch == EOF) {
return token.kind = EOI;
} else if (isDecDigit(ch)) {
// parse unsigned integer literal
/* ... */
// parsing punctuators
#include "gen_parsepunctuator.c"
// parsing keywords and identifiers
} else if (isLetter(ch)) {
// First detected keywords ...
#include "gen_parsekeyword.c"
// ... if there was no keyword detected it is an identifier
while (isLetter(ch) || isDecDigit(ch)) {
appendCharToStr(&token.val, ch);
nextCh();
}
return token.kind = IDENTIFIER;
}
nextCh();
return token.kind = BAD_TOKEN;
}
|
Here the complete updated implementation of lexer.c:
#include <stdbool.h>
#include <stdio.h>
#include "finalize.h"
#include "lexer.h"
#include "ustr.h"
struct Token token;
// Release the token's value string; registered with finalizeRegister() on
// the first getToken() call so it runs at program termination.
static void
cleanup(void)
{
    releaseStr(&token.val);
}
//------------------------------------------------------------------------------
// position of current character ch
static struct TokenPos curr = {
    1, // line counting starts at 1
    0, // col is pre-incremented in nextCh(), so columns are 1-based too
};
// current input character; 0 before the first read, EOF at end of input
static int ch;
// Read the next character from stdin into ch while maintaining the current
// position: a newline advances the line counter and resets the column.
static int
nextCh(void)
{
    ++curr.col;
    ch = getchar();
    if (ch == '\n') {
        ++curr.line;
        curr.col = 0; // the next nextCh() call makes this column 1
    }
    return ch;
}
// True for white space within a line; newline is handled separately.
static bool
isWhiteSpace(int ch)
{
    switch (ch) {
    case ' ':
    case '\t':
        return true;
    default:
        return false;
    }
}
// True for the decimal digits '0' through '9'.
static bool
isDecDigit(int ch)
{
    return '0' <= ch && ch <= '9';
}
// True for the octal digits '0' through '7'.
static bool
isOctDigit(int ch)
{
    return '0' <= ch && ch <= '7';
}
// True for hexadecimal digits: '0'-'9', 'a'-'f' and 'A'-'F'.
static bool
isHexDigit(int ch)
{
    if (isDecDigit(ch)) {
        return true;
    }
    return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
}
// True for characters that may appear in an identifier (letters and '_').
static bool
isLetter(int ch)
{
    return ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}
/*
 * Lex the next token from stdin. Sets token.kind, token.pos and token.val
 * (the token's spelling) and returns token.kind.
 */
enum TokenKind
getToken(void)
{
    // register cleanup for token.val exactly once
    static bool first = true;
    if (first) {
        first = false;
        finalizeRegister(cleanup);
    }
    // init ch (it is 0 before the first call), skip white spaces and newlines
    while (ch == 0 || isWhiteSpace(ch) || ch == '\n') {
        nextCh();
    }
    token.pos.line = curr.line;
    token.pos.col = curr.col;
    clearStr(&token.val);
    if (ch == EOF) {
        return token.kind = EOI;
    } else if (isDecDigit(ch)) {
        // parse literal
        if (ch == '0') {
            // leading '0': hex literal ("0x...") or octal literal
            appendCharToStr(&token.val, ch);
            nextCh();
            if (ch == 'x') {
                appendCharToStr(&token.val, ch);
                nextCh();
                if (isHexDigit(ch)) {
                    while (isHexDigit(ch)) {
                        appendCharToStr(&token.val, ch);
                        nextCh();
                    }
                    return token.kind = HEX_LITERAL;
                }
                return token.kind = BAD_TOKEN; // "0x" without hex digits
            }
            while (isOctDigit(ch)) {
                appendCharToStr(&token.val, ch);
                nextCh();
            }
            return token.kind = OCT_LITERAL;
        } else if (isDecDigit(ch)) {
            while (isDecDigit(ch)) {
                appendCharToStr(&token.val, ch);
                nextCh();
            }
            return token.kind = DEC_LITERAL;
        }
    // parsing punctuators: the generated fragment starts with "} else if"
    // and continues the if/else chain above
#include "gen_parsepunctuator.c"
    // parsing keywords and identifiers
    } else if (isLetter(ch)) {
        // First detected keywords ...
#include "gen_parsekeyword.c"
        // ... if there was no keyword detected it is an identifier
        while (isLetter(ch) || isDecDigit(ch)) {
            appendCharToStr(&token.val, ch);
            nextCh();
        }
        return token.kind = IDENTIFIER;
    }
    nextCh();
    return token.kind = BAD_TOKEN;
}