Pimped Auto-Generated Code

Since Session 21 the source code for strTokenKind() and the enum constants for enum TokenKind are auto-generated. This approach can be extended. For parsing left associative binary operators auxiliary functions tokenKindPrec() and makeBinaryExprKind() were implemented manually in the last session. These functions also can be auto-generated.

But this is not the end of the story. Also code for recognizing tokens can be auto-generated. Tools like Flex can generate a complete lexer. Our pimped generator will not go that far. But it will generate code to recognize punctuators (e.g. '=', '==', etc.) and keywords (e.g. 'for', 'while', etc.).

Updated Makefile, tokenkind.txt and xgen_tokenkind.c

Here are the files that need to be replaced:

# compiler and linker flags used by the implicit/explicit rules below
CPPFLAGS += -Wall -Wcast-qual
LDFLAGS += -lm
#
# patch: If user has not defined CC and default value does not exist use gcc
#
ifeq ($(origin CC),default)
    cc_check := $(shell $(CC) -v > /dev/null 2>&1 && echo "sane")
    ifneq ($(strip $(cc_check)),sane)
        CC := gcc
    endif
endif

#
# List of files that need to be generated before compilation and rules to
# generate them
#

generated_files := \
    gen_makebinaryexprkind.c \
    gen_strtokenkind.c \
    gen_tokenkind.h \
    gen_tokenkindprec.c \
    gen_parsepunctuator.c \
    gen_parsekeyword.c

# NOTE(review): a multi-target rule like this re-runs the recipe once per
# out-of-date target; with GNU make >= 4.3 a grouped target ('&:') would run
# the generator exactly once -- confirm the required make version first.
$(generated_files) : tokenkind.txt xgen_tokenkind
	./xgen_tokenkind $<
#
# Define list of source files, object files, targets, etc
#

# all source files (generated gen_* sources are excluded; they get #include'd)
src :=\
    $(filter-out gen_%,\
        $(wildcard *.c))

# all object files
obj :=\
    $(patsubst %.c,%.o,\
        $(src))

# all targets (test programs)
target :=\
    $(filter xtest%,\
        $(patsubst %.c,%,\
            $(src)))

# all generators for source files
generator :=\
    $(filter xgen%,\
        $(patsubst %.c,%,\
            $(src)))

# objects that are required by the targets
lib.o :=\
    $(filter-out xtest% xgen%,\
        $(obj))

# dependency file that will be generated by compiler
deps :=\
    $(patsubst %,%.d,\
        $(src))

# dependency file leftovers of gone source files
obsolete.deps:=\
    $(filter-out $(deps),\
        $(wildcard *.c.d))


#
# Build rules
#
.PHONY: all
.DEFAULT_GOAL := all
all: $(target) $(obj) $(generator)

# rule for removing obsolete dependency files
.PHONY: $(obsolete.deps)
$(obsolete.deps) :
	$(RM) $(obsolete.deps)

# delete implicit rule for building an executable directly from its source file
% : %.c

# rule for source file generators
xgen% : xgen%.c
	$(CC) -o $@ $^ $(LDFLAGS)

# our rule: to build target link its object file against library object files
%: %.o $(lib.o) | $(obsolete.deps)
	$(CC) -o $@ $^ $(LDFLAGS)

# our rule to build objects: also generate a dependency file
%.o: %.c | $(obsolete.deps) $(generated_files)
	$(CC) -c $(CPPFLAGS) $(CFLAGS) -MT $@ -MMD -MP -MF $<.d $<

.PHONY: clean
clean:
	$(RM) $(target) $(generator) $(obj) $(deps) $(obsolete.deps)
	$(RM) $(generated_files)

#
# Include dependencies (if already generated)
#
-include $(deps)
#include <assert.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Prints how the generator is supposed to be invoked and terminates the
 * program with a non-zero exit status.
 */
void
usage(const char *prg)
{
    fprintf(stderr, "usage: %s tokenkind.txt\n", prg);
    exit(1);
}

/*
 * Similar to Perl: chops off the trailing newline (actually '\n' on Unix and
 * '\r\n' on DOS) by overwriting it with a null byte.
 *
 * line is the buffer returned by getline(), len its length as reported by
 * getline(). Bug fixes over the previous version: len < 2 no longer reads
 * line[-1] (out-of-bounds for a lone "\n"), and a final line without a
 * trailing newline no longer loses its last character.
 */
void
chopNl(char *line, size_t len)
{
    if (len && line[len - 1] == '\n') {
        line[--len] = 0;
    }
    // a DOS line ending leaves a '\r' just before the chopped '\n'
    if (len && line[len - 1] == '\r') {
        line[--len] = 0;
    }
}

/*
 * Column separator predicate for tokenkind.txt: blank-ish characters,
 * deliberately excluding '\n' (line structure is handled by the caller).
 */
bool
isSpace(int ch)
{
    switch (ch) {
        case ' ':
        case '\r':
        case '\f':
        case '\v':
        case '\t':
            return true;
        default:
            return false;
    }
}

/*
 * True for ASCII letters and underscore, i.e. characters that may start a
 * keyword rather than a punctuator.
 */
static bool
isLetter(int ch)
{
    if (ch == '_') {
        return true;
    }
    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}

/*
 * Split a white-space separated line with format
 * [<tokenKind> [<tokenKindVal> [<prec> <exprKind>]]]
 * in place: separators are overwritten with null bytes and the out-pointers
 * are set into the line buffer.
 *
 * On return *tokenKind always points at the first column (possibly an empty
 * string for a blank line). *tokenKindVal is null if the second column is
 * absent. *prec is 0 if the third column is absent. *exprKind is assigned
 * ONLY when all four columns are present -- callers must test *prec before
 * reading *exprKind. Malformed field combinations abort the program.
 */
void
split(char *line, char **tokenKind, char **tokenKindVal, int *prec,
      char **exprKind)
{
    *tokenKind = line;
    *tokenKindVal = 0;
    *prec = 0;

    // scan over the first column
    for (; *line && !isSpace(*line); ++line) {
    }
    // end of line: only <tokenKind> present (or line is empty)
    if (!isSpace(*line)) {
        return;
    }
    *line = 0;

    // advance to next column
    // NOTE(review): '*++line' skips the byte after the terminator first; if
    // the line ends in trailing spaces this reads one byte past the string's
    // null terminator and then reports a missing field -- confirm input
    // lines never carry trailing white space.
    while (isSpace(*++line)) {
    }
    if (!*line) {
        fprintf(stderr, "%s: <tokenKindVal> field expected\n", *tokenKind);
        exit(1);
    }
    *tokenKindVal = line;
    // scan over the second column
    for (; *line && !isSpace(*line); ++line) {
    }
    // end of line: only <tokenKind> and <tokenKindVal> present
    if (!isSpace(*line)) {
        return;
    }
    *line = 0;


    // advance to next column
    while (isSpace(*++line)) {
    }
    // third column must be a decimal precedence value
    if (sscanf(line, "%d", prec) != 1) {
        fprintf(stderr, "%s: prec field expected\n", *tokenKind);
        exit(1);
    }

    // advance to next column; if <prec> was given <exprKind> is mandatory
    for (; *line && !isSpace(*line); ++line) {
    }
    while (isSpace(*++line)) {
    }
    if (!*line) {
        fprintf(stderr, "%s: expression kind expected\n", *tokenKind);
        exit(1);
    }
    *exprKind = line;
}

/*
 * Writes indentLevel levels of indentation (4 spaces each) to out.
 */
void
printIndent(FILE *out, int indentLevel)
{
    assert(indentLevel >= 0);
    for (int lvl = 0; lvl < indentLevel; ++lvl) {
        fputs("    ", out);
    }
}

/*
 * Writes an indented, printf-style formatted line of generated code to out.
 * A null fmt emits just the indentation. See
 * https://cplusplus.com/reference/cstdarg/va_list/ for details on handling
 * variable arguments in a platform independent way.
 */
void
printCode(FILE *out, int indentLevel, const char *fmt, ...)
{
    printIndent(out, indentLevel);
    if (fmt) {
        va_list argp;
        va_start(argp, fmt);
        vfprintf(out, fmt, argp);
        va_end(argp);
    }
}

// index of each generated output file; NUM_OUT is the array size
enum OutSelect {
    OUT_TOKEN_KIND,
    OUT_STR_TOKEN_KIND,
    OUT_TOKEN_KIND_PREC,
    OUT_MAKE_BINARY_EXPR_KIND,
    OUT_PARSE_PUNCTUATOR,
    OUT_PARSE_KEYWORD,
    NUM_OUT,
};

// file name of each generated file, indexed by enum OutSelect -- keep this
// list in the same order as the enumerators above
const char *outFilename[NUM_OUT] = {
    "gen_tokenkind.h",
    "gen_strtokenkind.c",
    "gen_tokenkindprec.c",
    "gen_makebinaryexprkind.c",
    "gen_parsepunctuator.c",
    "gen_parsekeyword.c",
};

// streams opened by openOutFiles(), indexed by enum OutSelect
FILE *outFile[NUM_OUT];

/*
 * Closes every output file that was successfully opened; safe to call when
 * only some (or none) of the streams are open.
 */
void
closeOutFiles(void)
{
    for (int i = 0; i < NUM_OUT; ++i) {
        if (!outFile[i]) {
            continue;
        }
        fclose(outFile[i]);
    }
}

/*
 * Opens all generated files for writing. On failure reports the offending
 * file name, closes everything opened so far and terminates.
 */
void
openOutFiles(void)
{
    for (int i = 0; i < NUM_OUT; ++i) {
        FILE *f = fopen(outFilename[i], "w");
        if (!f) {
            fprintf(stderr, "can not open output file %s\n", outFilename[i]);
            closeOutFiles();
            exit(1);
        }
        outFile[i] = f;
    }
}

/*
 * Emits the opening boilerplate of one generated file. The two lexer
 * fragments (OUT_PARSE_PUNCTUATOR, OUT_PARSE_KEYWORD) intentionally get no
 * header: they are #include'd textually into the middle of getToken().
 */
void
printHeader(enum OutSelect outSelect)
{
    FILE *out = outFile[outSelect];
    switch (outSelect) {
        case OUT_TOKEN_KIND:
            // gen_tokenkind.h: open the enum definition
            printCode(out, 0, "enum TokenKind\n");
            printCode(out, 0, "{\n");
            return;
        case OUT_STR_TOKEN_KIND:
            // gen_strtokenkind.c: open strTokenKind() and its switch
            printCode(out, 0, "const char *\n");
            printCode(out, 0, "strTokenKind(enum TokenKind tokenKind)\n");
            printCode(out, 0, "{\n");
            printCode(out, 1, "switch (tokenKind) {\n");
            return;
        case OUT_TOKEN_KIND_PREC:
            // gen_tokenkindprec.c: open tokenKindPrec() and its switch
            printCode(out, 0, "static int\n");
            printCode(out, 0, "tokenKindPrec(enum TokenKind tokenKind)\n");
            printCode(out, 0, "{\n");
            printCode(out, 1, "switch (tokenKind) {\n");
            return;
        case OUT_MAKE_BINARY_EXPR_KIND:
            // gen_makebinaryexprkind.c: open makeBinaryExprKind()
            printCode(out, 0, "static enum ExprKind\n");
            printCode(out, 0, "makeBinaryExprKind(enum TokenKind tokenKind)\n");
            printCode(out, 0, "{\n");
            printCode(out, 1, "switch (tokenKind) {\n");
            return;
        case OUT_PARSE_PUNCTUATOR:
        case OUT_PARSE_KEYWORD:
            // no header: these files are plain code fragments
            return;
        default:
            assert(0);
    }
}

/*
 * Emits the closing boilerplate of one generated file: the default branch of
 * the switch (where one exists) plus closing braces. The '%%d' below escapes
 * to a literal '%d' in the generated source. The lexer fragments again get
 * no footer.
 */
void
printFooter(enum OutSelect outSelect)
{
    FILE *out = outFile[outSelect];
    switch (outSelect) {
        case OUT_TOKEN_KIND:
            printCode(out, 0, "};\n");
            return;
        case OUT_STR_TOKEN_KIND:
            // unknown token kinds are an internal error in the generated code
            printCode(out, 1, "default:\n");
            printCode(out, 2, "fprintf(stderr, \"internal error in strTokenKind: "
                      "tokenKind = %%d\",\n");
            printCode(out, 3, "tokenKind);\n");
            printCode(out, 2, "finalizeExit(1);\n");
            printCode(out, 2, "return \"\";\n");
            printCode(out, 1, "}\n");
            printCode(out, 0, "}\n");
            return;
        case OUT_TOKEN_KIND_PREC:
            // precedence 0 means "not a left associative binary operator"
            printCode(out, 1, "default:\n");
            printCode(out, 2, "return 0;\n");
            printCode(out, 1, "}\n");
            printCode(out, 0, "}\n");
            return;
        case OUT_MAKE_BINARY_EXPR_KIND:
            printCode(out, 1, "default:\n");
            printCode(out, 2, "fprintf(stderr, \"internal error in "
                      "makeBinaryExprKind (tokenKind = %%d)\",\n");
            printCode(out, 3, "tokenKind);\n");
            printCode(out, 2, "finalizeExit(1);\n");
            printCode(out, 2, "return 0;\n");
            printCode(out, 1, "}\n");
            printCode(out, 0, "}\n");
            return;
        case OUT_PARSE_PUNCTUATOR:
        case OUT_PARSE_KEYWORD:
            // no footer: these files are plain code fragments
            return;
        default:
            assert(0);
    }
}

/* Emits the header part of every generated file. */
void
printHeaders(void)
{
    for (int sel = 0; sel < NUM_OUT; ++sel) {
        printHeader(sel);
    }
}

/* Emits the footer part of every generated file. */
void
printFooters(void)
{
    for (int sel = 0; sel < NUM_OUT; ++sel) {
        printFooter(sel);
    }
}

/* Appends one enumerator line '<tk>,' to gen_tokenkind.h. */
void
printTokenKind(const char *tk)
{
    printCode(outFile[OUT_TOKEN_KIND], 1, "%s,\n", tk);
}

/* Appends a 'case <tk>: return "<tk>";' pair to gen_strtokenkind.c. */
void
printStrTokenKind(const char *tk)
{
    FILE *dst = outFile[OUT_STR_TOKEN_KIND];
    printCode(dst, 1, "case %s:\n", tk);
    printCode(dst, 2, "return \"%s\";\n", tk);
}

/* Appends a 'case <tk>: return <prec>;' pair to gen_tokenkindprec.c. */
void
printTokenKindPrec(const char *tk, int prec)
{
    FILE *dst = outFile[OUT_TOKEN_KIND_PREC];
    printCode(dst, 1, "case %s:\n", tk);
    printCode(dst, 2, "return %d;\n", prec);
}

/* Appends a 'case <tk>: return <exprKind>;' pair to gen_makebinaryexprkind.c. */
void
printMakeBinaryExprKind(const char *tk, const char *exprKind)
{
    FILE *dst = outFile[OUT_MAKE_BINARY_EXPR_KIND];
    printCode(dst, 1, "case %s:\n", tk);
    printCode(dst, 2, "return %s;\n", exprKind);
}

//------------------------------------------------------------------------------

/*
 * Node of a 256-ary trie over byte values: next[c] is the state reached
 * after reading byte c; tokenKind (heap copy, may be null) names the token
 * recognized at this state. Two statically allocated roots: one trie for
 * punctuators, one for keywords.
 */
struct CharNode
{
    struct CharNode *next[256];
    char *tokenKind;
} lexTreePunctuator, lexTreeKeyword;

/*
 * Inserts token value tkVal (the literal punctuator or keyword text) into
 * the matching trie and records tk as the token kind at the final node.
 * Terminates the program when out of memory.
 *
 * Fixes over the previous version: bytes are indexed as unsigned char (a
 * plain char may be negative for non-ASCII input, which would index next[]
 * out of bounds), the "memeory" typo in the error message is corrected, and
 * strdup() failure is no longer ignored.
 */
void
lexTreeAdd(const char *tk, const char *tkVal)
{
    // first byte decides the trie: letter/underscore -> keyword
    struct CharNode *n = isLetter((unsigned char)*tkVal)
        ? &lexTreeKeyword
        : &lexTreePunctuator;
    for (const char *s = tkVal; *s; ++s) {
        size_t c = (unsigned char)*s;
        if (!n->next[c]) {
            n->next[c] = calloc(1, sizeof(*n->next[c]));
            if (!n->next[c]) {
                fprintf(stderr, "lexTreeAdd: out of memory\n");
                exit(1);
            }
        }
        n = n->next[c];
    }
    n->tokenKind = strdup(tk);
    if (!n->tokenKind) {
        fprintf(stderr, "lexTreeAdd: out of memory\n");
        exit(1);
    }
}

/*
 * Recursively frees a trie rooted at n. The root itself is only freed when
 * it differs from the statically allocated root passed as 'root'.
 *
 * Fixes: the loop index is size_t (the old 'int i' compared signed against
 * the unsigned sizeof expression), and the redundant null guard before
 * free() is dropped (free(NULL) is a no-op).
 */
void
lexTreeDestroy_(struct CharNode *n, struct CharNode *root)
{
    for (size_t i = 0; i < sizeof(n->next) / sizeof(n->next[0]); ++i) {
        if (n->next[i]) {
            lexTreeDestroy_(n->next[i], root);
        }
    }
    free(n->tokenKind);
    if (n != root) {
        free(n);
    }
}

/* Frees both lexer tries; the statically allocated roots themselves remain. */
void
lexTreeDestroy(void)
{
    struct CharNode *roots[] = { &lexTreePunctuator, &lexTreeKeyword };
    for (size_t i = 0; i < sizeof roots / sizeof roots[0]; ++i) {
        lexTreeDestroy_(roots[i], roots[i]);
    }
}

/*
 * Recursively walks the punctuator trie and emits nested 'if (ch == ...)'
 * code. At level 0 each branch starts with "} else if" so the generated
 * fragment chains onto the lexer's surrounding if/else ladder (the matching
 * close brace comes from getToken() itself); deeper levels emit plain 'if'
 * blocks and close their own braces. A node carrying a tokenKind emits the
 * return for the longest punctuator matched so far.
 * NOTE(review): 'identLevel' is presumably a typo for 'indentLevel'.
 */
void
printParsePunctuator_(FILE *out, const struct CharNode *n, int level)
{
    int identLevel = level + 1;
    for (size_t i = 0; i < sizeof(n->next) / sizeof(n->next[0]); ++i) {
        if (!n->next[i]) {
            continue;
        }
        printCode(out, identLevel, "%sif (ch == '%c') {\n",
                  level ? "" : "} else ", (char)i);
        printCode(out, identLevel + 1, "appendCharToStr(&token.val, ch);\n");
        printCode(out, identLevel + 1, "nextCh();\n");
        printParsePunctuator_(out, n->next[i], level + 1);
        if (level) {
            printCode(out, identLevel, "}\n");
        }
    }
    // emitted after the nested branches: falls through when no longer match
    if (n->tokenKind) {
        printCode(out, identLevel, "return token.kind = %s;\n", n->tokenKind);
    }
}

/* Emits the punctuator-recognition fragment into gen_parsepunctuator.c. */
void
printParsePunctuator(void)
{
    FILE *dst = outFile[OUT_PARSE_PUNCTUATOR];
    printParsePunctuator_(dst, &lexTreePunctuator, 0);
}

/*
 * Recursively walks the keyword trie and emits nested 'if (ch == ...)'
 * code, like printParsePunctuator_(). The difference is the acceptance
 * condition: a keyword only matches when the next character does NOT extend
 * an identifier ('forx' must lex as IDENTIFIER, not FOR followed by 'x').
 * The caller starts at level 1, so no "} else " chaining prefix is emitted.
 * NOTE(review): 'identLevel' is presumably a typo for 'indentLevel'.
 */
void
printParseKeyword_(FILE *out, const struct CharNode *n, int level)
{
    int identLevel = level + 1;
    for (size_t i = 0; i < sizeof(n->next) / sizeof(n->next[0]); ++i) {
        if (!n->next[i]) {
            continue;
        }
        printCode(out, identLevel, "%sif (ch == '%c') {\n",
                  level ? "" : "} else ", (char)i);
        printCode(out, identLevel + 1, "appendCharToStr(&token.val, ch);\n");
        printCode(out, identLevel + 1, "nextCh();\n");
        printParseKeyword_(out, n->next[i], level + 1);
        if (level) {
            printCode(out, identLevel, "}\n");
        }
    }
    if (n->tokenKind) {
        // only return the keyword when the identifier ends here
        printCode(out, identLevel, "if (!isLetter(ch) || isDecDigit(ch)) {\n");
        printCode(out, identLevel + 1, "return token.kind = %s;\n",
                  n->tokenKind);
        printCode(out, identLevel, "}\n");
    }
}

/* Emits the keyword-recognition fragment into gen_parsekeyword.c. */
void
printParseKeyword(void)
{
    FILE *dst = outFile[OUT_PARSE_KEYWORD];
    printParseKeyword_(dst, &lexTreeKeyword, 1);
}


//------------------------------------------------------------------------------

/*
 * Reads the token description file given as argv[1] and generates all files
 * listed in outFilename[]. Each input line contributes to the enum, to
 * strTokenKind(), to the lexer tries, and (for binary operators) to the
 * precedence/expression-kind tables.
 *
 * Bug fix: a failed fopen() previously only printed a message and then
 * passed the null stream to getline() (undefined behavior); it now exits.
 */
int
main(int argc, char *argv[])
{
    if (argc != 2) {
        usage(argv[0]);
    }

    FILE *in = fopen(argv[1], "r");
    if (!in) {
        fprintf(stderr, "can not open input file '%s'\n", argv[1]);
        exit(1);
    }

    openOutFiles();
    printHeaders();

    char *line = 0;
    size_t capacity = 0;
    ssize_t len;
    while ((len = getline(&line, &capacity, in)) > 0) {
        chopNl(line, len);

        char *tokenKind, *tokenKindVal, *exprKind;
        int prec;

        split(line, &tokenKind, &tokenKindVal, &prec, &exprKind);
        // empty lines are allowed and ignored
        if (!*line) {
            continue;
        }

        if (tokenKind) {
            printTokenKind(tokenKind);
            printStrTokenKind(tokenKind);
        }
        if (tokenKindVal) {
            lexTreeAdd(tokenKind, tokenKindVal);
        }
        // exprKind is only valid when a precedence was given (see split())
        if (prec) {
            printTokenKindPrec(tokenKind, prec);
            printMakeBinaryExprKind(tokenKind, exprKind);
        }
    }
    free(line);

    // the lexer fragments are emitted from the completed tries
    printParsePunctuator();
    printParseKeyword();

    printFooters();

    fclose(in);
    closeOutFiles();
    lexTreeDestroy();

    return 0;
}
EOI
BAD_TOKEN
DEC_LITERAL
HEX_LITERAL
OCT_LITERAL
IDENTIFIER

AMPERSAND       &
AMPERSAND2      &&
ASTERISK        *       13      EK_MUL
CARET           ^
DOLLAR          $
EQUAL           =
EQUAL2          ==      9       EK_EQUAL
NOT             !
NOT_EQUAL       !=      9       EK_NOT_EQUAL
GREATER         >       10      EK_GREATER
GREATER_EQUAL   >=      10      EK_GREATER_EQUAL
LBRACE          {
LESS            <       10      EK_LESS
LESS_EQUAL      <=      10      EK_LESS_EQUAL
LPAREN          (
MINUS           -       12      EK_SUB
PERCENT         %       13      EK_MOD
PLUS            +       12      EK_ADD
RBRACE          }
RPAREN          )
SEMICOLON       ;
SLASH           /       13      EK_DIV
TILDE           ~
VBAR            |
VBAR2           ||

FOR for
WHILE while
DO do
IF if
ELSE else

In addition to gen_tokenkind.h and gen_strtokenkind.c, the files gen_parsepunctuator.c, gen_parsekeyword.c, gen_makebinaryexprkind.c and gen_tokenkindprec.c will also be generated.

Format of tokenkind.txt

The description in file tokenkind.txt now can have empty lines. These lines will be ignored. The field format can be described by

[<tokenkind> [<tokenkindval> [<prec> <exprkind>]]]

where brackets indicate that its content is optional.

Field <tokenkind>

As before the first field of a line contains the identifier of a token. From this field the enum constants in gen_tokenkind.h

enum TokenKind
{
    EOI,
    BAD_TOKEN,
    DEC_LITERAL,
    HEX_LITERAL,
    OCT_LITERAL,
    IDENTIFIER,
    AMPERSAND,
    AMPERSAND2,
    ASTERISK,
    CARET,
    DOLLAR,
    EQUAL,
    EQUAL2,
    NOT,
    NOT_EQUAL,
    GREATER,
    GREATER_EQUAL,
    LBRACE,
    LESS,
    LESS_EQUAL,
    LPAREN,
    MINUS,
    PERCENT,
    PLUS,
    RBRACE,
    RPAREN,
    SEMICOLON,
    SLASH,
    TILDE,
    VBAR,
    VBAR2,
    FOR,
    WHILE,
    DO,
    IF,
    ELSE,
};

and the implementation of strTokenKind() is generated

const char *
strTokenKind(enum TokenKind tokenKind)
{
    switch (tokenKind) {
    case EOI:
        return "EOI";
    case BAD_TOKEN:
        return "BAD_TOKEN";
    case DEC_LITERAL:
        return "DEC_LITERAL";
    case HEX_LITERAL:
        return "HEX_LITERAL";
    case OCT_LITERAL:
        return "OCT_LITERAL";
    case IDENTIFIER:
        return "IDENTIFIER";
    case AMPERSAND:
        return "AMPERSAND";
    case AMPERSAND2:
        return "AMPERSAND2";
    case ASTERISK:
        return "ASTERISK";
    case CARET:
        return "CARET";
    case DOLLAR:
        return "DOLLAR";
    case EQUAL:
        return "EQUAL";
    case EQUAL2:
        return "EQUAL2";
    case NOT:
        return "NOT";
    case NOT_EQUAL:
        return "NOT_EQUAL";
    case GREATER:
        return "GREATER";
    case GREATER_EQUAL:
        return "GREATER_EQUAL";
    case LBRACE:
        return "LBRACE";
    case LESS:
        return "LESS";
    case LESS_EQUAL:
        return "LESS_EQUAL";
    case LPAREN:
        return "LPAREN";
    case MINUS:
        return "MINUS";
    case PERCENT:
        return "PERCENT";
    case PLUS:
        return "PLUS";
    case RBRACE:
        return "RBRACE";
    case RPAREN:
        return "RPAREN";
    case SEMICOLON:
        return "SEMICOLON";
    case SLASH:
        return "SLASH";
    case TILDE:
        return "TILDE";
    case VBAR:
        return "VBAR";
    case VBAR2:
        return "VBAR2";
    case FOR:
        return "FOR";
    case WHILE:
        return "WHILE";
    case DO:
        return "DO";
    case IF:
        return "IF";
    case ELSE:
        return "ELSE";
    default:
        fprintf(stderr, "internal error in strTokenKind: tokenKind = %d",
            tokenKind);
        finalizeExit(1);
        return "";
    }
}

Field <tokenkindval>

From this field code for detecting punctuator and keyword tokens gets generated.

If the first character is not a letter (i.e. 'a', ..., 'z', 'A', ..., 'Z' or '_') it is considered as a punctuator. For the lexer it generates the following code fragment for parsing these punctuators (this should look familiar):

    } else if (ch == '!') {
        appendCharToStr(&token.val, ch);
        nextCh();
        if (ch == '=') {
            appendCharToStr(&token.val, ch);
            nextCh();
            return token.kind = NOT_EQUAL;
        }
        return token.kind = NOT;
    } else if (ch == '$') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = DOLLAR;
    } else if (ch == '%') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = PERCENT;
    } else if (ch == '&') {
        appendCharToStr(&token.val, ch);
        nextCh();
        if (ch == '&') {
            appendCharToStr(&token.val, ch);
            nextCh();
            return token.kind = AMPERSAND2;
        }
        return token.kind = AMPERSAND;
    } else if (ch == '(') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = LPAREN;
    } else if (ch == ')') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = RPAREN;
    } else if (ch == '*') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = ASTERISK;
    } else if (ch == '+') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = PLUS;
    } else if (ch == '-') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = MINUS;
    } else if (ch == '/') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = SLASH;
    } else if (ch == ';') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = SEMICOLON;
    } else if (ch == '<') {
        appendCharToStr(&token.val, ch);
        nextCh();
        if (ch == '=') {
            appendCharToStr(&token.val, ch);
            nextCh();
            return token.kind = LESS_EQUAL;
        }
        return token.kind = LESS;
    } else if (ch == '=') {
        appendCharToStr(&token.val, ch);
        nextCh();
        if (ch == '=') {
            appendCharToStr(&token.val, ch);
            nextCh();
            return token.kind = EQUAL2;
        }
        return token.kind = EQUAL;
    } else if (ch == '>') {
        appendCharToStr(&token.val, ch);
        nextCh();
        if (ch == '=') {
            appendCharToStr(&token.val, ch);
            nextCh();
            return token.kind = GREATER_EQUAL;
        }
        return token.kind = GREATER;
    } else if (ch == '^') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = CARET;
    } else if (ch == '{') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = LBRACE;
    } else if (ch == '|') {
        appendCharToStr(&token.val, ch);
        nextCh();
        if (ch == '|') {
            appendCharToStr(&token.val, ch);
            nextCh();
            return token.kind = VBAR2;
        }
        return token.kind = VBAR;
    } else if (ch == '}') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = RBRACE;
    } else if (ch == '~') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = TILDE;

Otherwise the field describes a keyword. In the last session parsed identifiers were compared against a list of reserved strings to check if they are actually keywords. For handwritten code this is a reasonable approach. With generated code this can be done more efficiently. For the lexer the following code gets generated to detect keywords:

        if (ch == 'd') {
            appendCharToStr(&token.val, ch);
            nextCh();
            if (ch == 'o') {
                appendCharToStr(&token.val, ch);
                nextCh();
                if (!isLetter(ch) || isDecDigit(ch)) {
                    return token.kind = DO;
                }
            }
        }
        if (ch == 'e') {
            appendCharToStr(&token.val, ch);
            nextCh();
            if (ch == 'l') {
                appendCharToStr(&token.val, ch);
                nextCh();
                if (ch == 's') {
                    appendCharToStr(&token.val, ch);
                    nextCh();
                    if (ch == 'e') {
                        appendCharToStr(&token.val, ch);
                        nextCh();
                        if (!isLetter(ch) || isDecDigit(ch)) {
                            return token.kind = ELSE;
                        }
                    }
                }
            }
        }
        if (ch == 'f') {
            appendCharToStr(&token.val, ch);
            nextCh();
            if (ch == 'o') {
                appendCharToStr(&token.val, ch);
                nextCh();
                if (ch == 'r') {
                    appendCharToStr(&token.val, ch);
                    nextCh();
                    if (!isLetter(ch) || isDecDigit(ch)) {
                        return token.kind = FOR;
                    }
                }
            }
        }
        if (ch == 'i') {
            appendCharToStr(&token.val, ch);
            nextCh();
            if (ch == 'f') {
                appendCharToStr(&token.val, ch);
                nextCh();
                if (!isLetter(ch) || isDecDigit(ch)) {
                    return token.kind = IF;
                }
            }
        }
        if (ch == 'w') {
            appendCharToStr(&token.val, ch);
            nextCh();
            if (ch == 'h') {
                appendCharToStr(&token.val, ch);
                nextCh();
                if (ch == 'i') {
                    appendCharToStr(&token.val, ch);
                    nextCh();
                    if (ch == 'l') {
                        appendCharToStr(&token.val, ch);
                        nextCh();
                        if (ch == 'e') {
                            appendCharToStr(&token.val, ch);
                            nextCh();
                            if (!isLetter(ch) || isDecDigit(ch)) {
                                return token.kind = WHILE;
                            }
                        }
                    }
                }
            }
        }

Fields <prec> and <exprkind>

Either both fields have to be present or none. From these fields tokenKindPrec() in gen_tokenkindprec.c and makeBinaryExprKind() in gen_makebinaryexprkind.c get generated:

static int
tokenKindPrec(enum TokenKind tokenKind)
{
    switch (tokenKind) {
    case ASTERISK:
        return 13;
    case EQUAL2:
        return 9;
    case NOT_EQUAL:
        return 9;
    case GREATER:
        return 10;
    case GREATER_EQUAL:
        return 10;
    case LESS:
        return 10;
    case LESS_EQUAL:
        return 10;
    case MINUS:
        return 12;
    case PERCENT:
        return 13;
    case PLUS:
        return 12;
    case SLASH:
        return 13;
    default:
        return 0;
    }
}
static enum ExprKind
makeBinaryExprKind(enum TokenKind tokenKind)
{
    switch (tokenKind) {
    case ASTERISK:
        return EK_MUL;
    case EQUAL2:
        return EK_EQUAL;
    case NOT_EQUAL:
        return EK_NOT_EQUAL;
    case GREATER:
        return EK_GREATER;
    case GREATER_EQUAL:
        return EK_GREATER_EQUAL;
    case LESS:
        return EK_LESS;
    case LESS_EQUAL:
        return EK_LESS_EQUAL;
    case MINUS:
        return EK_SUB;
    case PERCENT:
        return EK_MOD;
    case PLUS:
        return EK_ADD;
    case SLASH:
        return EK_DIV;
    default:
        fprintf(stderr, "internal error in makeBinaryExprKind (tokenKind = %d)",
            tokenKind);
        finalizeExit(1);
        return 0;
    }
}

Updating the Parser

In the parser the implementation of tokenKindPrec() and makeBinaryExprKind() now simply gets included:

/*
 *  static int tokenKindPrec(enum TokenKind kind);
 *  
 *  Returns 0 if kind is not a left associative binary operator.
 *  Otherwise returns a precedence > 0
 */
#include "gen_tokenkindprec.c"

/*
 *  enum ExprKind makeBinaryExprKind(enum TokenKind kind);
 *
 *  For left associative binary operators translates 'enum TokenKind' into
 *  'enum ExprKind'
 */
#include "gen_makebinaryexprkind.c"

const struct Expr *
parseLeftAssocBinaryExpr(int prec)
{
    /* ... as before ... */
}

Updating the Lexer

Function getToken(void) has the following structure:

enum TokenKind
getToken(void)
{
    /* ... */

    if (ch == EOF) {
        return token.kind = EOI;
    } else if (isDecDigit(ch)) {
        // parse literal
        /* ... */
    // parsing punctuators
    } else if (ch == '&') {
        /* ... */
    // parsing keywords and identifiers
    } else if (isLetter(ch)) {
        do {
            appendCharToStr(&token.val, ch);
            nextCh();
        } while (isLetter(ch) || isDecDigit(ch));
        return token.kind = checkForKeyword(token.val.cstr);
    }

    nextCh();
    return token.kind = BAD_TOKEN;
}

The part for parsing the punctuators can simply be included. For detecting keywords, function checkForKeyword() is no longer needed. Instead, first the code in gen_parsekeyword.c is used to detect keywords. If a keyword is found, function getToken() returns. Hence, if the code afterwards is reached, there is an identifier in the input stream. With a while loop the remaining part of the identifier gets collected:

enum TokenKind
getToken(void)
{
    /* ... */

    if (ch == EOF) {
        return token.kind = EOI;
    } else if (isDecDigit(ch)) {
        // parse unsigned integer literal
        /* ... */
    // parsing punctuators
        #include "gen_parsepunctuator.c"
    // parsing keywords and identifiers
    } else if (isLetter(ch)) {
        // First detected keywords ...
        #include "gen_parsekeyword.c"
        // ... if there was no keyword detected it is an identifier
        while (isLetter(ch) || isDecDigit(ch)) {
            appendCharToStr(&token.val, ch);
            nextCh();
        }
        return token.kind = IDENTIFIER;
    }

    nextCh();
    return token.kind = BAD_TOKEN;
}

Here the complete updated implementation of lexer.c:

#include <stdbool.h>
#include <stdio.h>

#include "finalize.h"
#include "lexer.h"
#include "ustr.h"

// current token: val holds the lexeme text, kind/pos are set by getToken()
struct Token token;

// registered via finalizeRegister() on the first getToken() call;
// releases the heap buffer behind token.val at program finalization
static void
cleanup(void)
{
    releaseStr(&token.val);
}

//------------------------------------------------------------------------------

// position of current character ch; starts at line 1, column 0 (nextCh()
// increments the column before each read)
static struct TokenPos curr = {
    1,
    0,
};

// lookahead character: 0 before the first read, EOF at end of input
static int ch;

/*
 * Reads the next character from stdin into the lookahead ch and keeps the
 * source position in curr up to date.
 */
static int
nextCh(void)
{
    ++curr.col;
    if ((ch = getchar()) == '\n') {
        ++curr.line;
        curr.col = 0;
    }
    return ch;
}

/* Blanks and tabs separate tokens and are otherwise skipped. */
static bool
isWhiteSpace(int ch)
{
    return ch == '\t' || ch == ' ';
}

/* True for the decimal digits '0'..'9'. */
static bool
isDecDigit(int ch)
{
    return '0' <= ch && ch <= '9';
}

/* True for the octal digits '0'..'7'. */
static bool
isOctDigit(int ch)
{
    return '0' <= ch && ch <= '7';
}

/* True for the hexadecimal digits '0'..'9', 'a'..'f' and 'A'..'F'. */
static bool
isHexDigit(int ch)
{
    return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
           (ch >= 'A' && ch <= 'F');
}

/* True for ASCII letters and underscore (identifier start characters). */
static bool
isLetter(int ch)
{
    return ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}

/*
 * Reads the next token from stdin. Sets token.kind, token.pos and token.val
 * (the lexeme text) and returns token.kind. The punctuator and keyword
 * recognition code is textually included from the generated files; the
 * punctuator fragment begins with "} else if", continuing the if/else
 * ladder below.
 */
enum TokenKind
getToken(void)
{
    // register the token.val cleanup handler exactly once
    static bool first = true;
    if (first) {
        first = false;
        finalizeRegister(cleanup);
    }

    // init ch, skip white spaces and newlines
    while (ch == 0 || isWhiteSpace(ch) || ch == '\n') {
        nextCh();
    }

    // the token starts at the position of the current lookahead character
    token.pos.line = curr.line;
    token.pos.col = curr.col;

    clearStr(&token.val);

    if (ch == EOF) {
        return token.kind = EOI;
    } else if (isDecDigit(ch)) {
        // parse literal
        if (ch == '0') {
            // leading '0': either "0x..." hex or an octal literal
            appendCharToStr(&token.val, ch);
            nextCh();
            if (ch == 'x') {
                appendCharToStr(&token.val, ch);
                nextCh();
                if (isHexDigit(ch)) {
                    while (isHexDigit(ch)) {
                        appendCharToStr(&token.val, ch);
                        nextCh();
                    }
                    return token.kind = HEX_LITERAL;
                }
                // "0x" without a hex digit is malformed
                return token.kind = BAD_TOKEN;
            }
            // a bare '0' also ends up here as an octal literal
            while (isOctDigit(ch)) {
                appendCharToStr(&token.val, ch);
                nextCh();
            }
            return token.kind = OCT_LITERAL;
        } else if (isDecDigit(ch)) {
            while (isDecDigit(ch)) {
                appendCharToStr(&token.val, ch);
                nextCh();
            }
            return token.kind = DEC_LITERAL;
        }
    // parsing punctuators (generated fragment continues the else-if chain)
        #include "gen_parsepunctuator.c"
    // parsing keywords and identifiers
    } else if (isLetter(ch)) {
        // First detected keywords ...
        #include "gen_parsekeyword.c"
        // ... if there was no keyword detected it is an identifier
        while (isLetter(ch) || isDecDigit(ch)) {
            appendCharToStr(&token.val, ch);
            nextCh();
        }
        return token.kind = IDENTIFIER;
    }

    // anything else is a single bad character; consume it and report
    nextCh();
    return token.kind = BAD_TOKEN;
}