CPW Part 3: String Class for the Lexer

Quiz16: ABC Compiler Project

Improve the string class implementation in str.c so that we can support tokens with arbitrarily many characters.

1
submit hpc quiz16 str.h str.c

Here are my source files used in the video:

theon$ git checkout tags/quiz16
HEAD is now at 29737ce Quiz 16 material
theon$ 
#include <stdbool.h>
#include <stdio.h>

#include "lexer.h"

// the token most recently scanned; written by getToken()
struct Token token;

//------------------------------------------------------------------------------

// position of current character ch
static struct TokenPos curr = {
    1, // line: counting starts at 1
    0, // col: nextCh() increments it before each character is read
};

// current character (look ahead); initially 0, which getToken() takes as
// "no character has been read yet"
static int ch;

// Read the next character from standard input into ch and update curr so
// that it describes the position of that character. A newline starts a new
// line and resets the column to 0; any other character (including EOF)
// advances the column by one.
//
// Returns the character read (EOF at end of input).
static int
nextCh(void)
{
    ch = getchar();
    if (ch == '\n') {
        ++curr.line;
        curr.col = 0;
    } else {
        ++curr.col;
    }
    return ch;
}

// Returns true if ch is a blank or a horizontal tab. Newlines are not
// considered white space here.
static bool
isWhiteSpace(int ch)
{
    switch (ch) {
        case ' ':
        case '\t':
            return true;
        default:
            return false;
    }
}

// Returns true if ch is one of the decimal digits '0' to '9'.
static bool
isDecDigit(int ch)
{
    if (ch < '0') {
        return false;
    }
    return ch <= '9';
}

// Returns true if ch is one of the octal digits '0' to '7'.
static bool
isOctDigit(int ch)
{
    if (ch > '7') {
        return false;
    }
    return ch >= '0';
}

// Returns true if ch is a hexadecimal digit: '0' to '9', 'a' to 'f' or
// 'A' to 'F'.
static bool
isHexDigit(int ch)
{
    if (isDecDigit(ch)) {
        return true;
    }
    if ('a' <= ch && ch <= 'f') {
        return true;
    }
    return 'A' <= ch && ch <= 'F';
}

// Returns true if ch may start an identifier: a lower or upper case letter
// 'a'..'z' / 'A'..'Z' or an underscore.
static bool
isLetter(int ch)
{
    if (ch == '_') {
        return true;
    }
    const bool lowerCase = 'a' <= ch && ch <= 'z';
    const bool upperCase = 'A' <= ch && ch <= 'Z';
    return lowerCase || upperCase;
}

enum TokenKind
getToken(void)
{
    unsigned long long val = 0;

    // init ch, skip white spaces and newlines
    while (ch == 0 || isWhiteSpace(ch) || ch == '\n') {
        nextCh();
    }

    token.pos.line = curr.line;
    token.pos.col = curr.col;

    clearStr(&token.val);

    if (ch == EOF) {
        return 0; // EOI
    } else if (isDecDigit(ch)) {
        // parse literal
        if (ch == '0') {
            appendCharToStr(&token.val, ch);
            nextCh();
            if (ch == 'x') {
                appendCharToStr(&token.val, ch);
                nextCh();
                if (isHexDigit(ch)) {
                    while (isHexDigit(ch)) {
                        appendCharToStr(&token.val, ch);
                        nextCh();
                    }
                    return token.kind = HEX_LITERAL;
                }
                return token.kind = BAD_TOKEN;
            }
            while (isOctDigit(ch)) {
                appendCharToStr(&token.val, ch);
                nextCh();
            }
            return token.kind = OCT_LITERAL;
        } else if (isDecDigit(ch)) {
            while (isDecDigit(ch)) {
                appendCharToStr(&token.val, ch);
                nextCh();
            }
            return token.kind = DEC_LITERAL;
        }
    } else if (ch == '+') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = PLUS;
    } else if (ch == '-') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = MINUS;
    } else if (ch == '*') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = ASTERISK;
    } else if (ch == '/') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = SLASH;
    } else if (ch == '%') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = PERCENT;
    } else if (ch == '=') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = EQUAL;
    } else if (ch == '(') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = LPAREN;
    } else if (ch == ')') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = RPAREN;
    } else if (ch == ';') {
        appendCharToStr(&token.val, ch);
        nextCh();
        return token.kind = SEMICOLON;
    } else if (isLetter(ch)) {
        do {
            appendCharToStr(&token.val, ch);
            nextCh();
        } while (isLetter(ch) || isDecDigit(ch));
        return token.kind = IDENTIFIER;
    }

    nextCh();
    return token.kind = BAD_TOKEN;
}
#ifndef ABC_LEXER_H
#define ABC_LEXER_H

#include <stddef.h>

#include "str.h"
#include "tokenkind.h"

// Scan the next token from standard input and store it in the global
// variable token. Returns the kind of the scanned token; EOI at end of input.
enum TokenKind getToken(void);

// A token as produced by getToken().
struct Token
{
    // kind of the token
    enum TokenKind kind;
    // position of the first character of the token; line and column numbers
    // both start at 1
    struct TokenPos
    {
        size_t line, col;
    } pos;
    // characters collected by the lexer for this token (val.cstr is a
    // 0-terminated string)
    struct Str val;
};

// the token most recently scanned by getToken(); defined in lexer.c
extern struct Token token;

#endif // ABC_LEXER_H
#
# patch: If the user has not defined CC and make's built-in default value does
# not name a runnable compiler, use gcc instead. The default is accepted if
# running `$(CC) -v` succeeds.
#
ifeq ($(origin CC),default)
    cc_check := $(shell $(CC) -v > /dev/null 2>&1 && echo "sane")
    ifneq ($(strip $(cc_check)),sane)
        CC := gcc
    endif
endif

#
# Define list of source files, object files, targets, etc
#

# all source files: every *.c file in the current directory
src :=\
    $(wildcard *.c)

# all object files: one *.o file per source file
obj :=\
    $(patsubst %.c,%.o,\
        $(src))

# all targets: one executable for each source file named xtest*.c
target :=\
    $(filter xtest%,\
        $(patsubst %.c,%,\
            $(src)))

# objects that are required by the targets: all object files that do not
# belong to an xtest* source file
lib.o :=\
    $(filter-out xtest%,\
        $(obj))

# dependency files that will be generated by the compiler: <source file>.d,
# e.g. lexer.c.d for lexer.c
deps :=\
    $(patsubst %,%.d,\
        $(src))

# dependency file leftovers of gone source files: existing *.c.d files without
# a matching source file
obsolete.deps:=\
    $(filter-out $(deps),\
        $(wildcard *.c.d))

#
# Build rules
#
# default goal (first rule of this makefile): build all targets and all object
# files
.PHONY: all
all: $(target) $(obj)

# rule for removing obsolete dependency files; the files are declared phony so
# that the recipe runs although they exist
.PHONY: $(obsolete.deps)
$(obsolete.deps) :
        $(RM) $(obsolete.deps)

# delete implicit rule for building an executable directly from its source file
# (a pattern rule without recipe cancels the built-in rule)
% : %.c

# our rule: to build a target, link its object file against the library object
# files only. The objects of other xtest* programs must not be linked in, as
# each of them defines its own main(). Obsolete dependency files are removed
# first (order-only prerequisite).
%: %.o $(lib.o) | $(obsolete.deps)
	$(CC) -o $@ $(LDFLAGS) $^

# our rule to build objects: also generate a dependency file
# (-MMD -MF $<.d writes the dependencies of the object file to <source>.d,
# -MT $@ names the object file as target in that file, -MP adds a phony target
# for each header). Obsolete dependency files are removed first (order-only
# prerequisite).
%.o: %.c | $(obsolete.deps)
        $(CC) -c $(CPPFLAGS) $(CFLAGS) -MT $@ -MMD -MP -MF $<.d $<

# remove everything that was generated: executables, object files and
# dependency files
.PHONY: clean
clean:
        $(RM) $(target) $(obj) $(deps) $(obsolete.deps)

#
# Include dependencies (if already generated); because of the leading '-' a
# dependency file that does not exist yet is not an error
#
-include $(deps)
ABC: ABC is A Bloody Compiler
=============================

This is our compiler project

## How to use

Use `make` to build all object files and the `xtest*` test programs.
#ifndef ABC_STR_H
#define ABC_STR_H

// String with a fixed capacity; used for the text of a token.
//
// cstr is kept 0-terminated by clearStr() and appendCharToStr(). As cstr has
// 7 bytes it can hold at most 6 characters plus the terminating 0 byte.
struct Str
{
    char cstr[7]; // the characters of the string, followed by a 0 byte
    char *end;    // points to the terminating 0 byte in cstr
};

// set str->cstr to empty string (and str->end to its begin); has to be called
// before the first call of appendCharToStr() for str, as it initializes
// str->end
void clearStr(struct Str *str);

// append character to str->cstr; if the character and the terminating 0 byte
// do not fit into str->cstr an error message is printed to stderr and the
// program is terminated with exit(1)
void appendCharToStr(struct Str *str, char c);

#endif // ABC_STR_H
#include <stdlib.h>
#include <stdio.h>

#include "str.h"

// Make str the empty string: str->end is reset to the begin of str->cstr and
// the terminating 0 byte is stored there.
void
clearStr(struct Str *str)
{
    str->end = str->cstr;
    *str->end = 0;
}

// Append character c to str and keep str->cstr 0-terminated.
//
// str->end has to point to the terminating 0 byte of str->cstr (clearStr()
// establishes this). If c and the terminating 0 byte do not fit into
// str->cstr an error message is printed to stderr and the program is
// terminated with exit status 1.
void
appendCharToStr(struct Str *str, char c)
{
    // number of characters currently stored (without the 0 byte)
    size_t len = (size_t)(str->end - str->cstr);

    // room is needed for the new character and the 0 byte
    if (len + 2 > sizeof(str->cstr)) {
        fprintf(stderr, "error in appendCharToStr: string too long\n");
        exit(1);
    }

    str->cstr[len] = c;
    str->cstr[len + 1] = 0;
    str->end = str->cstr + len + 1;
}
1
2
3
4
a = 5;
b = 42;
c = (a + b) *2;
123 0123 0xaB12 abc +-/*%^()
#ifndef ABC_TOKENKIND_H
#define ABC_TOKENKIND_H

// Kinds of tokens the lexer distinguishes. EOI is the first enumerator and
// therefore has the value 0.
enum TokenKind
{
    EOI,            // end of input
    BAD_TOKEN,      // input that does not form a valid token
    HEX_LITERAL,    // "0x" followed by hexadecimal digits
    OCT_LITERAL,    // '0' followed by octal digits (includes a plain "0")
    DEC_LITERAL,    // decimal digits, not starting with '0'
    PLUS,           // '+'
    MINUS,          // '-'
    ASTERISK,       // '*'
    SLASH,          // '/'
    PERCENT,        // '%'
    EQUAL,          // '='
    LPAREN,         // '('
    RPAREN,         // ')'
    SEMICOLON,      // ';'
    IDENTIFIER,     // letter or '_' followed by letters, '_' or digits
};

// Returns the name of tokenKind as string, e.g. "PLUS" for PLUS. For a value
// that is not a token kind an error message is printed to stderr and the
// program is terminated with exit(1).
const char *strTokenKind(enum TokenKind tokenKind);

#endif // ABC_TOKENKIND_H
#include <stdio.h>
#include <stdlib.h>

#include "tokenkind.h"

const char *
strTokenKind(enum TokenKind tokenKind)
{
    switch (tokenKind) {
        case EOI:
            return "EOI";
        case BAD_TOKEN:
            return "BAD_TOKEN";
        case HEX_LITERAL:
            return "HEX_LITERAL";
        case OCT_LITERAL:
            return "OCT_LITERAL";
        case DEC_LITERAL:
            return "DEC_LITERAL";
        case PLUS:
            return "PLUS";
        case MINUS:
            return "MINUS";
        case ASTERISK:
            return "ASTERISK";
        case SLASH:
            return "SLASH";
        case PERCENT:
            return "PERCENT";
        case EQUAL:
            return "EQUAL";
        case LPAREN:
            return "LPAREN";
        case RPAREN:
            return "RPAREN";
        case SEMICOLON:
            return "SEMICOLON";
        case IDENTIFIER:
            return "IDENTIFIER";
        default:
            fprintf(stderr, "internal error in strTokenKind: tokenKind = %d\n",
                    tokenKind);
            exit(1);
            return "";
    }
}
#include <stdio.h>

#include "lexer.h"

// Test program for the lexer: reads tokens from standard input until the end
// of input is reached and prints one line per token in the form
// "<line>.<col>: <kind> '<text>'".
int
main(void)
{
    for (;;) {
        if (getToken() == EOI) {
            break;
        }
        printf("%zu.%zu: %s '%s'\n", token.pos.line, token.pos.col,
               strTokenKind(token.kind), token.val.cstr);
    }
    return 0;
}