CPW Part 3: String Class for the Lexer
Quiz16: ABC Compiler Project
Improve the string class implementation in str.c so that we can support tokens with arbitrarily many characters.
submit hpc quiz16 str.h str.c
Here are my source files used in the video:
theon$ git checkout tags/quiz16
HEAD is now at 29737ce Quiz 16 material
theon$
#include <stdbool.h>
#include <stdio.h>
#include "lexer.h"
// the token most recently delivered by getToken()
struct Token token;
//------------------------------------------------------------------------------
// position of current character ch
// (initialized to line 1, column 0; nextCh() updates it as characters are
// read)
static struct TokenPos curr = {
1,
0,
};
// current lookahead character; it starts out as 0, which getToken() uses to
// trigger the initial nextCh()
static int ch;
static int
nextCh(void)
{
++curr.col;
ch = getchar();
if (ch == '\n') {
++curr.line;
curr.col = 0;
}
return ch;
}
// true if ch is a blank or a horizontal tab (a newline does not count)
static bool
isWhiteSpace(int ch)
{
    switch (ch) {
        case ' ':
        case '\t':
            return true;
        default:
            return false;
    }
}
// true if ch is one of the decimal digits '0' .. '9'
static bool
isDecDigit(int ch)
{
    const bool outside = ch < '0' || ch > '9';

    return !outside;
}
// true if ch is one of the octal digits '0' .. '7'
static bool
isOctDigit(int ch)
{
    if (ch < '0') {
        return false;
    }
    return ch <= '7';
}
// true if ch is a hexadecimal digit: '0' .. '9', 'a' .. 'f' or 'A' .. 'F'
static bool
isHexDigit(int ch)
{
    if (isDecDigit(ch)) {
        return true;
    }
    if (ch >= 'a' && ch <= 'f') {
        return true;
    }
    return ch >= 'A' && ch <= 'F';
}
// true if ch may start an identifier: 'a' .. 'z', 'A' .. 'Z' or '_'
static bool
isLetter(int ch)
{
    const bool lower = ch >= 'a' && ch <= 'z';
    const bool upper = ch >= 'A' && ch <= 'Z';

    return lower || upper || ch == '_';
}
enum TokenKind
getToken(void)
{
unsigned long long val = 0;
// init ch, skip white spaces and newlines
while (ch == 0 || isWhiteSpace(ch) || ch == '\n') {
nextCh();
}
token.pos.line = curr.line;
token.pos.col = curr.col;
clearStr(&token.val);
if (ch == EOF) {
return 0; // EOI
} else if (isDecDigit(ch)) {
// parse literal
if (ch == '0') {
appendCharToStr(&token.val, ch);
nextCh();
if (ch == 'x') {
appendCharToStr(&token.val, ch);
nextCh();
if (isHexDigit(ch)) {
while (isHexDigit(ch)) {
appendCharToStr(&token.val, ch);
nextCh();
}
return token.kind = HEX_LITERAL;
}
return token.kind = BAD_TOKEN;
}
while (isOctDigit(ch)) {
appendCharToStr(&token.val, ch);
nextCh();
}
return token.kind = OCT_LITERAL;
} else if (isDecDigit(ch)) {
while (isDecDigit(ch)) {
appendCharToStr(&token.val, ch);
nextCh();
}
return token.kind = DEC_LITERAL;
}
} else if (ch == '+') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = PLUS;
} else if (ch == '-') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = MINUS;
} else if (ch == '*') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = ASTERISK;
} else if (ch == '/') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = SLASH;
} else if (ch == '%') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = PERCENT;
} else if (ch == '=') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = EQUAL;
} else if (ch == '(') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = LPAREN;
} else if (ch == ')') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = RPAREN;
} else if (ch == ';') {
appendCharToStr(&token.val, ch);
nextCh();
return token.kind = SEMICOLON;
} else if (isLetter(ch)) {
do {
appendCharToStr(&token.val, ch);
nextCh();
} while (isLetter(ch) || isDecDigit(ch));
return token.kind = IDENTIFIER;
}
nextCh();
return token.kind = BAD_TOKEN;
}
#ifndef ABC_LEXER_H
#define ABC_LEXER_H
#include <stddef.h>
#include "str.h"
#include "tokenkind.h"
enum TokenKind getToken(void);
// A token as delivered by getToken().
struct Token
{
    // what kind of token this is
    enum TokenKind kind;

    // line and column of the token's first character
    struct TokenPos
    {
        size_t line;
        size_t col;
    } pos;

    // the characters collected for the token
    struct Str val;
};
extern struct Token token;
#endif // ABC_LEXER_H
#
# patch: If user has not defined CC and default value does not exist use gcc
#
# $(origin CC) is "default" only while CC still has make's built-in value. In
# that case the compiler is probed by running "$(CC) -v" with all output
# discarded; cc_check becomes "sane" only if that command succeeds. Otherwise
# CC is replaced by gcc.
ifeq ($(origin CC),default)
cc_check := $(shell $(CC) -v > /dev/null 2>&1 && echo "sane")
ifneq ($(strip $(cc_check)),sane)
CC := gcc
endif
endif
#
# Define list of source files, object files, targets, etc
#
# all source files
# (every .c file in the current directory)
src :=\
$(wildcard *.c)
# all object files
# (one .o per source file)
obj :=\
$(patsubst %.c,%.o,\
$(src))
# all targets
# (the executables: sources named xtest*.c with the .c suffix stripped)
target :=\
$(filter xtest%,\
$(patsubst %.c,%,\
$(src)))
# objects that are required by the targets
# (all object files except the xtest*.o ones)
lib.o :=\
$(filter-out xtest%,\
$(obj))
# dependency file that will be generated by compiler
# (named after the source file with .d appended, e.g. foo.c.d)
deps :=\
$(patsubst %,%.d,\
$(src))
# dependency file leftovers of gone source files
# (existing *.c.d files that do not belong to any current source file)
obsolete.deps:=\
$(filter-out $(deps),\
$(wildcard *.c.d))
#
# Build rules
#

.PHONY: all
all: $(target) $(obj)

# rule for removing obsolete dependency files
.PHONY: $(obsolete.deps)
$(obsolete.deps) :
	$(RM) $(obsolete.deps)

# delete implicit rule for building an executable directly from its source file
% : %.c

# our rule: to build target link its object file against library object files
# (only $(lib.o), not $(obj): $(obj) also contains the objects of all other
# xtest programs, which presumably each define their own main())
%: %.o $(lib.o) | $(obsolete.deps)
	$(CC) -o $@ $(LDFLAGS) $^

# our rule to build objects: also generate a dependency file
# (the dependency file is written next to the source as <source>.d)
%.o: %.c | $(obsolete.deps)
	$(CC) -c $(CPPFLAGS) $(CFLAGS) -MT $@ -MMD -MP -MF $<.d $<

.PHONY: clean
clean:
	$(RM) $(target) $(obj) $(deps) $(obsolete.deps)

#
# Include dependencies (if already generated)
#
-include $(deps)
ABC: ABC is A Bloody Compiler
=============================
This is our compiler project
## How to use
Use `make` to ...
#ifndef ABC_STR_H
#define ABC_STR_H
// Fixed-capacity string buffer.
//
// cstr holds the characters followed by a terminating 0 byte; with 7 bytes of
// storage that leaves room for at most 6 characters. end points at the
// terminating 0 byte inside cstr: clearStr() sets it to the start of cstr and
// appendCharToStr() advances it by one per appended character.
struct Str
{
char cstr[7];
char *end;
};
// set str->cstr to empty string
void clearStr(struct Str *str);
// append character to str->cstr
void appendCharToStr(struct Str *str, char c);
#endif // ABC_STR_H
#include <stdlib.h>
#include <stdio.h>
#include "str.h"
// Reset str to the empty string: end points at the start of cstr and the
// buffer holds just the terminating 0 byte.
void
clearStr(struct Str *str)
{
    str->end = str->cstr;
    *str->end = 0;
}
// Append character c to str and keep the buffer 0-terminated.
//
// A Str whose end pointer is still NULL (for example a zero-initialized object
// on which clearStr() was never called) is treated as empty instead of doing
// pointer arithmetic on NULL. If c plus the terminating 0 byte do not fit
// into str->cstr any more, an error message is written to stderr and the
// program exits with status 1.
void
appendCharToStr(struct Str *str, char c)
{
    if (str->end == NULL) {
        str->end = str->cstr;
    }

    // do the capacity check in size_t so that no signed value is compared
    // against sizeof
    size_t len = (size_t)(str->end - str->cstr);

    // check if another character and 0 byte fits into string
    if (len + 2 > sizeof(str->cstr)) {
        fprintf(stderr, "error in appendCharToStr: string too long\n");
        exit(1);
    }
    *str->end++ = c;
    *str->end = 0;
}
a = 5;
b = 42;
c = (a + b) *2;
123 0123 0xaB12 abc +-/*%^()
#ifndef ABC_TOKENKIND_H
#define ABC_TOKENKIND_H
// Kinds of tokens the lexer distinguishes. EOI has the value 0.
enum TokenKind
{
    // end of input
    EOI = 0,
    // input that does not form a valid token
    BAD_TOKEN,
    // "0x" followed by at least one hexadecimal digit
    HEX_LITERAL,
    // '0' followed by any number of octal digits
    OCT_LITERAL,
    // decimal digits, the first one not being '0'
    DEC_LITERAL,
    // '+'
    PLUS,
    // '-'
    MINUS,
    // '*'
    ASTERISK,
    // '/'
    SLASH,
    // '%'
    PERCENT,
    // '='
    EQUAL,
    // '('
    LPAREN,
    // ')'
    RPAREN,
    // ';'
    SEMICOLON,
    // a letter or '_' followed by letters, '_' or decimal digits
    IDENTIFIER,
};
const char *strTokenKind(enum TokenKind tokenKind);
#endif // ABC_TOKENKIND_H
#include <stdio.h>
#include <stdlib.h>
#include "tokenkind.h"
// Return the name of tokenKind as a string constant (e.g. "PLUS" for PLUS).
//
// For a value that is not one of the enumerators an internal error is
// reported on stderr and the program exits with status 1.
const char *
strTokenKind(enum TokenKind tokenKind)
{
    const char *name = NULL;

    switch (tokenKind) {
        case EOI:
            name = "EOI";
            break;
        case BAD_TOKEN:
            name = "BAD_TOKEN";
            break;
        case HEX_LITERAL:
            name = "HEX_LITERAL";
            break;
        case OCT_LITERAL:
            name = "OCT_LITERAL";
            break;
        case DEC_LITERAL:
            name = "DEC_LITERAL";
            break;
        case PLUS:
            name = "PLUS";
            break;
        case MINUS:
            name = "MINUS";
            break;
        case ASTERISK:
            name = "ASTERISK";
            break;
        case SLASH:
            name = "SLASH";
            break;
        case PERCENT:
            name = "PERCENT";
            break;
        case EQUAL:
            name = "EQUAL";
            break;
        case LPAREN:
            name = "LPAREN";
            break;
        case RPAREN:
            name = "RPAREN";
            break;
        case SEMICOLON:
            name = "SEMICOLON";
            break;
        case IDENTIFIER:
            name = "IDENTIFIER";
            break;
        default:
            break;
    }

    // no case matched: tokenKind is not a known enumerator
    if (name == NULL) {
        fprintf(stderr, "internal error in strTokenKind: tokenKind = %d\n",
                tokenKind);
        exit(1);
    }
    return name;
}
#include <stdio.h>
#include "lexer.h"
// Test driver for the lexer: reads tokens until getToken() reports EOI and
// prints one line per token in the form "<line>.<col>: <KIND> '<text>'".
int
main(void)
{
    for (;;) {
        if (getToken() == EOI) {
            break;
        }

        const char *kindName = strTokenKind(token.kind);

        printf("%zu.%zu: %s '%s'\n", token.pos.line, token.pos.col, kindName,
               token.val.cstr);
    }
    return 0;
}