1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#include <stdbool.h>
#include <stdio.h>

#include "lexer.h"

// Attributes of the most recently scanned token.
//
// token_line and token_col are assigned by getToken() from the position of
// the token's first character. token_kind is defined here but is not
// assigned anywhere in the part of this file shown here.
// NOTE(review): presumably the caller stores getToken()'s return value in
// token_kind -- confirm against lexer.h and the callers.
int token_kind;
size_t token_line;
size_t token_col;

//------------------------------------------------------------------------------

// Position of the current character ch. curr_line starts at 1. curr_col
// starts at 0 and is incremented by nextCh() before each read, so the first
// character of a line has column 1. After a newline has been read, curr_line
// has already been advanced and curr_col is 0.
static size_t curr_line = 1;
static size_t curr_col;

// Most recently read character (or EOF). The initial value 0 is used by
// getToken() as a "nothing read yet" marker.
static int ch;

// Read the next character from stdin into ch and update the current
// position: a newline advances curr_line and resets curr_col to 0, any
// other result of getchar() (including EOF) advances curr_col by one.
// Returns the new value of ch.
static int
nextCh(void)
{
    ch = getchar();
    if (ch != '\n') {
        ++curr_col;
    } else {
        ++curr_line;
        curr_col = 0;
    }
    return ch;
}

// True for a blank or a horizontal tab. Newlines are deliberately not
// covered here; getToken() tests for '\n' separately.
static bool
isWhiteSpace(int ch)
{
    switch (ch) {
        case ' ':
        case '\t':
            return true;
        default:
            return false;
    }
}

// True if ch is one of the decimal digits '0' .. '9'.
static bool
isDecDigit(int ch)
{
    if (ch < '0') {
        return false;
    }
    return ch <= '9';
}

// True if ch is one of the octal digits '0' .. '7'.
static bool
isOctDigit(int ch)
{
    return !(ch < '0' || ch > '7');
}

// True if ch is a hexadecimal digit: '0' .. '9', 'a' .. 'f' or 'A' .. 'F'.
static bool
isHexDigit(int ch)
{
    if (isDecDigit(ch)) {
        return true;
    }
    if (ch >= 'a' && ch <= 'f') {
        return true;
    }
    return ch >= 'A' && ch <= 'F';
}

// True if ch may start an identifier: 'a' .. 'z', 'A' .. 'Z' or '_'.
static bool
isLetter(int ch)
{
    const bool lower = ch >= 'a' && ch <= 'z';
    const bool upper = ch >= 'A' && ch <= 'Z';

    return lower || upper || ch == '_';
}

// Scan the next token from stdin and return its kind.
//
// Blanks, tabs and newlines in front of the token are skipped. token_line
// and token_col are set to the position of the token's first character (or
// of the EOF position). On return, ch holds the first character that is not
// part of the returned token.
//
// Return values:
//    0  EOI          end of input; ch stays EOF, so further calls return 0
//    1  BAD_TOKEN    "0x" not followed by a hex digit (the offending
//                    character is not consumed), or any character that
//                    starts no token (that character is consumed)
//    2  HEX_LITERAL  "0x" followed by one or more hex digits
//    3  OCT_LITERAL  '0' followed by zero or more octal digits
//    4  DEC_LITERAL  a digit '1' .. '9' followed by zero or more digits
//    5  PLUS         6  MINUS      7  ASTERISK    8  SLASH    9  PERCENT
//   10  EQUAL       11  LPAREN    12  RPAREN     13  SEMICOLON
//   14  IDENTIFIER   a letter or '_' followed by letters, '_' or digits
//
// Only the kind is reported; neither the literal's value nor the
// identifier's spelling is recorded. This function does not assign
// token_kind.
int
getToken(void)
{
    // ch == 0 means that nothing has been read yet (ch is a zero-initialized
    // static), so the first call primes ch here. As a side effect NUL bytes
    // in the input are skipped like white space.
    while (ch == 0 || isWhiteSpace(ch) || ch == '\n') {
        nextCh();
    }

    token_line = curr_line;
    token_col = curr_col;

    if (ch == EOF) {
        return 0; // EOI
    }

    if (ch == '0') {
        nextCh();
        if (ch == 'x') {
            nextCh();
            if (!isHexDigit(ch)) {
                return 1; // BAD_TOKEN
            }
            do {
                nextCh();
            } while (isHexDigit(ch));
            return 2; // HEX_LITERAL
        }
        // A lone '0' is an octal literal as well. A following '8' or '9' is
        // not consumed here and starts the next token.
        while (isOctDigit(ch)) {
            nextCh();
        }
        return 3; // OCT_LITERAL
    }

    if (isDecDigit(ch)) {
        do {
            nextCh();
        } while (isDecDigit(ch));
        return 4; // DEC_LITERAL
    }

    if (isLetter(ch)) {
        do {
            nextCh();
        } while (isLetter(ch) || isDecDigit(ch));
        return 14; // IDENTIFIER
    }

    // Everything left is a single-character token; it is consumed in any
    // case, also when it is not recognized.
    int kind;
    switch (ch) {
        case '+':
            kind = 5; // PLUS
            break;
        case '-':
            kind = 6; // MINUS
            break;
        case '*':
            kind = 7; // ASTERISK
            break;
        case '/':
            kind = 8; // SLASH
            break;
        case '%':
            kind = 9; // PERCENT
            break;
        case '=':
            kind = 10; // EQUAL
            break;
        case '(':
            kind = 11; // LPAREN
            break;
        case ')':
            kind = 12; // RPAREN
            break;
        case ';':
            kind = 13; // SEMICOLON
            break;
        default:
            kind = 1; // BAD_TOKEN
            break;
    }
    nextCh();
    return kind;
}