1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
/*
   Copyright (C) 2009-2019 Andreas Franz Borchert
                 2019 Michael Lehn
   ----------------------------------------------------------------------------
   Astl-ULMcalc is free software; you can redistribute it
   and/or modify it under the terms of the GNU Library General Public
   License as published by the Free Software Foundation; either version
   2 of the License, or (at your option) any later version.

   Astl-ULMcalc is distributed in the hope that it will be
   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/

#include <cassert>
#include <cwchar>
#include <locale>
#include <memory>
#include <astl/syntax-tree.hpp>
#include <astl/token.hpp>
#include <astl/utf8.hpp>
#include "error.hpp"
#include "keywords.hpp"
#include "location.hpp"
#include "scanner.hpp"

using namespace Astl;

namespace AstlULMcalc {

// private functions =========================================================

/* tell whether ch is one of the ASCII decimal digits '0' .. '9' */
bool is_digit(char ch) {
   if (ch < '0') {
      return false;
   }
   return ch <= '9';
}

/* tell whether ch is skipped between tokens; only the blank qualifies,
   every other character (including tab and newline) does not */
bool is_whitespace(char ch) {
   const char blank = ' ';
   return blank == ch;
}

// constructor ===============================================================

/* Set up a scanner that reads from `in`; `input_name` is copied into the
   scanner and a pointer to that copy is handed to the position object.
   The first input character is fetched right away so that `ch` is valid
   before the first call of get_token(). */
Scanner::Scanner(std::istream& in, const std::string& input_name) :
      in(in),
      input_name(input_name),
      ch(0),
      eof(false),
      lasttoken(0),
      tokenstr(nullptr) {
   /* pass the member copy, not the constructor argument */
   pos.initialize(&(this->input_name));
   nextch();
}

// mutator ===================================================================

/* Scan the next token and deliver it to the parser.

   yylval is reset to a null node pointer and receives a newly created
   node only for a DECIMAL_CONSTANT; yylloc receives the location
   recorded in tokenloc. The returned value is the token code; it is 0
   when the end of the line or the end of the input has been reached,
   and also after an invalid character has been reported via error(). */
int Scanner::get_token(semantic_type& yylval, location& yylloc) {
   yylval = NodePtr(nullptr);

   int token = 0;
   while (is_whitespace(ch)) {
      nextch();
   }
   /* the location of a token must not include the whitespace in front
      of it, hence the begin is recorded after skipping; oldpos is the
      position nextch() saved before it advanced pos for ch */
   tokenloc.begin = oldpos;
   if (is_digit(ch)) {
      /* as long as tokenstr is set, nextch() collects the consumed
         characters in it */
      tokenstr = std::make_unique<std::string>();
      if (ch == '0') {
         /* a leading '0' forms a constant on its own; no other
            representation than decimal is supported in this example */
         nextch();
      } else {
         do {
            nextch();
         } while (is_digit(ch));
      }
      token = parser::token::DECIMAL_CONSTANT;
      /* moving tokenstr away stops the collection of characters */
      yylval = std::make_shared<Node>(make_loc(tokenloc),
         Token(token, std::move(tokenstr)));
   } else if (ch == '+') {
      nextch(); token = parser::token::PLUS;
   } else if (ch == '-') {
      nextch(); token = parser::token::MINUS;
   } else if (eof || ch == '\n') {
      /* end of line or end of input: token remains 0; checking eof
         keeps input without a trailing newline from being reported
         as an invalid token */
   } else {
      error("invalid token");
   }

   yylloc = tokenloc;
   lasttoken = token;
   return token;
}

// private methods ===========================================================

/* Advance the scanner by one input character.

   The position bookkeeping (tokenloc.end, oldpos) is done on every
   call, even after the end of the input has been seen. While a token
   string is being collected, the character that is about to be replaced
   is appended to it. Once the input is exhausted, eof is set and ch
   becomes 0; otherwise pos gets updated for the character just read. */
void Scanner::nextch() {
   tokenloc.end = oldpos;
   oldpos = pos;

   if (!eof) {
      if (tokenstr) {
         *tokenstr += ch;
      }
      char next;
      if (in.get(next)) {
         ch = next;
         switch (ch) {
            case '\n':
               pos.lines();
               break;
            case '\t':
               /* tab stops are assumed to be 8 columns apart */
               pos.columns(8 - (pos.column-1) % 8);
               break;
            default:
               pos.columns();
               break;
         }
         return;
      }
      /* reading failed: from now on every call takes the eof path */
      eof = true;
   }
   ch = 0;
}

/* Make sure that tokenstr is encoded in UTF-8, converting it in place
   if necessary. This is required as the Astl printing engine assumes
   that all tokens are encoded in UTF-8 (an exception is thrown if this
   assumption fails). Nothing is done if no token string is present. */
void Scanner::convert_to_utf8() {
   if (!tokenstr) {
      return;
   }
   std::string& text = *tokenstr;

   /* step 1: text that decodes as valid UTF-8 is accepted unchanged,
      independent of the locale of the input stream */
   bool is_utf8 = true;
   try {
      #pragma GCC diagnostic push
      #pragma GCC diagnostic ignored "-Wunused-variable"
      for (auto decoded: codepoint_range(text)) {
         /* decoding is done for validation only */
      }
      #pragma GCC diagnostic pop
   } catch (utf8_error&) {
      is_utf8 = false;
   }
   if (is_utf8) {
      return;
   }

   /* step 2: attempt a conversion into char32_t characters using the
      corresponding facet of the locale of the input stream */
   using utf32_facet = std::codecvt<char32_t, char, std::mbstate_t>;
   auto input_locale = in.getloc();
   if (std::has_facet<utf32_facet>(input_locale)) {
      auto& converter = std::use_facet<utf32_facet>(input_locale);
      /* one char32_t per input byte is always sufficient */
      std::u32string wide(text.size(), U'\0');
      std::mbstate_t state{};
      const char* src_next = nullptr;
      char32_t* dst_next = nullptr;
      auto status = converter.in(state,
         &text[0], &text[text.size()], src_next,
         &wide[0], &wide[wide.size()], dst_next);
      if (status == std::codecvt_base::ok) {
         assert(dst_next >= &wide[0]);
         wide.resize(dst_next - &wide[0]);
         /* re-encode the converted characters as UTF-8 in tokenstr */
         text.clear();
         for (auto codepoint: wide) {
            add_codepoint(text, codepoint);
         }
         return;
      }
   }

   /* step 3: as a last resort ISO-8859-1 is assumed, where each byte
      value is the codepoint of the character it represents */
   std::string raw(text);
   text.clear();
   for (auto byte: raw) {
      char32_t codepoint = static_cast<unsigned char>(byte);
      add_codepoint(text, codepoint);
   }
}

/* report msg through yyerror together with the location of the
   token that is currently being scanned */
void Scanner::error(char const* msg) {
   auto* where = &tokenloc;
   yyerror(where, msg);
}

} // namespace AstlULMcalc