1 //===- Lexer.h - Lexer for the Toy language -------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a simple Lexer for the Toy language. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef MLIR_TUTORIAL_TOY_LEXER_H_ 14 #define MLIR_TUTORIAL_TOY_LEXER_H_ 15 16 #include "llvm/ADT/StringRef.h" 17 18 #include <memory> 19 #include <string> 20 21 namespace toy { 22 23 /// Structure definition a location in a file. 24 struct Location { 25 std::shared_ptr<std::string> file; ///< filename. 26 int line; ///< line number. 27 int col; ///< column number. 28 }; 29 30 // List of Token returned by the lexer. 31 enum Token : int { 32 tok_semicolon = ';', 33 tok_parenthese_open = '(', 34 tok_parenthese_close = ')', 35 tok_bracket_open = '{', 36 tok_bracket_close = '}', 37 tok_sbracket_open = '[', 38 tok_sbracket_close = ']', 39 40 tok_eof = -1, 41 42 // commands 43 tok_return = -2, 44 tok_var = -3, 45 tok_def = -4, 46 47 // primary 48 tok_identifier = -5, 49 tok_number = -6, 50 }; 51 52 /// The Lexer is an abstract base class providing all the facilities that the 53 /// Parser expects. It goes through the stream one token at a time and keeps 54 /// track of the location in the file for debugging purposes. 55 /// It relies on a subclass to provide a `readNextLine()` method. The subclass 56 /// can proceed by reading the next line from the standard input or from a 57 /// memory mapped file. 58 class Lexer { 59 public: 60 /// Create a lexer for the given filename. The filename is kept only for 61 /// debugging purposes (attaching a location to a Token). Lexer(std::string filename)62 Lexer(std::string filename) 63 : lastLocation( 64 {std::make_shared<std::string>(std::move(filename)), 0, 0}) {} 65 virtual ~Lexer() = default; 66 67 /// Look at the current token in the stream. getCurToken()68 Token getCurToken() { return curTok; } 69 70 /// Move to the next token in the stream and return it. getNextToken()71 Token getNextToken() { return curTok = getTok(); } 72 73 /// Move to the next token in the stream, asserting on the current token 74 /// matching the expectation. consume(Token tok)75 void consume(Token tok) { 76 assert(tok == curTok && "consume Token mismatch expectation"); 77 getNextToken(); 78 } 79 80 /// Return the current identifier (prereq: getCurToken() == tok_identifier) getId()81 llvm::StringRef getId() { 82 assert(curTok == tok_identifier); 83 return identifierStr; 84 } 85 86 /// Return the current number (prereq: getCurToken() == tok_number) getValue()87 double getValue() { 88 assert(curTok == tok_number); 89 return numVal; 90 } 91 92 /// Return the location for the beginning of the current token. getLastLocation()93 Location getLastLocation() { return lastLocation; } 94 95 // Return the current line in the file. getLine()96 int getLine() { return curLineNum; } 97 98 // Return the current column in the file. getCol()99 int getCol() { return curCol; } 100 101 private: 102 /// Delegate to a derived class fetching the next line. Returns an empty 103 /// string to signal end of file (EOF). Lines are expected to always finish 104 /// with "\n" 105 virtual llvm::StringRef readNextLine() = 0; 106 107 /// Return the next character from the stream. This manages the buffer for the 108 /// current line and request the next line buffer to the derived class as 109 /// needed. getNextChar()110 int getNextChar() { 111 // The current line buffer should not be empty unless it is the end of file. 112 if (curLineBuffer.empty()) 113 return EOF; 114 ++curCol; 115 auto nextchar = curLineBuffer.front(); 116 curLineBuffer = curLineBuffer.drop_front(); 117 if (curLineBuffer.empty()) 118 curLineBuffer = readNextLine(); 119 if (nextchar == '\n') { 120 ++curLineNum; 121 curCol = 0; 122 } 123 return nextchar; 124 } 125 126 /// Return the next token from standard input. getTok()127 Token getTok() { 128 // Skip any whitespace. 129 while (isspace(lastChar)) 130 lastChar = Token(getNextChar()); 131 132 // Save the current location before reading the token characters. 133 lastLocation.line = curLineNum; 134 lastLocation.col = curCol; 135 136 // Identifier: [a-zA-Z][a-zA-Z0-9_]* 137 if (isalpha(lastChar)) { 138 identifierStr = (char)lastChar; 139 while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') 140 identifierStr += (char)lastChar; 141 142 if (identifierStr == "return") 143 return tok_return; 144 if (identifierStr == "def") 145 return tok_def; 146 if (identifierStr == "var") 147 return tok_var; 148 return tok_identifier; 149 } 150 151 // Number: [0-9.]+ 152 if (isdigit(lastChar) || lastChar == '.') { 153 std::string numStr; 154 do { 155 numStr += lastChar; 156 lastChar = Token(getNextChar()); 157 } while (isdigit(lastChar) || lastChar == '.'); 158 159 numVal = strtod(numStr.c_str(), nullptr); 160 return tok_number; 161 } 162 163 if (lastChar == '#') { 164 // Comment until end of line. 165 do { 166 lastChar = Token(getNextChar()); 167 } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); 168 169 if (lastChar != EOF) 170 return getTok(); 171 } 172 173 // Check for end of file. Don't eat the EOF. 174 if (lastChar == EOF) 175 return tok_eof; 176 177 // Otherwise, just return the character as its ascii value. 178 Token thisChar = Token(lastChar); 179 lastChar = Token(getNextChar()); 180 return thisChar; 181 } 182 183 /// The last token read from the input. 184 Token curTok = tok_eof; 185 186 /// Location for `curTok`. 187 Location lastLocation; 188 189 /// If the current Token is an identifier, this string contains the value. 190 std::string identifierStr; 191 192 /// If the current Token is a number, this contains the value. 193 double numVal = 0; 194 195 /// The last value returned by getNextChar(). We need to keep it around as we 196 /// always need to read ahead one character to decide when to end a token and 197 /// we can't put it back in the stream after reading from it. 198 Token lastChar = Token(' '); 199 200 /// Keep track of the current line number in the input stream 201 int curLineNum = 0; 202 203 /// Keep track of the current column number in the input stream 204 int curCol = 0; 205 206 /// Buffer supplied by the derived class on calls to `readNextLine()` 207 llvm::StringRef curLineBuffer = "\n"; 208 }; 209 210 /// A lexer implementation operating on a buffer in memory. 211 class LexerBuffer final : public Lexer { 212 public: LexerBuffer(const char * begin,const char * end,std::string filename)213 LexerBuffer(const char *begin, const char *end, std::string filename) 214 : Lexer(std::move(filename)), current(begin), end(end) {} 215 216 private: 217 /// Provide one line at a time to the Lexer, return an empty string when 218 /// reaching the end of the buffer. readNextLine()219 llvm::StringRef readNextLine() override { 220 auto *begin = current; 221 while (current <= end && *current && *current != '\n') 222 ++current; 223 if (current <= end && *current) 224 ++current; 225 llvm::StringRef result{begin, static_cast<size_t>(current - begin)}; 226 return result; 227 } 228 const char *current, *end; 229 }; 230 } // namespace toy 231 232 #endif // MLIR_TUTORIAL_TOY_LEXER_H_ 233