Differences
This shows you the differences between two versions of the page.
| Both sides previous revision Previous revision Next revision | Previous revision | ||
| creating:syntax [2025/03/28 21:49] – Added more lexing code ahelwer | creating:syntax [2025/04/06 23:20] (current) – Deleting page ahelwer | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| - | |||
| - | TLA⁺ is a large, complicated, | ||
| - | This is assisted by [[https:// | ||
| - | This tutorial only uses a minimal subset of TLA⁺ syntax, just enough to handle [[https:// | ||
| - | While that may seem limiting, this tutorial tries to focus on the difficult & interesting parts of parsing the language instead of more mundane drudgework like handling all hundred-some user-definable operator symbols. | ||
| - | You are encouraged to extend this minimal core as you wish; language tooling is best developed incrementally! | ||
| - | Slowly filling in the details of this rough language sketch has a satisfying meditative aspect. | ||
| - | |||
| - | Here is what our minimal language subset includes: | ||
| - | * The '' | ||
| - | * Named parameterized operator definitions like '' | ||
| - | * Single-line comments like '' | ||
| - | * Declaration of '' | ||
| - | * Finite set literals, like '' | ||
| - | * The '' | ||
| - | * Some infix operators: '' | ||
| - | * The variable-priming suffix operator | ||
| - | * Parentheses to control expression grouping | ||
| - | * Vertically-aligned conjunction & disjunction lists | ||
| - | * The '' | ||
| - | * Natural numbers like '' | ||
| - | * Boolean values '' | ||
| - | |||
| - | As outlined above, you are free to add missing features (or even your own invented features!) as you wish. | ||
| - | |||
| - | ==== Preparation ==== | ||
| - | |||
| - | Read part I (the first three chapters) of free online textbook // | ||
| - | We will be closely following the material in this book, modifying it to our uses. | ||
| - | The first two chapters are a nice introduction and overview of language implementation generally. | ||
| - | Chapter three specifies a toy language called Lox, to be used as an object of study for the remainder of the book. | ||
| - | Our minimal TLA⁺ subset has some similarity to Lox with regard to expressions involving integers and booleans, but also differences - you can skip the section on closures (unless you want to implement higher-order operator parameters yourself) and the section on classes. | ||
| - | What's important is that it's similar enough for all the fundamentals to still apply! | ||
| - | |||
| - | Code snippets are shown often in this tutorial. | ||
| - | When a snippet is similar to code given in the book with the exception of a few lines, the differing lines will be highlighted. | ||
| - | Highlighting is also used to identify //new// code being added to a large section of code given previously, with surrounding lines included for context. | ||
| - | Sometimes the code given here is so different from that provided by the book that highlighting is simply ommitted to reduce visual noise. | ||
| - | A parenthetical will specify the meaning of highlighted code when ambiguous. | ||
| - | |||
| - | ==== Section 4.1: The Interpreter Framework ==== | ||
| - | |||
| - | [[https:// | ||
| - | Everything in [[https:// | ||
| - | You should thus have followed along and arrived at something similar to the below file '' | ||
| - | |||
| - | <code java [enable_line_numbers=" | ||
| - | package com.craftinginterpreters.tla; | ||
| - | |||
| - | import java.io.BufferedReader; | ||
| - | import java.io.IOException; | ||
| - | import java.io.InputStreamReader; | ||
| - | import java.nio.charset.Charset; | ||
| - | import java.nio.file.Files; | ||
| - | import java.nio.file.Paths; | ||
| - | import java.util.List; | ||
| - | |||
| - | public class TlaPlus { | ||
| - | static boolean hadError = false; | ||
| - | |||
| - | public static void main(String[] args) throws IOException { | ||
| - | if (args.length > 1) { | ||
| - | System.out.println(" | ||
| - | System.exit(64); | ||
| - | } else if (args.length == 1) { | ||
| - | runFile(args[0]); | ||
| - | } else { | ||
| - | runPrompt(); | ||
| - | } | ||
| - | } | ||
| - | |||
| - | private static void runFile(String path) throws IOException { | ||
| - | byte[] bytes = Files.readAllBytes(Paths.get(path)); | ||
| - | run(new String(bytes, | ||
| - | |||
| - | // Indicate an error in the exit code. | ||
| - | if (hadError) System.exit(65); | ||
| - | } | ||
| - | |||
| - | private static void runPrompt() throws IOException { | ||
| - | InputStreamReader input = new InputStreamReader(System.in); | ||
| - | BufferedReader reader = new BufferedReader(input); | ||
| - | |||
| - | for (;;) { | ||
| - | System.out.print("> | ||
| - | String line = reader.readLine(); | ||
| - | if (line == null) break; | ||
| - | run(line); | ||
| - | hadError = false; | ||
| - | } | ||
| - | } | ||
| - | |||
| - | private static void run(String source) { | ||
| - | Scanner scanner = new Scanner(source); | ||
| - | List< | ||
| - | |||
| - | // For now, just print the tokens. | ||
| - | for (Token token : tokens) { | ||
| - | System.out.println(token); | ||
| - | } | ||
| - | } | ||
| - | |||
| - | static void error(int line, String message) { | ||
| - | report(line, | ||
| - | } | ||
| - | |||
| - | private static void report(int line, String where, | ||
| - | | ||
| - | System.err.println( | ||
| - | "[line " + line + "] Error" + where + ": " + message); | ||
| - | hadError = true; | ||
| - | } | ||
| - | } | ||
| - | </ | ||
| - | |||
| - | ==== Section 4.2: Lexemes and Tokens ==== | ||
| - | |||
| - | The '' | ||
| - | Instead of Lox tokens, we use the atomic components of our minimal TLA⁺ language subset defined above. | ||
| - | Adapting the snippet in [[https:// | ||
| - | |||
| - | <code java [enable_line_numbers=" | ||
| - | package com.craftinginterpreters.tla; | ||
| - | |||
| - | enum TokenType { | ||
| - | // Single-character tokens. | ||
| - | LEFT_PAREN, RIGHT_PAREN, | ||
| - | COMMA, MINUS, PLUS, LESS_THAN, NEGATION, PRIME | ||
| - | |||
| - | // One or two character tokens. | ||
| - | AND, OR, EQUALS, DEF_EQ, | ||
| - | |||
| - | // Literals. | ||
| - | IDENTIFIER, NAT_NUMBER, TRUE, FALSE, | ||
| - | |||
| - | // Keywords. | ||
| - | VARIABLES, ENABLED, IF, THEN, ELSE, IN, | ||
| - | SINGLE_LINE, | ||
| - | |||
| - | EOF | ||
| - | } | ||
| - | </ | ||
| - | |||
| - | There is a very minor design decision here, of the type encountered innumerable times when writing a parser. | ||
| - | TLA⁺ includes boolean values '' | ||
| - | However, since we are writing our interpreter in Java and Java has primitive boolean literals, it is convenient to add specialized tokens for '' | ||
| - | The existing TLA⁺ tools do not take this approach, and instead later resolve the identifiers '' | ||
| - | This sort of works-either-way design dilemma occurs often, and we will see it again when trying to decide whether to disallow a snippet of invalid TLA⁺ at the syntactic or semantic level. | ||
| - | |||
| - | In [[https:// | ||
| - | This will come in useful when parsing vertically-aligned conjunction & disjunction lists. | ||
| - | |||
| - | <code java [enable_line_numbers=" | ||
| - | package com.craftinginterpreters.tla; | ||
| - | |||
| - | class Token { | ||
| - | final TokenType type; | ||
| - | final String lexeme; | ||
| - | final Object literal; | ||
| - | final int line; | ||
| - | final int column; | ||
| - | |||
| - | Token(TokenType type, String lexeme, Object literal, int line, int column) { | ||
| - | this.type = type; | ||
| - | this.lexeme = lexeme; | ||
| - | this.literal = literal; | ||
| - | this.line = line; | ||
| - | this.column = column; | ||
| - | } | ||
| - | |||
| - | public String toString() { | ||
| - | return type + " " + lexeme + " " + literal; | ||
| - | } | ||
| - | } | ||
| - | </ | ||
| - | |||
| - | ==== Section 4.4: The Scanner Class ==== | ||
| - | |||
| - | We now move on to the very important '' | ||
| - | Our first modification to the code given in the book is to track the column in addition to the line, mirroring our addition to the '' | ||
| - | |||
| - | <code java [enable_line_numbers=" | ||
| - | package com.craftinginterpreters.tla; | ||
| - | |||
| - | import java.util.ArrayList; | ||
| - | import java.util.HashMap; | ||
| - | import java.util.List; | ||
| - | import java.util.Map; | ||
| - | |||
| - | import static com.craftinginterpreters.tla.TokenType.*; | ||
| - | |||
| - | class Scanner { | ||
| - | private final String source; | ||
| - | private final List< | ||
| - | private int start = 0; | ||
| - | private int current = 0; | ||
| - | private int line = 1; | ||
| - | private int column = 0; | ||
| - | |||
| - | Scanner(String source) { | ||
| - | this.source = source; | ||
| - | } | ||
| - | |||
| - | List< | ||
| - | while (!isAtEnd()) { | ||
| - | // We are at the beginning of the next lexeme. | ||
| - | start = current; | ||
| - | scanToken(); | ||
| - | } | ||
| - | |||
| - | tokens.add(new Token(EOF, "", | ||
| - | return tokens; | ||
| - | } | ||
| - | |||
| - | private boolean isAtEnd() { | ||
| - | return current >= source.length(); | ||
| - | } | ||
| - | } | ||
| - | </ | ||
| - | |||
| - | ==== Section 4.5: Recognizing Lexemes ==== | ||
| - | |||
| - | [[https:// | ||
| - | Of course, our unambiguous single-character lexing code in '' | ||
| - | Note also that we increment the column position in '' | ||
| - | |||
| - | <code java [enable_line_numbers=" | ||
| - | private void scanToken() { | ||
| - | char c = advance(); | ||
| - | switch (c) { | ||
| - | case ' | ||
| - | case ' | ||
| - | case ' | ||
| - | case ' | ||
| - | case ',': | ||
| - | case ' | ||
| - | case ' | ||
| - | case '<': | ||
| - | case ' | ||
| - | case ' | ||
| - | default: | ||
| - | TlaPlus.error(line, | ||
| - | break; | ||
| - | } | ||
| - | } | ||
| - | |||
| - | private char advance() { | ||
| - | column++; | ||
| - | return source.charAt(current++); | ||
| - | } | ||
| - | |||
| - | private void addToken(TokenType type) { | ||
| - | addToken(type, | ||
| - | } | ||
| - | |||
| - | private void addToken(TokenType type, Object literal) { | ||
| - | String text = source.substring(start, | ||
| - | tokens.add(new Token(type, text, literal, line, column)); | ||
| - | } | ||
| - | } | ||
| - | </ | ||
| - | |||
| - | Moving on to lexing tokens of one or two characters, the differences widen considerably; | ||
| - | |||
| - | <code java [enable_line_numbers=" | ||
| - | case ' | ||
| - | case ' | ||
| - | addToken(match(' | ||
| - | break; | ||
| - | case '/': | ||
| - | if (match(' | ||
| - | addToken(AND); | ||
| - | } else { | ||
| - | TlaPlus.error(line, | ||
| - | } | ||
| - | break; | ||
| - | case ' | ||
| - | if (match('/' | ||
| - | addToken(OR); | ||
| - | } else { | ||
| - | TlaPlus.error(line, | ||
| - | } | ||
| - | break; | ||
| - | default: | ||
| - | TlaPlus.error(line, | ||
| - | break; | ||
| - | </ | ||
| - | |||
| - | Be sure to include the '' | ||
| - | |||
| - | <code java [highlight_lines_extra=" | ||
| - | private boolean match(char expected) { | ||
| - | if (isAtEnd()) return false; | ||
| - | if (source.charAt(current) != expected) return false; | ||
| - | |||
| - | column++; | ||
| - | current++; | ||
| - | return true; | ||
| - | } | ||
| - | </ | ||
| - | |||
| - | ==== Section 4.6: Longer Lexemes ==== | ||
| - | |||
| - | In [[https:// | ||
| - | Let's begin with single-line comments, which in TLA⁺ start with '' | ||
| - | |||
| - | <code java [enable_line_numbers=" | ||
| - | case ' | ||
| - | if (match('/' | ||
| - | addToken(OR); | ||
| - | } else if (match(' | ||
| - | // A comment goes until the end of the line. | ||
| - | while (peek() != ' | ||
| - | } else { | ||
| - | TlaPlus.error(line, | ||
| - | } | ||
| - | break; | ||
| - | </ | ||
| - | |||
| - | This uses the '' | ||
| - | |||
| - | <code java> | ||
| - | private char peek() { | ||
| - | if (isAtEnd()) return ' | ||
| - | return source.charAt(current); | ||
| - | } | ||
| - | </ | ||
| - | |||
| - | Also add the code to skip whitespace; importantly, | ||
| - | |||
| - | <code java [enable_line_numbers=" | ||
| - | break; | ||
| - | |||
| - | case ' ': | ||
| - | case ' | ||
| - | case ' | ||
| - | // Ignore whitespace. | ||
| - | break; | ||
| - | |||
| - | case ' | ||
| - | column = 0; | ||
| - | line++; | ||
| - | break; | ||
| - | |||
| - | default: | ||
| - | TlaPlus.error(line, | ||
| - | break; | ||
| - | </ | ||
| - | |||
| - | We aren't supporting strings (unless you want to go the extra mile) so next up is numbers: | ||
| - | |||
| - | <code java [enable_line_numbers=" | ||
| - | default: | ||
| - | if (isDigit(c)) { | ||
| - | number(); | ||
| - | } else { | ||
| - | TlaPlus.error(line, | ||
| - | } | ||
| - | break; | ||
| - | </ | ||
| - | |||
| - | Include the '' | ||
| - | |||
| - | <code java> | ||
| - | private boolean isDigit(char c) { | ||
| - | return c >= ' | ||
| - | } | ||
| - | </ | ||
| - | |||
| - | We're only handling whole natural numbers, no decimals, so our '' | ||
| - | |||
| - | <code java> | ||
| - | private void number() { | ||
| - | while (isDigit(peek())) advance(); | ||
| - | addToken(NAT_NUMBER, | ||
| - | Integer.parseInt(source.substring(start, | ||
| - | } | ||
| - | </ | ||
| - | |||
| - | ==== Section 4.7: Reserved Words and Identifiers ==== | ||