package pins25.phase;

import java.io.*;
import pins25.common.*;

/**
 * Lexical analyzer.
 *
 * <p>
 * Reads the source file one character at a time and groups the characters into
 * tokens, which are handed out through {@link #peekToken()} and
 * {@link #takeToken()}.
 */
public class LexAn implements AutoCloseable {

	/** Value of {@link #buffChar} before any character has been read. */
	private static final int NOT_STARTED = -2;

	/** Value of {@link #buffChar} once the end of the source file was reached. */
	private static final int END_OF_FILE = -1;

	/** Distance between tab stops used when computing columns. */
	private static final int TAB_WIDTH = 4;

	/** The source file. */
	private final Reader srcFile;

	/**
	 * Creates a new lexical analyzer and primes it with the first character of
	 * the source file.
	 *
	 * @param srcFileName Name of the source file.
	 * @throws Report.Error If the source file cannot be found or read.
	 */
	public LexAn(final String srcFileName) {
		try {
			srcFile = new BufferedReader(new InputStreamReader(new FileInputStream(new File(srcFileName))));
		} catch (FileNotFoundException __) {
			throw new Report.Error("Source file '" + srcFileName + "' not found.");
		}
		try {
			nextChar(); // Prepare the first character of the source file (see nextChar()).
		} catch (Report.Error error) {
			// The caller never receives this instance, so nobody else could close the
			// file: release it here before propagating the original failure.
			try {
				srcFile.close();
			} catch (IOException __) {
				// Best effort only; the read failure above is the error worth reporting.
			}
			throw error;
		}
	}

	/**
	 * Closes the source file.
	 *
	 * @throws Report.Error If the source file cannot be closed.
	 */
	@Override
	public void close() {
		try {
			srcFile.close();
		} catch (IOException __) {
			throw new Report.Error("Cannot close source file.");
		}
	}

	/** The current character of the source file (see {@link #nextChar()}). */
	private int buffChar = NOT_STARTED;

	/** Line of the current character of the source file (see {@link #nextChar()}). */
	private int buffCharLine = 0;

	/** Column of the current character of the source file (see {@link #nextChar()}). */
	private int buffCharColumn = 0;

	/**
	 * Reads the next character of the source file.
	 *
	 * <p>
	 * The source file is read character by character. The current character is
	 * stored in {@link #buffChar}; its line and column are stored in
	 * {@link #buffCharLine} and {@link #buffCharColumn}.
	 *
	 * <p>
	 * The initial values of {@link #buffChar}, {@link #buffCharLine} and
	 * {@link #buffCharColumn} are {@code -2}, {@code 0} and {@code 0}. The
	 * sentinel {@code -2} means that nothing has been read yet: the first call
	 * reads the first character and assigns it line 1 and column 1 (or leaves
	 * line and column at 0 if the file is empty).
	 *
	 * <p>
	 * The position of the newly read character is derived from the previous
	 * character: after {@code '\n'} a new line starts at column 1, after
	 * {@code '\t'} the column advances to the next tab stop, and after any other
	 * character the column advances by one.
	 *
	 * <p>
	 * Outside of this method the three variables are meant to be read only.
	 *
	 * <p>
	 * The value {@code -1} in {@link #buffChar} denotes the end of the file; once
	 * it is reached further calls do nothing (and {@link #buffCharLine} and
	 * {@link #buffCharColumn} keep the values they had at that moment).
	 *
	 * @throws Report.Error If the source file cannot be read.
	 */
	private void nextChar() {
		try {
			switch (buffChar) {
				case NOT_STARTED: {
					// No character has been read yet.
					buffChar = srcFile.read();
					final int origin = buffChar == END_OF_FILE ? 0 : 1;
					buffCharLine = origin;
					buffCharColumn = origin;
					return;
				}
				case END_OF_FILE:
					// The end of the file has already been seen.
					return;
				case '\n':
					// The previous character ended a line, a new line begins.
					buffChar = srcFile.read();
					if (buffChar != END_OF_FILE) {
						buffCharLine += 1;
						buffCharColumn = 1;
					}
					return;
				case '\t':
					// The previous character was a tab: move to the last column of the
					// tab's cell, then one further for the character just read.
					buffChar = srcFile.read();
					while (buffCharColumn % TAB_WIDTH != 0) {
						buffCharColumn += 1;
					}
					buffCharColumn += 1;
					return;
				default:
					// The previous character needs no special treatment.
					buffChar = srcFile.read();
					buffCharColumn += 1;
			}
		} catch (IOException __) {
			throw new Report.Error("Cannot read source file.");
		}
	}

	/**
	 * Returns the location (line and column) of the current character.
	 *
	 * @return The location of {@link #buffChar}.
	 */
	private Report.Location currentLocation() {
		return new Report.Location(buffCharLine, buffCharColumn);
	}

	/**
	 * The current token.
	 *
	 * <p>
	 * If this variable is not {@code null}, the token has already been read from
	 * the source file but has not yet been handed over to the caller. It is
	 * accessible through {@link #peekToken()} and {@link #takeToken()}.
	 */
	private Token buffToken = null;

	/**
	 * Reads the next token, which then becomes accessible through
	 * {@link #peekToken()} and {@link #takeToken()}.
	 *
	 * <p>
	 * Whitespace and {@code //} line comments are skipped iteratively, so the
	 * number of consecutive comment lines is not limited by the call stack.
	 *
	 * @throws Report.Error If the input does not start with a valid token.
	 */
	private void nextToken() {
		// Skip whitespace and line comments; a lone '/' is the division operator.
		while (true) {
			while (buffChar == ' ' || buffChar == '\n' || buffChar == '\t' || buffChar == '\r') {
				nextChar();
			}
			if (buffChar != '/') {
				break;
			}
			final Report.Location slash = currentLocation();
			nextChar();
			if (buffChar != '/') {
				buffToken = new Token(slash, Token.Symbol.DIV, "/");
				return;
			}
			// Line comment: drop everything up to (not including) the line end.
			while (buffChar != '\n' && buffChar != END_OF_FILE) {
				nextChar();
			}
		}

		final Report.Location start = currentLocation();
		switch (buffChar) {
			case END_OF_FILE:
				buffToken = new Token(start, Token.Symbol.EOF, "EOF");
				return;
			case '\'':
				charConst();
				return;
			case '"':
				stringConst();
				return;
			case '=':
				operatorWithOptionalEquals(start, Token.Symbol.ASSIGN, "=", Token.Symbol.EQU, "==");
				return;
			case '!':
				operatorWithOptionalEquals(start, Token.Symbol.NOT, "!", Token.Symbol.NEQ, "!=");
				return;
			case '>':
				operatorWithOptionalEquals(start, Token.Symbol.GTH, ">", Token.Symbol.GEQ, ">=");
				return;
			case '<':
				operatorWithOptionalEquals(start, Token.Symbol.LTH, "<", Token.Symbol.LEQ, "<=");
				return;
			case '&':
				doubledOperator(start, '&', Token.Symbol.AND, "&&");
				return;
			case '|':
				doubledOperator(start, '|', Token.Symbol.OR, "||");
				return;
			case ',':
				singleCharToken(start, Token.Symbol.COMMA, ",");
				return;
			case '+':
				singleCharToken(start, Token.Symbol.ADD, "+");
				return;
			case '-':
				singleCharToken(start, Token.Symbol.SUB, "-");
				return;
			case '*':
				singleCharToken(start, Token.Symbol.MUL, "*");
				return;
			case '%':
				singleCharToken(start, Token.Symbol.MOD, "%");
				return;
			case '^':
				singleCharToken(start, Token.Symbol.PTR, "^");
				return;
			case '(':
				singleCharToken(start, Token.Symbol.LPAREN, "(");
				return;
			case ')':
				singleCharToken(start, Token.Symbol.RPAREN, ")");
				return;
		}

		if (isNumeric()) {
			intConst();
			return;
		}
		if (isAlpha()) {
			identifier();
			return;
		}
		throw new Report.Error(currentLocation(), "Unrecognized character '" + (char) buffChar + "'.");
	}

	/** Buffers a one-character token located at {@code start} and consumes the character. */
	private void singleCharToken(final Report.Location start, final Token.Symbol symbol, final String lexeme) {
		buffToken = new Token(start, symbol, lexeme);
		nextChar();
	}

	/**
	 * Buffers either a one-character operator or, if it is immediately followed
	 * by {@code '='}, the corresponding two-character operator.
	 */
	private void operatorWithOptionalEquals(final Report.Location start, final Token.Symbol plain,
			final String plainLexeme, final Token.Symbol withEquals, final String withEqualsLexeme) {
		nextChar();
		if (buffChar == '=') {
			buffToken = new Token(new Report.Location(start, currentLocation()), withEquals, withEqualsLexeme);
			nextChar();
		} else {
			buffToken = new Token(start, plain, plainLexeme);
		}
	}

	/**
	 * Buffers an operator made of the same character twice ({@code &&},
	 * {@code ||}); a single occurrence is reported as an error.
	 */
	private void doubledOperator(final Report.Location start, final char expected, final Token.Symbol symbol,
			final String lexeme) {
		nextChar();
		if (buffChar != expected) {
			throw new Report.Error(currentLocation(), "Invalid character '" + (char) buffChar + "'");
		}
		buffToken = new Token(new Report.Location(start, currentLocation()), symbol, lexeme);
		nextChar();
	}

	/** Tells whether the current character is a decimal digit. */
	private boolean isNumeric() {
		return '0' <= buffChar && buffChar <= '9';
	}

	/** Tells whether the current character is in the range {@code ' '} to {@code '~'}. */
	private boolean isChar() {
		return ' ' <= buffChar && buffChar <= '~';
	}

	/** Tells whether the current character is a hexadecimal digit (only lowercase a-f are accepted). */
	private boolean isHex() {
		return isNumeric() || ('a' <= buffChar && buffChar <= 'f');
	}

	/** Tells whether the current character is an ASCII letter or an underscore. */
	private boolean isAlpha() {
		return ('a' <= buffChar && buffChar <= 'z') || ('A' <= buffChar && buffChar <= 'Z') || buffChar == '_';
	}

	/** Tells whether the current character is an ASCII letter, a decimal digit or an underscore. */
	private boolean isAlphaNumeric() {
		return isAlpha() || isNumeric();
	}

	/**
	 * Maps a word to the symbol of the matching reserved word.
	 *
	 * @param word The word to look up.
	 * @return The reserved-word symbol, or {@code null} if {@code word} is not
	 *         reserved.
	 */
	private Token.Symbol getReservedWordSymbol(String word) {
		return switch (word) {
			case "fun" -> Token.Symbol.FUN;
			case "var" -> Token.Symbol.VAR;
			case "if" -> Token.Symbol.IF;
			case "then" -> Token.Symbol.THEN;
			case "else" -> Token.Symbol.ELSE;
			case "while" -> Token.Symbol.WHILE;
			case "do" -> Token.Symbol.DO;
			case "let" -> Token.Symbol.LET;
			case "in" -> Token.Symbol.IN;
			case "end" -> Token.Symbol.END;
			case "__LINE__" -> Token.Symbol.LINE;
			default -> null;
		};
	}

	/**
	 * Reads an integer constant starting at the current character.
	 *
	 * <p>
	 * A constant is either the single digit {@code 0} or a run of digits that
	 * does not start with {@code 0}.
	 *
	 * @throws Report.Error If {@code 0} is directly followed by another digit.
	 */
	private void intConst() {
		final Report.Location startLocation = currentLocation();
		Report.Location endLocation = startLocation;
		final StringBuilder lexeme = new StringBuilder();

		if (buffChar == '0') {
			lexeme.append((char) buffChar);
			nextChar();
			if (isNumeric()) {
				throw new Report.Error(startLocation, "Leading zero is not allowed.");
			}
		} else {
			while (isNumeric()) {
				lexeme.append((char) buffChar);
				endLocation = currentLocation();
				nextChar();
			}
		}

		buffToken = new Token(new Report.Location(startLocation, endLocation), Token.Symbol.INTCONST,
				lexeme.toString());
	}

	/**
	 * Reads a character constant; the current character is the opening single
	 * quote.
	 *
	 * <p>
	 * Between the quotes there must be exactly one printable character other
	 * than an unescaped single quote, or one of the escapes {@code \n},
	 * {@code \\}, {@code \'} or a backslash followed by two hexadecimal digits.
	 * The lexeme of the token includes both quotes.
	 *
	 * @throws Report.Error If the constant is malformed.
	 */
	private void charConst() {
		final Report.Location startLocation = currentLocation();
		final StringBuilder lexeme = new StringBuilder();

		lexeme.append((char) buffChar); // Opening quote.
		nextChar();

		if (!isChar()) {
			throw new Report.Error(startLocation, "Invalid character '" + (char) buffChar + "'.");
		}
		if (buffChar == '\'') {
			// A bare quote here is either an empty constant ('') or an unescaped
			// quote ('''); the escape \' exists precisely so neither is needed.
			throw new Report.Error(new Report.Location(startLocation, currentLocation()),
					"Empty character constant; a single quote must be written as '\\''.");
		}
		lexeme.append((char) buffChar);

		if (buffChar == '\\') {
			nextChar();
			final boolean simpleEscape = buffChar == 'n' || buffChar == '\\' || buffChar == '\'';
			if (!simpleEscape && !isHex()) {
				throw new Report.Error(currentLocation(), "Invalid escaped character '" + (char) buffChar + "'.");
			}
			lexeme.append((char) buffChar);
			if (!simpleEscape) {
				// Hexadecimal escape: a second hexadecimal digit is mandatory.
				nextChar();
				if (!isHex()) {
					throw new Report.Error(currentLocation(), "Invalid ascii code '" + (char) buffChar + "'.");
				}
				lexeme.append((char) buffChar);
			}
		}

		nextChar();
		if (buffChar != '\'') {
			throw new Report.Error(new Report.Location(startLocation, currentLocation()),
					"Invalid character '" + (char) buffChar + "'.");
		}
		lexeme.append((char) buffChar); // Closing quote.

		buffToken = new Token(new Report.Location(startLocation, currentLocation()), Token.Symbol.CHARCONST,
				lexeme.toString());
		nextChar();
	}

	/**
	 * Reads a string constant; the current character is the opening double
	 * quote.
	 *
	 * <p>
	 * The string extends to the next unescaped double quote on the same line.
	 * Accepted escapes are {@code \n}, {@code \\}, {@code \"} and a backslash
	 * followed by two hexadecimal digits. The lexeme of the token includes both
	 * quotes.
	 *
	 * @throws Report.Error If the string is unterminated or contains an invalid
	 *                      character or escape.
	 */
	private void stringConst() {
		final Report.Location startLocation = currentLocation();
		final StringBuilder lexeme = new StringBuilder();

		lexeme.append((char) buffChar); // Opening quote.
		nextChar();

		while (buffChar != '"') {
			if (buffChar == '\n' || buffChar == END_OF_FILE) {
				throw new Report.Error(currentLocation(), "Unterminated string.");
			}
			if (buffChar == '\\') {
				lexeme.append((char) buffChar);
				nextChar();
				final boolean simpleEscape = buffChar == 'n' || buffChar == '\\' || buffChar == '"';
				if (!simpleEscape) {
					if (!isHex()) {
						throw new Report.Error(currentLocation(),
								"Invalid escaped character '" + (char) buffChar + "'.");
					}
					// Hexadecimal escape: keep the first digit, require a second one.
					lexeme.append((char) buffChar);
					nextChar();
					if (!isHex()) {
						throw new Report.Error(currentLocation(), "Invalid ascii code '" + (char) buffChar + "'.");
					}
				}
			} else if (!isChar()) {
				throw new Report.Error(currentLocation(), "Invalid character '" + (char) buffChar + "'.");
			}
			// Plain character, or the last character of an escape sequence.
			lexeme.append((char) buffChar);
			nextChar();
		}

		lexeme.append((char) buffChar); // Closing quote.
		buffToken = new Token(new Report.Location(startLocation, currentLocation()), Token.Symbol.STRINGCONST,
				lexeme.toString());
		nextChar();
	}

	/**
	 * Reads an identifier or a reserved word starting at the current character.
	 *
	 * <p>
	 * The word {@code __LINE__} is not passed on as such: it is replaced by an
	 * integer constant holding the line on which it starts.
	 */
	private void identifier() {
		final Report.Location startLocation = currentLocation();
		Report.Location endLocation = startLocation;
		final StringBuilder lexeme = new StringBuilder();

		while (isAlphaNumeric()) {
			lexeme.append((char) buffChar);
			endLocation = currentLocation();
			nextChar();
		}

		final String word = lexeme.toString();
		final Report.Location location = new Report.Location(startLocation, endLocation);
		final Token.Symbol reserved = getReservedWordSymbol(word);

		if (reserved == Token.Symbol.LINE) {
			buffToken = new Token(location, Token.Symbol.INTCONST, Integer.toString(startLocation.begLine()));
		} else if (reserved == null) {
			buffToken = new Token(location, Token.Symbol.IDENTIFIER, word);
		} else {
			buffToken = new Token(location, reserved, word);
		}
	}

	/**
	 * Returns the current token, which remains owned by the lexical analyzer.
	 *
	 * @return The token.
	 */
	public Token peekToken() {
		if (buffToken == null) {
			nextToken();
		}
		return buffToken;
	}

	/**
	 * Returns the current token, whose ownership passes to the calling code.
	 *
	 * @return The token.
	 */
	public Token takeToken() {
		if (buffToken == null) {
			nextToken();
		}
		final Token thisToken = buffToken;
		buffToken = null;
		return thisToken;
	}

	// --- STARTUP ---

	/**
	 * Runs the lexical analyzer as a stand-alone program: prints every token of
	 * the source file, the end-of-file token included.
	 *
	 * @param cmdLineArgs Command line arguments.
	 */
	public static void main(final String[] cmdLineArgs) {
		System.out.println("This is PINS'25 compiler (lexical analysis):");

		try {
			if (cmdLineArgs.length == 0) {
				throw new Report.Error("No source file specified in the command line.");
			}
			if (cmdLineArgs.length > 1) {
				Report.warning("Unused arguments in the command line.");
			}

			try (LexAn lexAn = new LexAn(cmdLineArgs[0])) {
				while (lexAn.peekToken().symbol() != Token.Symbol.EOF) {
					System.out.println(lexAn.takeToken());
				}
				System.out.println(lexAn.takeToken());
			}

			// Let us hope we reach this point some day.
			// But keep the following in mind:
			// 1. The translation may be wrong because of bugs in the program :-o
			// 2. The source program is far from what the programmer wanted it to be ;-)
			Report.info("Done.");
		} catch (Report.Error error) {
			// Print the description of the error.
			System.err.println(error.getMessage());
			System.exit(1);
		}
	}

}