pns/src/pins25/phase/LexAn.java
2025-06-03 18:07:27 +02:00

530 lines
17 KiB
Java

package pins25.phase;
import java.io.*;
import pins25.common.*;
/**
* Leksikalni analizator.
*/
public class LexAn implements AutoCloseable {
/**
 * The source file being analyzed; opened by the constructor and released by
 * {@link #close()}.
 */
private final Reader srcFile;
/**
* Ustvari nov leksikalni analizator.
*
* @param srcFileName Ime izvorne datoteke.
*/
public LexAn(final String srcFileName) {
try {
srcFile = new BufferedReader(new InputStreamReader(new FileInputStream(new File(srcFileName))));
nextChar(); // Pripravi prvi znak izvorne datoteke (glej {@link nextChar}).
} catch (FileNotFoundException __) {
throw new Report.Error("Source file '" + srcFileName + "' not found.");
}
}
/**
 * Closes the source file.
 *
 * @throws Report.Error if the underlying reader fails to close.
 */
@Override
public void close() {
    try {
        srcFile.close();
    } catch (IOException ignored) {
        // Reported through the compiler's own error type.
        throw new Report.Error("Cannot close source file.");
    }
}
/**
 * The current character of the source file (see {@link #nextChar()}).
 * <p>
 * The initial value {@code -2} means that no character has been read yet;
 * {@code -1} means that the end of the file has been reached.
 */
private int buffChar = -2;
/**
 * The line of the current character of the source file (see
 * {@link #nextChar()}).
 */
private int buffCharLine = 0;
/**
 * The column of the current character of the source file (see
 * {@link #nextChar()}).
 */
private int buffCharColumn = 0;
/**
 * Reads the next character of the source file.
 * <p>
 * The source file is read one character at a time. The current character is
 * kept in {@link #buffChar}; its line and column are kept in
 * {@link #buffCharLine} and {@link #buffCharColumn}.
 * <p>
 * Before the first read {@link #buffChar} holds {@code -2} and both position
 * counters hold {@code 0}. The first read assigns line 1 and column 1 to the
 * first character, or leaves both at 0 if the file is empty.
 * <p>
 * How the position advances depends on the character that was current before
 * the read: a line feed starts a new line at column 1, a tabulator moves the
 * column to the next multiple of four plus one, and every other character
 * moves the column by one.
 * <p>
 * Outside of this method {@link #buffChar}, {@link #buffCharLine} and
 * {@link #buffCharColumn} are meant to be read only.
 * <p>
 * The value {@code -1} in {@link #buffChar} denotes the end of the file; the
 * line and column are no longer meaningful then, and further calls do nothing.
 *
 * @throws Report.Error if the source file cannot be read.
 */
private void nextChar() {
    final int previous = buffChar;
    if (previous == -1) {
        // The end of the file has already been seen.
        return;
    }

    try {
        buffChar = srcFile.read();
    } catch (IOException ignored) {
        throw new Report.Error("Cannot read source file.");
    }
    final boolean atEnd = buffChar == -1;

    switch (previous) {
        case -2 -> {
            // Nothing had been read before this call.
            buffCharLine = atEnd ? 0 : 1;
            buffCharColumn = atEnd ? 0 : 1;
        }
        case '\n' -> {
            // The previous character ended its line; a new line begins unless
            // the file ends here.
            if (!atEnd) {
                buffCharLine += 1;
                buffCharColumn = 1;
            }
        }
        case '\t' -> {
            // The previous character was a tabulator: move to the next
            // multiple of four, then one further.
            while (buffCharColumn % 4 != 0) {
                buffCharColumn += 1;
            }
            buffCharColumn += 1;
        }
        default -> buffCharColumn += 1; // An ordinary character.
    }
}
/**
 * Builds a location from the line and column of the current character.
 *
 * @return The location of the current character.
 */
private Report.Location currentLocation() {
    final int line = buffCharLine;
    final int column = buffCharColumn;
    return new Report.Location(line, column);
}
/**
 * The current token.
 * <p>
 * If {@code buffToken} is not {@code null}, the token has already been read
 * from the source file but has not yet been handed over to the syntax
 * analyzer. It is accessible through {@link #peekToken()} and
 * {@link #takeToken()}.
 */
private Token buffToken = null;
/**
 * Reads the next token, which is then accessible through {@link #peekToken()}
 * and {@link #takeToken()}.
 * <p>
 * Spaces, tabulators, carriage returns and line feeds are skipped, as are line
 * comments running from {@code //} to the end of the line. Comments are skipped
 * in a loop rather than by a recursive call, so an arbitrarily long run of
 * comment lines cannot exhaust the call stack.
 *
 * @throws Report.Error if the source does not continue with a valid token.
 */
private void nextToken() {
    Report.Location start;
    while (true) {
        while (buffChar == ' ' || buffChar == '\n' || buffChar == '\t' || buffChar == '\r') {
            nextChar();
        }
        start = currentLocation();
        if (buffChar != '/') {
            break;
        }
        // A slash is either the division operator or the start of a comment.
        nextChar();
        if (buffChar != '/') {
            buffToken = new Token(start, Token.Symbol.DIV, "/");
            return;
        }
        // Line comment: drop everything up to the end of the line (or file) and
        // start over with whatever follows.
        while (buffChar != '\n' && buffChar != -1) {
            nextChar();
        }
    }

    switch (buffChar) {
        case -1 -> buffToken = new Token(start, Token.Symbol.EOF, "EOF");
        case '\'' -> charConst();
        case '"' -> stringConst();
        case '=' -> equalsSuffixedOperator(start, Token.Symbol.ASSIGN, "=", Token.Symbol.EQU, "==");
        case '!' -> equalsSuffixedOperator(start, Token.Symbol.NOT, "!", Token.Symbol.NEQ, "!=");
        case '>' -> equalsSuffixedOperator(start, Token.Symbol.GTH, ">", Token.Symbol.GEQ, ">=");
        case '<' -> equalsSuffixedOperator(start, Token.Symbol.LTH, "<", Token.Symbol.LEQ, "<=");
        case '&' -> doubledOperator(start, Token.Symbol.AND, "&&");
        case '|' -> doubledOperator(start, Token.Symbol.OR, "||");
        case ',' -> singleCharToken(start, Token.Symbol.COMMA, ",");
        case '+' -> singleCharToken(start, Token.Symbol.ADD, "+");
        case '-' -> singleCharToken(start, Token.Symbol.SUB, "-");
        case '*' -> singleCharToken(start, Token.Symbol.MUL, "*");
        case '%' -> singleCharToken(start, Token.Symbol.MOD, "%");
        case '^' -> singleCharToken(start, Token.Symbol.PTR, "^");
        case '(' -> singleCharToken(start, Token.Symbol.LPAREN, "(");
        case ')' -> singleCharToken(start, Token.Symbol.RPAREN, ")");
        default -> {
            if (isNumeric()) {
                intConst();
            } else if (isAlpha()) {
                identifier();
            } else {
                throw new Report.Error(currentLocation(), "Unrecognized character '" + (char) buffChar + "'.");
            }
        }
    }
}

/**
 * Stores a token made of the current character alone and moves past it.
 */
private void singleCharToken(final Report.Location start, final Token.Symbol symbol, final String lexeme) {
    buffToken = new Token(start, symbol, lexeme);
    nextChar();
}

/**
 * Stores an operator that is one character long on its own and two characters
 * long when directly followed by {@code '='}.
 */
private void equalsSuffixedOperator(final Report.Location start, final Token.Symbol shortSymbol,
        final String shortLexeme, final Token.Symbol longSymbol, final String longLexeme) {
    nextChar();
    if (buffChar != '=') {
        // The character after the operator belongs to the next token.
        buffToken = new Token(start, shortSymbol, shortLexeme);
        return;
    }
    buffToken = new Token(new Report.Location(start, currentLocation()), longSymbol, longLexeme);
    nextChar();
}

/**
 * Stores an operator written as the current character twice in a row.
 *
 * @throws Report.Error if the current character is not repeated.
 */
private void doubledOperator(final Report.Location start, final Token.Symbol symbol, final String lexeme) {
    final int first = buffChar;
    nextChar();
    if (buffChar != first) {
        throw new Report.Error(currentLocation(), "Invalid character '" + (char) buffChar + "'");
    }
    buffToken = new Token(new Report.Location(start, currentLocation()), symbol, lexeme);
    nextChar();
}
/**
 * Tells whether the current character is a decimal digit.
 */
private boolean isNumeric() {
    final int c = buffChar;
    return '0' <= c && c <= '9';
}
/**
 * Tells whether the current character lies between the space and the tilde,
 * both included.
 */
private boolean isChar() {
    final int c = buffChar;
    return ' ' <= c && c <= '~';
}
/**
 * Tells whether the current character is a hexadecimal digit.
 */
// NOTE(review): only the lower-case letters a-f are accepted; confirm against
// the language specification whether upper-case A-F should be accepted too.
private boolean isHex() {
    final int c = buffChar;
    final boolean decimalDigit = '0' <= c && c <= '9';
    final boolean hexLetter = 'a' <= c && c <= 'f';
    return decimalDigit || hexLetter;
}
/**
 * Tells whether the current character is an ASCII letter or an underscore.
 */
private boolean isAlpha() {
    final int c = buffChar;
    final boolean lowerCase = 'a' <= c && c <= 'z';
    final boolean upperCase = 'A' <= c && c <= 'Z';
    return lowerCase || upperCase || c == '_';
}
/**
 * Tells whether the current character is an ASCII letter, a decimal digit or
 * an underscore.
 */
private boolean isAlphaNumeric() {
    return isAlpha() || isNumeric();
}
/**
 * Looks up the symbol of a reserved word.
 *
 * @param word The word to look up.
 * @return The symbol of the reserved word, or {@code null} if {@code word} is
 *         not reserved.
 */
private Token.Symbol getReservedWordSymbol(String word) {
    switch (word) {
        case "fun":
            return Token.Symbol.FUN;
        case "var":
            return Token.Symbol.VAR;
        case "if":
            return Token.Symbol.IF;
        case "then":
            return Token.Symbol.THEN;
        case "else":
            return Token.Symbol.ELSE;
        case "while":
            return Token.Symbol.WHILE;
        case "do":
            return Token.Symbol.DO;
        case "let":
            return Token.Symbol.LET;
        case "in":
            return Token.Symbol.IN;
        case "end":
            return Token.Symbol.END;
        case "__LINE__":
            return Token.Symbol.LINE;
        default:
            return null;
    }
}
/**
 * Reads an integer constant that starts at the current character and stores
 * it as the current token.
 * <p>
 * A constant starting with {@code 0} must consist of that single digit.
 *
 * @throws Report.Error if a zero is directly followed by another digit.
 */
private void intConst() {
    final Report.Location first = currentLocation();
    Report.Location last = first;
    final StringBuilder digits = new StringBuilder();

    if (buffChar == '0') {
        digits.append('0');
        nextChar();
        if (isNumeric()) {
            throw new Report.Error(first, "Leading zero is not allowed.");
        }
    } else {
        for (; isNumeric(); nextChar()) {
            digits.append((char) buffChar);
            last = currentLocation();
        }
    }

    final Report.Location span = new Report.Location(first, last);
    buffToken = new Token(span, Token.Symbol.INTCONST, digits.toString());
}
/**
 * Reads a character constant that starts at the current character (the opening
 * single quote) and stores it as the current token. The lexeme keeps both
 * quotes and any escape sequence exactly as written.
 * <p>
 * Between the quotes there must be exactly one of:
 * <ul>
 * <li>a character between the space and the tilde, other than a backslash or a
 * single quote,</li>
 * <li>one of the escape sequences {@code \n}, {@code \\} and {@code \'},</li>
 * <li>a backslash followed by two hexadecimal digits (see {@link #isHex()}).</li>
 * </ul>
 *
 * @throws Report.Error if the constant is malformed or not terminated.
 */
private void charConst() {
    final Report.Location startLocation = currentLocation();
    final StringBuilder lexeme = new StringBuilder();
    lexeme.append((char) buffChar); // The opening quote.
    nextChar();

    if (buffChar == -1 || buffChar == '\n') {
        // Reported at the opening quote: the current location is not
        // meaningful once the end of the file has been reached.
        throw new Report.Error(startLocation, "Unterminated character constant.");
    }
    if (!isChar()) {
        throw new Report.Error(startLocation, "Invalid character '" + (char) buffChar + "'.");
    }
    if (buffChar == '\'') {
        // A bare quote here means either '' (nothing between the quotes) or an
        // unescaped quote, which has to be written as \'.
        throw new Report.Error(
            new Report.Location(startLocation, currentLocation()),
            "Character constant is empty or contains an unescaped single quote."
        );
    }

    lexeme.append((char) buffChar);
    if (buffChar == '\\') {
        nextChar();
        lexeme.append((char) buffChar);
        final boolean simpleEscape = buffChar == 'n' || buffChar == '\\' || buffChar == '\'';
        if (!simpleEscape) {
            if (!isHex()) {
                throw new Report.Error(currentLocation(), "Invalid escaped character '" + (char) buffChar + "'.");
            }
            // First hexadecimal digit accepted; a second one must follow.
            nextChar();
            lexeme.append((char) buffChar);
            if (!isHex()) {
                throw new Report.Error(currentLocation(), "Invalid ascii code '" + (char) buffChar + "'.");
            }
        }
    }

    nextChar();
    if (buffChar == -1 || buffChar == '\n') {
        throw new Report.Error(startLocation, "Unterminated character constant.");
    }
    if (buffChar != '\'') {
        throw new Report.Error(
            new Report.Location(startLocation, currentLocation()),
            "Invalid character '" + (char) buffChar + "'."
        );
    }
    lexeme.append((char) buffChar); // The closing quote.
    buffToken = new Token(
        new Report.Location(startLocation, currentLocation()),
        Token.Symbol.CHARCONST,
        lexeme.toString()
    );
    nextChar();
}
/**
 * Reads a string constant that starts at the current character (the opening
 * double quote) and stores it as the current token. The lexeme keeps both
 * quotes and every escape sequence exactly as written.
 * <p>
 * Inside the string a backslash must be followed by {@code n}, a backslash, a
 * double quote, or two hexadecimal digits (see {@link #isHex()}); every other
 * character must lie between the space and the tilde.
 *
 * @throws Report.Error if the string contains an invalid character or escape
 *                      sequence, or is not closed before the end of the line
 *                      or file.
 */
private void stringConst() {
    final Report.Location openingQuote = currentLocation();
    final StringBuilder text = new StringBuilder().append((char) buffChar);
    nextChar();

    for (; buffChar != '"'; nextChar()) {
        if (buffChar == -1 || buffChar == '\n') {
            throw new Report.Error(currentLocation(), "Unterminated string.");
        }

        if (buffChar != '\\') {
            // An ordinary character.
            if (!isChar()) {
                throw new Report.Error(currentLocation(), "Invalid character '" + (char) buffChar + "'.");
            }
            text.append((char) buffChar);
            continue;
        }

        // An escape sequence.
        text.append('\\');
        nextChar();
        final boolean simpleEscape = buffChar == 'n' || buffChar == '\\' || buffChar == '"';
        if (!simpleEscape) {
            if (!isHex()) {
                throw new Report.Error(currentLocation(), "Invalid escaped character '" + (char) buffChar + "'.");
            }
            text.append((char) buffChar);
            nextChar();
            if (!isHex()) {
                throw new Report.Error(currentLocation(), "Invalid ascii code '" + (char) buffChar + "'.");
            }
        }
        // The escaped character, or the second hexadecimal digit.
        text.append((char) buffChar);
    }

    text.append('"');
    final Report.Location span = new Report.Location(openingQuote, currentLocation());
    buffToken = new Token(span, Token.Symbol.STRINGCONST, text.toString());
    nextChar();
}
/**
 * Reads a word (letters, digits and underscores) that starts at the current
 * character and stores it as the current token.
 * <p>
 * A reserved word yields its own symbol and any other word yields an
 * identifier. The word {@code __LINE__} yields an integer constant holding the
 * number of the line on which the word starts.
 */
private void identifier() {
    final Report.Location first = currentLocation();
    Report.Location last = first;
    final StringBuilder letters = new StringBuilder();
    for (; isAlphaNumeric(); nextChar()) {
        letters.append((char) buffChar);
        last = currentLocation();
    }

    final String word = letters.toString();
    final Report.Location span = new Report.Location(first, last);
    final Token.Symbol reserved = getReservedWordSymbol(word);

    if (reserved == Token.Symbol.LINE) {
        buffToken = new Token(span, Token.Symbol.INTCONST, Integer.toString(first.begLine()));
    } else {
        buffToken = new Token(span, reserved != null ? reserved : Token.Symbol.IDENTIFIER, word);
    }
}
/**
 * Returns the current token, which remains owned by the lexical analyzer. The
 * token is read from the source first if none is buffered.
 *
 * @return The current token.
 */
public Token peekToken() {
    if (buffToken != null) {
        return buffToken;
    }
    nextToken();
    return buffToken;
}
/**
 * Returns the current token and hands its ownership over to the caller. The
 * token is read from the source first if none is buffered; afterwards no token
 * is buffered.
 *
 * @return The current token.
 */
public Token takeToken() {
    final Token taken = peekToken();
    buffToken = null;
    return taken;
}
// --- ZAGON ---
/**
 * Runs the lexical analyzer as a stand-alone program: prints every token of
 * the source file, the end-of-file token included.
 *
 * @param cmdLineArgs Command line arguments; the first one names the source
 *                    file.
 */
public static void main(final String[] cmdLineArgs) {
    System.out.println("This is PINS'25 compiler (lexical analysis):");
    try {
        if (cmdLineArgs.length == 0) {
            throw new Report.Error("No source file specified in the command line.");
        }
        if (cmdLineArgs.length > 1) {
            Report.warning("Unused arguments in the command line.");
        }

        try (LexAn lexAn = new LexAn(cmdLineArgs[0])) {
            Token token;
            do {
                token = lexAn.takeToken();
                System.out.println(token);
            } while (token.symbol() != Token.Symbol.EOF);
        }

        // Let us hope we get this far every now and then.
        // Still, keep the following in mind:
        // 1. The translation may be wrong because of bugs in the compiler :-o
        // 2. The source program is far from what the programmer wanted it to be ;-)
        Report.info("Done.");
    } catch (Report.Error error) {
        // Print the description of the error.
        System.err.println(error.getMessage());
        System.exit(1);
    }
}
}