530 lines
17 KiB
Java
530 lines
17 KiB
Java
package pins25.phase;
|
|
|
|
import java.io.*;
|
|
|
|
import pins25.common.*;
|
|
|
|
/**
 * Lexical analyzer: reads a source file character by character and groups the
 * characters into tokens.
 */
public class LexAn implements AutoCloseable {
|
|
|
|
/**
|
|
* Izvorna datoteka.
|
|
*/
|
|
private final Reader srcFile;
|
|
|
|
/**
|
|
* Ustvari nov leksikalni analizator.
|
|
*
|
|
* @param srcFileName Ime izvorne datoteke.
|
|
*/
|
|
public LexAn(final String srcFileName) {
|
|
try {
|
|
srcFile = new BufferedReader(new InputStreamReader(new FileInputStream(new File(srcFileName))));
|
|
nextChar(); // Pripravi prvi znak izvorne datoteke (glej {@link nextChar}).
|
|
} catch (FileNotFoundException __) {
|
|
throw new Report.Error("Source file '" + srcFileName + "' not found.");
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public void close() {
|
|
try {
|
|
srcFile.close();
|
|
} catch (IOException __) {
|
|
throw new Report.Error("Cannot close source file.");
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Trenutni znak izvorne datoteke (glej {@link nextChar}).
|
|
*/
|
|
private int buffChar = -2;
|
|
|
|
/**
|
|
* Vrstica trenutnega znaka izvorne datoteke (glej {@link nextChar}).
|
|
*/
|
|
private int buffCharLine = 0;
|
|
|
|
/**
|
|
* Stolpec trenutnega znaka izvorne datoteke (glej {@link nextChar}).
|
|
*/
|
|
private int buffCharColumn = 0;
|
|
|
|
/**
|
|
* Prebere naslednji znak izvorne datoteke.
|
|
* <p>
|
|
* Izvorno datoteko beremo znak po znak. Trenutni znak izvorne datoteke je
|
|
* shranjen v spremenljivki {@link buffChar}, vrstica in stolpec trenutnega
|
|
* znaka izvorne datoteke sta shranjena v spremenljivkah {@link buffCharLine} in
|
|
* {@link buffCharColumn}.
|
|
* <p>
|
|
* Zacetne vrednosti {@link buffChar}, {@link buffCharLine} in
|
|
* {@link buffCharColumn} so {@code '\n'}, {@code 0} in {@code 0}: branje prvega
|
|
* znaka izvorne datoteke bo na osnovi vrednosti {@code '\n'} spremenljivke
|
|
* {@link buffChar} prvemu znaku izvorne datoteke priredilo vrstico 1 in stolpec
|
|
* 1.
|
|
* <p>
|
|
* Pri branju izvorne datoteke se predpostavlja, da je v spremenljivki
|
|
* {@link buffChar} ves "cas veljaven znak. Zunaj metode {@link nextChar} so vse
|
|
* spremenljivke {@link buffChar}, {@link buffCharLine} in
|
|
* {@link buffCharColumn} namenjene le branju.
|
|
* <p>
|
|
* Vrednost {@code -1} v spremenljivki {@link buffChar} pomeni konec datoteke
|
|
* (vrednosti spremenljivk {@link buffCharLine} in {@link buffCharColumn} pa
|
|
* nista ve"c veljavni).
|
|
*/
|
|
private void nextChar() {
|
|
try {
|
|
switch (buffChar) {
|
|
case -2: // Noben znak "se ni bil prebran.
|
|
buffChar = srcFile.read();
|
|
buffCharLine = buffChar == -1 ? 0 : 1;
|
|
buffCharColumn = buffChar == -1 ? 0 : 1;
|
|
return;
|
|
case -1: // Konec datoteke je bil ze viden.
|
|
return;
|
|
case '\n': // Prejsnji znak je koncal vrstico, zacne se nova vrstica.
|
|
buffChar = srcFile.read();
|
|
buffCharLine = buffChar == -1 ? buffCharLine : buffCharLine + 1;
|
|
buffCharColumn = buffChar == -1 ? buffCharColumn : 1;
|
|
return;
|
|
case '\t': // Prejsnji znak je tabulator, ta znak je morda potisnjen v desno.
|
|
buffChar = srcFile.read();
|
|
while (buffCharColumn % 4 != 0)
|
|
buffCharColumn += 1;
|
|
buffCharColumn += 1;
|
|
return;
|
|
default: // Prejsnji znak je brez posebnosti.
|
|
buffChar = srcFile.read();
|
|
buffCharColumn += 1;
|
|
}
|
|
} catch (IOException __) {
|
|
throw new Report.Error("Cannot read source file.");
|
|
}
|
|
}
|
|
|
|
private Report.Location currentLocation() {
|
|
return new Report.Location(buffCharLine, buffCharColumn);
|
|
}
|
|
|
|
/**
|
|
* Trenutni leksikalni simbol.
|
|
* <p>
|
|
* "Ce vrednost spremenljivke {@code buffToken} ni {@code null}, je simbol "ze
|
|
* prebran iz vhodne datoteke, ni pa "se predan naprej sintaksnemu analizatorju.
|
|
* Ta simbol je dostopen z metodama {@link peekToken} in {@link takeToken}.
|
|
*/
|
|
private Token buffToken = null;
|
|
|
|
/**
|
|
* Prebere naslednji leksikalni simbol, ki je nato dostopen preko metod
|
|
* {@link peekToken} in {@link takeToken}.
|
|
*/
|
|
private void nextToken() {
|
|
while (buffChar == ' ' || buffChar == '\n' || buffChar == '\t' || buffChar == '\r') {
|
|
nextChar();
|
|
}
|
|
|
|
Report.Location start = currentLocation();
|
|
switch (buffChar) {
|
|
case -1: // EOF
|
|
buffToken = new Token(start, Token.Symbol.EOF, "EOF");
|
|
return;
|
|
|
|
case '\'':
|
|
charConst();
|
|
return;
|
|
|
|
case '"':
|
|
stringConst();
|
|
return;
|
|
|
|
case '=':
|
|
nextChar();
|
|
if (buffChar == '=') {
|
|
buffToken = new Token(
|
|
new Report.Location(start, currentLocation()),
|
|
Token.Symbol.EQU,
|
|
"=="
|
|
);
|
|
nextChar();
|
|
return;
|
|
}
|
|
|
|
buffToken = new Token(start, Token.Symbol.ASSIGN, "=");
|
|
return;
|
|
|
|
case ',':
|
|
buffToken = new Token(currentLocation(), Token.Symbol.COMMA, ",");
|
|
nextChar();
|
|
return;
|
|
|
|
case '&':
|
|
nextChar();
|
|
if (buffChar != '&') {
|
|
throw new Report.Error(currentLocation(), "Invalid character '" + (char) buffChar + "'");
|
|
}
|
|
buffToken = new Token(new Report.Location(start, currentLocation()), Token.Symbol.AND, "&&");
|
|
nextChar();
|
|
return;
|
|
|
|
case '|':
|
|
nextChar();
|
|
if (buffChar != '|') {
|
|
throw new Report.Error(currentLocation(), "Invalid character '" + (char) buffChar + "'");
|
|
}
|
|
buffToken = new Token(new Report.Location(start, currentLocation()), Token.Symbol.OR, "||");
|
|
nextChar();
|
|
return;
|
|
|
|
case '!':
|
|
nextChar();
|
|
if (buffChar == '=') {
|
|
buffToken = new Token(
|
|
new Report.Location(start, currentLocation()),
|
|
Token.Symbol.NEQ,
|
|
"!="
|
|
);
|
|
nextChar();
|
|
return;
|
|
}
|
|
|
|
buffToken = new Token(start, Token.Symbol.NOT, "!");
|
|
return;
|
|
|
|
case '>':
|
|
nextChar();
|
|
if (buffChar == '=') {
|
|
buffToken = new Token(
|
|
new Report.Location(start, currentLocation()),
|
|
Token.Symbol.GEQ,
|
|
">="
|
|
);
|
|
nextChar();
|
|
return;
|
|
}
|
|
|
|
buffToken = new Token(start, Token.Symbol.GTH, ">");
|
|
return;
|
|
|
|
case '<':
|
|
nextChar();
|
|
if (buffChar == '=') {
|
|
buffToken = new Token(
|
|
new Report.Location(start, currentLocation()),
|
|
Token.Symbol.LEQ,
|
|
"<="
|
|
);
|
|
nextChar();
|
|
return;
|
|
}
|
|
|
|
buffToken = new Token(start, Token.Symbol.LTH, "<");
|
|
return;
|
|
|
|
case '+':
|
|
buffToken = new Token(currentLocation(), Token.Symbol.ADD, "+");
|
|
nextChar();
|
|
return;
|
|
|
|
case '-':
|
|
buffToken = new Token(currentLocation(), Token.Symbol.SUB, "-");
|
|
nextChar();
|
|
return;
|
|
|
|
case '*':
|
|
buffToken = new Token(currentLocation(), Token.Symbol.MUL, "*");
|
|
nextChar();
|
|
return;
|
|
|
|
case '/':
|
|
start = currentLocation();
|
|
nextChar();
|
|
if (buffChar != '/') {
|
|
buffToken = new Token(start, Token.Symbol.DIV, "/");
|
|
return;
|
|
}
|
|
|
|
while (buffChar != '\n' && buffChar != -1) {
|
|
nextChar();
|
|
}
|
|
|
|
nextToken();
|
|
return;
|
|
|
|
case '%':
|
|
buffToken = new Token(currentLocation(), Token.Symbol.MOD, "%");
|
|
nextChar();
|
|
return;
|
|
|
|
case '^':
|
|
buffToken = new Token(currentLocation(), Token.Symbol.PTR, "^");
|
|
nextChar();
|
|
return;
|
|
|
|
case '(':
|
|
buffToken = new Token(currentLocation(), Token.Symbol.LPAREN, "(");
|
|
nextChar();
|
|
return;
|
|
|
|
case ')':
|
|
buffToken = new Token(currentLocation(), Token.Symbol.RPAREN, ")");
|
|
nextChar();
|
|
return;
|
|
}
|
|
|
|
if (isNumeric()) {
|
|
intConst();
|
|
return;
|
|
}
|
|
if (isAlpha()) {
|
|
identifier();
|
|
return;
|
|
}
|
|
|
|
throw new Report.Error(currentLocation(), "Unrecognized character '" + (char) buffChar + "'.");
|
|
}
|
|
|
|
private boolean isNumeric() {
|
|
return buffChar >= '0' && buffChar <= '9';
|
|
}
|
|
|
|
private boolean isChar() {
|
|
return buffChar >= ' ' && buffChar <= '~';
|
|
}
|
|
|
|
private boolean isHex() {
|
|
return buffChar >= '0' && buffChar <= '9' || buffChar >= 'a' && buffChar <= 'f';
|
|
}
|
|
|
|
private boolean isAlpha() {
|
|
return buffChar >= 'a' && buffChar <= 'z' || buffChar >= 'A' && buffChar <= 'Z' || buffChar == '_';
|
|
}
|
|
|
|
private boolean isAlphaNumeric() {
|
|
return buffChar >= 'a' && buffChar <= 'z' || buffChar >= 'A' && buffChar <= 'Z' || buffChar >= '0' && buffChar <= '9' || buffChar == '_';
|
|
}
|
|
|
|
private Token.Symbol getReservedWordSymbol(String word) {
|
|
return switch (word) {
|
|
case "fun" -> Token.Symbol.FUN;
|
|
case "var" -> Token.Symbol.VAR;
|
|
case "if" -> Token.Symbol.IF;
|
|
case "then" -> Token.Symbol.THEN;
|
|
case "else" -> Token.Symbol.ELSE;
|
|
case "while" -> Token.Symbol.WHILE;
|
|
case "do" -> Token.Symbol.DO;
|
|
case "let" -> Token.Symbol.LET;
|
|
case "in" -> Token.Symbol.IN;
|
|
case "end" -> Token.Symbol.END;
|
|
case "__LINE__" -> Token.Symbol.LINE;
|
|
default -> null;
|
|
};
|
|
}
|
|
|
|
private void intConst() {
|
|
Report.Location startLocation = currentLocation();
|
|
Report.Location endLocation = currentLocation();
|
|
StringBuilder lexeme = new StringBuilder();
|
|
|
|
if (buffChar == '0') {
|
|
lexeme.append((char) buffChar);
|
|
nextChar();
|
|
if (isNumeric()) {
|
|
throw new Report.Error(startLocation, "Leading zero is not allowed.");
|
|
}
|
|
} else {
|
|
while (isNumeric()) {
|
|
lexeme.append((char) buffChar);
|
|
endLocation = currentLocation();
|
|
nextChar();
|
|
}
|
|
}
|
|
|
|
buffToken = new Token(
|
|
new Report.Location(startLocation, endLocation),
|
|
Token.Symbol.INTCONST,
|
|
lexeme.toString()
|
|
);
|
|
}
|
|
|
|
private void charConst() {
|
|
Report.Location startLocation = currentLocation();
|
|
StringBuilder lexeme = new StringBuilder();
|
|
|
|
lexeme.append((char) buffChar);
|
|
nextChar();
|
|
|
|
if (!isChar()) {
|
|
throw new Report.Error(startLocation, "Invalid character '" + (char) buffChar + "'.");
|
|
}
|
|
|
|
lexeme.append((char) buffChar);
|
|
if (buffChar == '\\') {
|
|
nextChar();
|
|
lexeme.append((char) buffChar);
|
|
if (buffChar == 'n' || buffChar == '\\' || buffChar == '\'') {
|
|
|
|
} else if (isHex()) {
|
|
nextChar();
|
|
lexeme.append((char) buffChar);
|
|
if (!isHex()) {
|
|
throw new Report.Error(currentLocation(), "Invalid ascii code '" + (char) buffChar + "'.");
|
|
}
|
|
} else {
|
|
throw new Report.Error(currentLocation(), "Invalid escaped character '" + (char) buffChar + "'.");
|
|
}
|
|
}
|
|
|
|
nextChar();
|
|
if (buffChar != '\'') {
|
|
throw new Report.Error(new Report.Location(startLocation, currentLocation()), "Invalid character '" + (char) buffChar + "'.");
|
|
}
|
|
lexeme.append((char) buffChar);
|
|
|
|
buffToken = new Token(
|
|
new Report.Location(startLocation, currentLocation()),
|
|
Token.Symbol.CHARCONST,
|
|
lexeme.toString()
|
|
);
|
|
|
|
nextChar();
|
|
}
|
|
|
|
private void stringConst() {
|
|
Report.Location startLocation = currentLocation();
|
|
StringBuilder lexeme = new StringBuilder();
|
|
|
|
lexeme.append((char) buffChar);
|
|
nextChar();
|
|
|
|
while (buffChar != '"') {
|
|
if (buffChar == '\n' || buffChar == -1) {
|
|
throw new Report.Error(currentLocation(), "Unterminated string.");
|
|
}
|
|
|
|
if (buffChar == '\\') {
|
|
lexeme.append((char) buffChar);
|
|
nextChar();
|
|
if (buffChar == 'n' || buffChar == '\\' || buffChar == '"') {
|
|
} else if (isHex()) {
|
|
lexeme.append((char) buffChar);
|
|
nextChar();
|
|
if (!isHex()) {
|
|
throw new Report.Error(currentLocation(), "Invalid ascii code '" + (char) buffChar + "'.");
|
|
}
|
|
} else {
|
|
throw new Report.Error(currentLocation(), "Invalid escaped character '" + (char) buffChar + "'.");
|
|
}
|
|
} else if (!isChar()) {
|
|
throw new Report.Error(currentLocation(), "Invalid character '" + (char) buffChar + "'.");
|
|
}
|
|
|
|
lexeme.append((char) buffChar);
|
|
nextChar();
|
|
}
|
|
|
|
lexeme.append((char) buffChar);
|
|
buffToken = new Token(
|
|
new Report.Location(startLocation, currentLocation()),
|
|
Token.Symbol.STRINGCONST,
|
|
lexeme.toString()
|
|
);
|
|
|
|
nextChar();
|
|
}
|
|
|
|
private void identifier() {
|
|
Report.Location startLocation = currentLocation();
|
|
Report.Location endLocation = currentLocation();
|
|
StringBuilder lexeme = new StringBuilder();
|
|
|
|
while (isAlphaNumeric()) {
|
|
lexeme.append((char) buffChar);
|
|
endLocation = currentLocation();
|
|
nextChar();
|
|
}
|
|
|
|
Token.Symbol symbol = getReservedWordSymbol(lexeme.toString());
|
|
if (symbol == null) {
|
|
symbol = Token.Symbol.IDENTIFIER;
|
|
}
|
|
|
|
if (symbol == Token.Symbol.LINE) {
|
|
buffToken = new Token(
|
|
new Report.Location(startLocation, endLocation),
|
|
Token.Symbol.INTCONST,
|
|
Integer.toString(startLocation.begLine())
|
|
);
|
|
return;
|
|
}
|
|
|
|
buffToken = new Token(
|
|
new Report.Location(startLocation, endLocation),
|
|
symbol,
|
|
lexeme.toString()
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Vrne trenutni leksikalni simbol, ki ostane v lastnistvu leksikalnega
|
|
* analizatorja.
|
|
*
|
|
* @return Leksikalni simbol.
|
|
*/
|
|
public Token peekToken() {
|
|
if (buffToken == null)
|
|
nextToken();
|
|
return buffToken;
|
|
}
|
|
|
|
/**
|
|
* Vrne trenutni leksikalni simbol, ki preide v lastnistvo klicoce kode.
|
|
*
|
|
* @return Leksikalni simbol.
|
|
*/
|
|
public Token takeToken() {
|
|
if (buffToken == null)
|
|
nextToken();
|
|
final Token thisToken = buffToken;
|
|
buffToken = null;
|
|
return thisToken;
|
|
}
|
|
|
|
// --- ZAGON ---
|
|
|
|
/**
|
|
* Zagon leksikalnega analizatorja kot samostojnega programa.
|
|
*
|
|
* @param cmdLineArgs Argumenti v ukazni vrstici.
|
|
*/
|
|
public static void main(final String[] cmdLineArgs) {
|
|
System.out.println("This is PINS'25 compiler (lexical analysis):");
|
|
|
|
try {
|
|
if (cmdLineArgs.length == 0)
|
|
throw new Report.Error("No source file specified in the command line.");
|
|
if (cmdLineArgs.length > 1)
|
|
Report.warning("Unused arguments in the command line.");
|
|
|
|
try (LexAn lexAn = new LexAn(cmdLineArgs[0])) {
|
|
while (lexAn.peekToken().symbol() != Token.Symbol.EOF)
|
|
System.out.println(lexAn.takeToken());
|
|
System.out.println(lexAn.takeToken());
|
|
}
|
|
|
|
// Upajmo, da kdaj pridemo to te tocke.
|
|
// A zavedajmo se sledecega:
|
|
// 1. Prevod je zaradi napak v programu lahko napacen :-o
|
|
// 2. Izvorni program se zdalec ni tisto, kar je programer hotel, da bi bil ;-)
|
|
Report.info("Done.");
|
|
} catch (Report.Error error) {
|
|
// Izpis opisa napake.
|
|
System.err.println(error.getMessage());
|
|
System.exit(1);
|
|
}
|
|
}
|
|
|
|
} |