0% found this document useful (0 votes)
3 views4 pages

a

The Lexer class is designed to tokenize an input string into various token types such as digits, identifiers, and symbols. It processes the input character by character, identifying and categorizing each token while handling special cases like function calls and multi-character operators. The class also includes methods for printing the generated tokens and managing errors during tokenization.

Uploaded by

txenet
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
3 views4 pages

a

The Lexer class is designed to tokenize an input string into various token types such as digits, identifiers, and symbols. It processes the input character by character, identifying and categorizing each token while handling special cases like function calls and multi-character operators. The class also includes methods for printing the generated tokens and managing errors during tokenization.

Uploaded by

txenet
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 4

package com.egenoves.

eva;

import java.util.ArrayList;

import static com.egenoves.eva.TokenType.*;

public class Lexer {


// Index into input of the character currently being examined.
int currentPos;
// Index into input where the token being scanned begins; assigned by
// getNextDigit() and getNextIdentifier() before they start consuming characters.
int startPos;
// Becomes true once getNextSymbol() meets a character it cannot classify.
boolean error = false;
// The text being tokenized.
final String input;
// Tokens produced by tokenize(), in the order they were added.
final ArrayList<Token> tokens = new ArrayList<>();

/**
* Constructor
* Pass the string to be tokenized in the constructor
* and get the tokens with the method getTokens()
* @param input String to be tokenized
*/
public Lexer(String input) {
    // Both cursors start on the first character of the text.
    this.startPos = 0;
    this.currentPos = 0;
    this.input = input;
    // Tokenization is eager: the token list is filled before the constructor returns.
    tokenize();
}

/**
* Tokenize the input string
* Three cases: is a digit, is an identifier or is a symbol
* a) A digit is 0 - 9 and DOT character
* b) An identifier is a variable name comprised of letters
* c) A symbol is a character that is not a digit or a letter
* like operators, parenthesis, etc
* The resulting tokens are appended to the tokens list; nothing is returned.
*/
private void tokenize() {
    // Walks the input from currentPos to the end. The first character of each
    // token decides which scanner runs; every scanner advances currentPos past
    // the characters it consumed, so the loop always makes progress.
    while (currentPos < input.length()) {
        char c = input.charAt(currentPos);

        // Character.isSpaceChar() only matches Unicode space separators, so tabs
        // and line breaks were handed to getNextSymbol() and came out as error
        // tokens. isWhitespace() covers those; isSpaceChar() is kept so that
        // characters skipped before (e.g. non-breaking space) are still skipped.
        if (Character.isWhitespace(c) || Character.isSpaceChar(c)) {
            currentPos++;
            continue;
        }

        Token token;
        if (Character.isDigit(c)) {
            token = getNextDigit();
        } else if (Character.isLetter(c)) {
            token = getNextIdentifier();
        } else {
            token = getNextSymbol();
        }

        this.tokens.add(token);
    }
}

/**
* Get the next number, can be integer or float if there is a DOT
* @return Token
*/
private Token getNextDigit() {
    // Scans a numeric literal starting at currentPos: a run of digits with at
    // most one '.'. Returns TOKEN_FLOAT when a '.' was consumed, TOKEN_INTEGER
    // otherwise. On return currentPos is the index just past the literal.
    startPos = currentPos;
    boolean floatNumber = false;

    while (currentPos < input.length()) {
        char c = input.charAt(currentPos);
        if (c == '.') {
            // A second '.' ends the literal. Previously every '.' was consumed,
            // so "1.2.3" was emitted as a single malformed TOKEN_FLOAT.
            if (floatNumber) {
                break;
            }
            floatNumber = true;
        } else if (!Character.isDigit(c)) {
            break;
        }
        currentPos++;
    }

    String lexeme = input.substring(startPos, currentPos);
    if (floatNumber) {
        return new Token(TOKEN_FLOAT, lexeme, startPos);
    }
    return new Token(TOKEN_INTEGER, lexeme, startPos);
}

/**
* Get the next identifier, a variable name
* There is a special case for function/method call, in that case all the remaining
* chars are parsed between parenthesis returning the whole call,
* i.e. the lexer returns a token
* like TOKEN_CALL callFunction(a,b)
* @return Token
*/
private Token getNextIdentifier() {
    // Scans a run of letters starting at currentPos. If the run is immediately
    // followed by '(', the whole call text up to the matching ')' is returned as
    // one TOKEN_CALL; otherwise the letters are returned as TOKEN_IDENTIFIER.
    // On return currentPos is the index just past the returned text.
    startPos = currentPos;
    final int length = input.length();

    while (currentPos < length && Character.isLetter(input.charAt(currentPos))) {
        currentPos++;
    }

    // Plain identifier: end of input reached, or the next character is not '('.
    if (currentPos == length || input.charAt(currentPos) != '(') {
        return new Token(TOKEN_IDENTIFIER, input.substring(startPos, currentPos), startPos);
    }

    // Function/method call: consume up to the ')' that balances the opening '('.
    // The previous loop stopped at the first ')' it met, so a nested call such as
    // f(g(a),b) was cut short to "f(g(a)".
    int depth = 0;
    while (currentPos < length) {
        char c = input.charAt(currentPos);
        currentPos++;
        if (c == '(') {
            depth++;
        } else if (c == ')') {
            depth--;
            if (depth == 0) {
                break;
            }
        }
    }
    // If the input ends before the parentheses balance, the token covers the
    // rest of the input (same as before); no error flag is raised here.
    return new Token(TOKEN_CALL, input.substring(startPos, currentPos), startPos);
}

/**
* Get the next symbol, like operators, parenthesis, etc.
* There are special cases for operators that can be composed of two characters
* which are usually the comparison operators
*
* There is a special case for expression reference, which is a number preceded by %
* meaning the expression %n is going to be replaced by the n-th expression that appears
* in the context. i.e. print(%7) will print the 7-th expression where %7 == 2+3
*
* @return Token
*/
private Token getNextSymbol() {
char c = input.charAt(currentPos);
Token token = null;
switch(c) {
case '+' -> token = new Token(TOKEN_ADDITION, "+", currentPos);
case '-' -> token = new Token(TOKEN_SUBTRACTION, "-", currentPos);
case '*' -> token = new Token(TOKEN_MULTIPLY, "*", currentPos);
case '/' -> token = new Token(TOKEN_DIVISION, "/", currentPos);
case '(' -> token = new Token(TOKEN_LEFT_PARENTHESIS, "(", currentPos);
case ')' -> token = new Token(TOKEN_RIGHT_PARENTHESIS, ")",
currentPos);
case '{' -> token = new Token(TOKEN_LEFT_BRACE, "{", currentPos);
case '}' -> token = new Token(TOKEN_RIGHT_BRACE, "}", currentPos);
case ',' -> token = new Token(TOKEN_COMMA, ",", currentPos);
case '^' -> token = new Token(TOKEN_POWER, "^", currentPos);
case '='->{
//Next token can be ==, =<, =>,
switch (lookAhead()){
case '=' -> {token = new Token(TOKEN_EQUAL_EQUAL, "==",
currentPos); currentPos++;}
case '<' -> {token = new Token(TOKEn_EQUAL_MINOR, "=<",
currentPos); currentPos++;}
case '>' -> {token = new Token(TOKEN_EQUAL_MAJOR, "=>",
currentPos); currentPos++;}
default -> token = new Token(TOKEN_EQUAL, "=", currentPos);
}
}
case '<'->{
//Next token can be ==, =<, =>,
switch (lookAhead()){
case '=' -> {token = new Token(TOKEn_LESS_EQUAL, "<=",
currentPos); currentPos++;}
case '>' -> {token = new Token(TOKEN_NOT_EQUAL, "<>",
currentPos); currentPos++;}
default -> token = new Token(TOKEN_MINOR, "<", currentPos);
}
}
case '>'->{
//Next token can be ==, =<, =>,
if (lookAhead()=='='){
token = new Token(TOKEN_GREATER_EQUAl, "<=", currentPos);
currentPos++;
} else token = new Token(TOKEN_GREATER, ">", currentPos);
}
case '%'->{
boolean isExprReference = false;

while(Character.isDigit(lookAhead()) || lookAhead()=='.'){
isExprReference = true;
currentPos++;
}
if(isExprReference) token = new Token(TOKEN_EXPRESSION_REFERENCE,
input.substring(startPos, currentPos+1), startPos);
}
default -> {
this.error = true;
currentPos++;
return new Token(TOKEN_EXPRESSION_ERROR, "Error, invalid character
" +c, currentPos);
}
}
currentPos++;
return token;
}

/**
* Helper method to look ahead in the input string
* @return char
*/
private char lookAhead() {
    // Returns the character right after currentPos without consuming it, or
    // '\0' when there is none. The check is ">=" rather than "==" so that a
    // cursor already at or past the end of the input also yields the sentinel
    // instead of a StringIndexOutOfBoundsException from charAt().
    final int next = currentPos + 1;
    if (next >= input.length()) {
        return '\0';
    }
    return input.charAt(next);
}

/**
* Helper method to print the tokens tokenized
* Prints one token per line to standard output; nothing is returned.
*/
public void printTokens() {
    // Writes every collected token on its own line of standard output, in list order.
    tokens.forEach(System.out::println);
}

You might also like