ParseStateMachine.java
/*
* CSVeed (https://github.com/42BV/CSVeed)
*
* Copyright 2013-2023 CSVeed.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of The Apache Software License,
* Version 2.0 which accompanies this distribution, and is available at
* https://www.apache.org/licenses/LICENSE-2.0.txt
*/
package org.csveed.token;
import static org.csveed.token.ParseState.COMMENT_LINE;
import static org.csveed.token.ParseState.COMMENT_LINE_FINISHED;
import static org.csveed.token.ParseState.ESCAPING;
import static org.csveed.token.ParseState.FINISHED;
import static org.csveed.token.ParseState.FIRST_CHAR_INSIDE_QUOTED_FIELD;
import static org.csveed.token.ParseState.INSIDE_FIELD;
import static org.csveed.token.ParseState.INSIDE_QUOTED_FIELD;
import static org.csveed.token.ParseState.LINE_FINISHED;
import static org.csveed.token.ParseState.OUTSIDE_AFTER_FIELD;
import static org.csveed.token.ParseState.OUTSIDE_BEFORE_FIELD;
import static org.csveed.token.ParseState.SEPARATOR;
import static org.csveed.token.ParseState.SKIP_LINE;
import static org.csveed.token.ParseState.SKIP_LINE_FINISHED;
import static org.csveed.token.ParseState.START_OF_LINE;
import org.csveed.common.Column;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Yep, a state machine. Managing all kinds of booleans to form a pseudo-state doesn't work really well whereas a state
* machine does. The state machine takes one character at a time, checks routes to the new state if necessary and holds
* tokens, which it returns whenever a field-end ('popToken') has been found.
*/
public class ParseStateMachine {
/** The Constant LOG. */
private static final Logger LOG = LoggerFactory.getLogger(ParseStateMachine.class);
/** The state. */
private ParseState state = START_OF_LINE;
/** The token. */
private StringBuilder token = new StringBuilder();
/** The characters read. */
private int charactersRead;
/** The symbol mapping. */
private SymbolMapping symbolMapping = new SymbolMapping();
/** The token state. */
private TokenState tokenState = TokenState.RESET;
/** The trim. */
private boolean trim = true;
/** The trash. */
private boolean trash;
/** The current column. */
private Column currentColumn = new Column();
/** The current line. */
private int currentLine = 1;
/** The new line. */
private int newLine = currentLine;
/**
* Gets the current line.
*
* @return the current line
*/
public int getCurrentLine() {
return this.currentLine;
}
/**
* Gets the current column.
*
* @return the current column
*/
public int getCurrentColumn() {
return this.currentColumn.getColumnIndex();
}
/**
* Checks if is trash.
*
* @return true, if is trash
*/
public boolean isTrash() {
return this.trash;
}
/**
* Offer symbol.
*
* @param symbolCharacter
* the symbol character
*
* @return the string
*
* @throws ParseException
* the parse exception
*/
public String offerSymbol(int symbolCharacter) throws ParseException {
this.trash = false;
EncounteredSymbol symbol = symbolMapping.find(symbolCharacter, state);
if (symbol.isTrash()) {
this.trash = true;
return null;
}
if (isFinished()) {
throw new ParseException(state, symbolCharacter, symbol);
}
if (currentLine != newLine) {
state = START_OF_LINE;
charactersRead = 0;
currentColumn = currentColumn.nextLine();
currentLine = newLine;
}
if (currentLine < symbolMapping.getStartLine()) {
state = SKIP_LINE;
}
if (tokenState.isStart()) {
tokenState = tokenState.next();
}
ParseState newState = determineState(symbolCharacter, symbol);
LOG.debug("{} ({}): {} => {}", (char) symbolCharacter, symbol, state, newState);
if (newState.isTokenize()) {
if (tokenState.isReset()) {
trim = newState.trim();
tokenState = tokenState.next();
}
token.append((char) symbolCharacter);
}
String returnToken = null;
if (newState.isPopToken()) {
returnToken = token.toString();
if (trim) {
returnToken = returnToken.trim();
}
token = new StringBuilder();
tokenState = tokenState.next();
currentColumn = currentColumn.nextColumn();
}
if (newState.isLineFinished()) {
newLine++;
} else {
charactersRead++;
}
state = newState;
return returnToken;
}
/**
* Checks if is token start.
*
* @return true, if is token start
*/
public boolean isTokenStart() {
return tokenState.isStart();
}
/**
* Checks if is line finished.
*
* @return true, if is line finished
*/
public boolean isLineFinished() {
return state.isLineFinished();
}
/**
* Checks if is finished.
*
* @return true, if is finished
*/
public boolean isFinished() {
return state == FINISHED;
}
/**
* Ignore line.
*
* @return true, if successful
*/
public boolean ignoreLine() {
return state.isIgnore() || isEmptyLine();
}
/**
* Checks if is empty line.
*
* @return true, if is empty line
*/
public boolean isEmptyLine() {
return charactersRead == 0;
}
/**
* Determine state.
*
* @param symbolCharacter
* the symbol character
* @param symbol
* the symbol
*
* @return the parses the state
*
* @throws ParseException
* the parse exception
*/
protected ParseState determineState(int symbolCharacter, EncounteredSymbol symbol) throws ParseException {
switch (state) {
case SKIP_LINE:
switch (symbol) {
case EOL_SYMBOL:
return SKIP_LINE_FINISHED;
case END_OF_FILE_SYMBOL:
return FINISHED;
default:
return SKIP_LINE;
}
case COMMENT_LINE:
switch (symbol) {
case EOL_SYMBOL:
return COMMENT_LINE_FINISHED;
case END_OF_FILE_SYMBOL:
return FINISHED;
default:
return COMMENT_LINE;
}
case START_OF_LINE:
if (EncounteredSymbol.COMMENT_SYMBOL.equals(symbol) && symbolMapping.isSkipCommentLines()) {
return COMMENT_LINE;
}
//$FALL-THROUGH$
case SEPARATOR:
switch (symbol) {
case SPACE_SYMBOL:
return OUTSIDE_BEFORE_FIELD;
case QUOTE_SYMBOL:
return FIRST_CHAR_INSIDE_QUOTED_FIELD;
case SEPARATOR_SYMBOL:
return SEPARATOR;
case END_OF_FILE_SYMBOL:
return FINISHED;
case EOL_SYMBOL:
return LINE_FINISHED;
default:
return INSIDE_FIELD;
}
case OUTSIDE_BEFORE_FIELD:
switch (symbol) {
case SPACE_SYMBOL:
return OUTSIDE_BEFORE_FIELD;
case SEPARATOR_SYMBOL:
return SEPARATOR;
case END_OF_FILE_SYMBOL:
return FINISHED;
case EOL_SYMBOL:
return LINE_FINISHED;
case QUOTE_SYMBOL:
return FIRST_CHAR_INSIDE_QUOTED_FIELD;
default:
return INSIDE_FIELD;
}
case OUTSIDE_AFTER_FIELD:
switch (symbol) {
case SPACE_SYMBOL:
return OUTSIDE_AFTER_FIELD;
case SEPARATOR_SYMBOL:
return SEPARATOR;
case END_OF_FILE_SYMBOL:
return FINISHED;
case EOL_SYMBOL:
return LINE_FINISHED;
default:
throw new ParseException(state, symbolCharacter, symbol);
}
case INSIDE_FIELD:
switch (symbol) {
case SEPARATOR_SYMBOL:
return SEPARATOR;
case END_OF_FILE_SYMBOL:
return FINISHED;
case EOL_SYMBOL:
return LINE_FINISHED;
case QUOTE_SYMBOL:
throw new ParseException(state, symbolCharacter, symbol);
default:
return INSIDE_FIELD;
}
case FIRST_CHAR_INSIDE_QUOTED_FIELD:
case INSIDE_QUOTED_FIELD:
switch (symbol) {
case QUOTE_SYMBOL:
return OUTSIDE_AFTER_FIELD;
case ESCAPE_SYMBOL:
return ESCAPING;
case END_OF_FILE_SYMBOL:
throw new ParseException(state, symbolCharacter, symbol);
default:
return INSIDE_QUOTED_FIELD;
}
case ESCAPING:
if (symbolMapping.isSameCharactersForEscapeAndQuote()) { // This is the default
switch (symbol) {
case SPACE_SYMBOL:
return OUTSIDE_AFTER_FIELD;
case QUOTE_SYMBOL:
return INSIDE_QUOTED_FIELD;
case EOL_SYMBOL: // Needed when quote/escape are the same: ...abc"\n
return LINE_FINISHED;
case SEPARATOR_SYMBOL: // Needed when quote/escape are the same: ...abc";
return SEPARATOR;
case END_OF_FILE_SYMBOL:
return FINISHED;
default:
throw new ParseException(state, symbolCharacter, symbol);
}
}
// We're lenient -- accept everything
return INSIDE_QUOTED_FIELD;
default:
throw new ParseException(state, symbolCharacter, symbol);
}
}
/**
* Sets the symbol mapping.
*
* @param symbolMapping
* the new symbol mapping
*/
public void setSymbolMapping(SymbolMapping symbolMapping) {
this.symbolMapping = symbolMapping;
}
/**
* Gets the symbol mapping.
*
* @return the symbol mapping
*/
public SymbolMapping getSymbolMapping() {
return this.symbolMapping;
}
}