View Javadoc
1   /*
2    * CSVeed (https://github.com/42BV/CSVeed)
3    *
4    * Copyright 2013-2023 CSVeed.
5    *
6    * All rights reserved. This program and the accompanying materials
7    * are made available under the terms of The Apache Software License,
8    * Version 2.0 which accompanies this distribution, and is available at
9    * https://www.apache.org/licenses/LICENSE-2.0.txt
10   */
11  package org.csveed.token;
12  
13  import static org.csveed.token.ParseState.COMMENT_LINE;
14  import static org.csveed.token.ParseState.COMMENT_LINE_FINISHED;
15  import static org.csveed.token.ParseState.ESCAPING;
16  import static org.csveed.token.ParseState.FINISHED;
17  import static org.csveed.token.ParseState.FIRST_CHAR_INSIDE_QUOTED_FIELD;
18  import static org.csveed.token.ParseState.INSIDE_FIELD;
19  import static org.csveed.token.ParseState.INSIDE_QUOTED_FIELD;
20  import static org.csveed.token.ParseState.LINE_FINISHED;
21  import static org.csveed.token.ParseState.OUTSIDE_AFTER_FIELD;
22  import static org.csveed.token.ParseState.OUTSIDE_BEFORE_FIELD;
23  import static org.csveed.token.ParseState.SEPARATOR;
24  import static org.csveed.token.ParseState.SKIP_LINE;
25  import static org.csveed.token.ParseState.SKIP_LINE_FINISHED;
26  import static org.csveed.token.ParseState.START_OF_LINE;
27  
28  import org.csveed.common.Column;
29  import org.slf4j.Logger;
30  import org.slf4j.LoggerFactory;
31  
32  /**
33   * Yep, a state machine. Managing all kinds of booleans to form a pseudo-state doesn't work really well whereas a state
34   * machine does. The state machine takes one character at a time, checks routes to the new state if necessary and holds
35   * tokens, which it returns whenever a field-end ('popToken') has been found.
36   */
37  public class ParseStateMachine {
38  
39      /** The Constant LOG. */
40      private static final Logger LOG = LoggerFactory.getLogger(ParseStateMachine.class);
41  
42      /** The state. */
43      private ParseState state = START_OF_LINE;
44  
45      /** The token. */
46      private StringBuilder token = new StringBuilder();
47  
48      /** The characters read. */
49      private int charactersRead;
50  
51      /** The symbol mapping. */
52      private SymbolMapping symbolMapping = new SymbolMapping();
53  
54      /** The token state. */
55      private TokenState tokenState = TokenState.RESET;
56  
57      /** The trim. */
58      private boolean trim = true;
59  
60      /** The trash. */
61      private boolean trash;
62  
63      /** The current column. */
64      private Column currentColumn = new Column();
65  
66      /** The current line. */
67      private int currentLine = 1;
68  
69      /** The new line. */
70      private int newLine = currentLine;
71  
72      /**
73       * Gets the current line.
74       *
75       * @return the current line
76       */
77      public int getCurrentLine() {
78          return this.currentLine;
79      }
80  
81      /**
82       * Gets the current column.
83       *
84       * @return the current column
85       */
86      public int getCurrentColumn() {
87          return this.currentColumn.getColumnIndex();
88      }
89  
90      /**
91       * Checks if is trash.
92       *
93       * @return true, if is trash
94       */
95      public boolean isTrash() {
96          return this.trash;
97      }
98  
99      /**
100      * Offer symbol.
101      *
102      * @param symbolCharacter
103      *            the symbol character
104      *
105      * @return the string
106      *
107      * @throws ParseException
108      *             the parse exception
109      */
110     public String offerSymbol(int symbolCharacter) throws ParseException {
111 
112         this.trash = false;
113 
114         EncounteredSymbol symbol = symbolMapping.find(symbolCharacter, state);
115 
116         if (symbol.isTrash()) {
117             this.trash = true;
118             return null;
119         }
120 
121         if (isFinished()) {
122             throw new ParseException(state, symbolCharacter, symbol);
123         }
124 
125         if (currentLine != newLine) {
126             state = START_OF_LINE;
127             charactersRead = 0;
128             currentColumn = currentColumn.nextLine();
129             currentLine = newLine;
130         }
131 
132         if (currentLine < symbolMapping.getStartLine()) {
133             state = SKIP_LINE;
134         }
135 
136         if (tokenState.isStart()) {
137             tokenState = tokenState.next();
138         }
139 
140         ParseState newState = determineState(symbolCharacter, symbol);
141         LOG.debug("{} ({}): {} => {}", (char) symbolCharacter, symbol, state, newState);
142 
143         if (newState.isTokenize()) {
144             if (tokenState.isReset()) {
145                 trim = newState.trim();
146                 tokenState = tokenState.next();
147             }
148             token.append((char) symbolCharacter);
149         }
150         String returnToken = null;
151 
152         if (newState.isPopToken()) {
153             returnToken = token.toString();
154             if (trim) {
155                 returnToken = returnToken.trim();
156             }
157             token = new StringBuilder();
158             tokenState = tokenState.next();
159             currentColumn = currentColumn.nextColumn();
160         }
161 
162         if (newState.isLineFinished()) {
163             newLine++;
164         } else {
165             charactersRead++;
166         }
167 
168         state = newState;
169 
170         return returnToken;
171     }
172 
173     /**
174      * Checks if is token start.
175      *
176      * @return true, if is token start
177      */
178     public boolean isTokenStart() {
179         return tokenState.isStart();
180     }
181 
182     /**
183      * Checks if is line finished.
184      *
185      * @return true, if is line finished
186      */
187     public boolean isLineFinished() {
188         return state.isLineFinished();
189     }
190 
191     /**
192      * Checks if is finished.
193      *
194      * @return true, if is finished
195      */
196     public boolean isFinished() {
197         return state == FINISHED;
198     }
199 
200     /**
201      * Ignore line.
202      *
203      * @return true, if successful
204      */
205     public boolean ignoreLine() {
206         return state.isIgnore() || isEmptyLine();
207     }
208 
209     /**
210      * Checks if is empty line.
211      *
212      * @return true, if is empty line
213      */
214     public boolean isEmptyLine() {
215         return charactersRead == 0;
216     }
217 
218     /**
219      * Determine state.
220      *
221      * @param symbolCharacter
222      *            the symbol character
223      * @param symbol
224      *            the symbol
225      *
226      * @return the parses the state
227      *
228      * @throws ParseException
229      *             the parse exception
230      */
231     protected ParseState determineState(int symbolCharacter, EncounteredSymbol symbol) throws ParseException {
232 
233         switch (state) {
234             case SKIP_LINE:
235                 switch (symbol) {
236                     case EOL_SYMBOL:
237                         return SKIP_LINE_FINISHED;
238                     case END_OF_FILE_SYMBOL:
239                         return FINISHED;
240                     default:
241                         return SKIP_LINE;
242                 }
243             case COMMENT_LINE:
244                 switch (symbol) {
245                     case EOL_SYMBOL:
246                         return COMMENT_LINE_FINISHED;
247                     case END_OF_FILE_SYMBOL:
248                         return FINISHED;
249                     default:
250                         return COMMENT_LINE;
251                 }
252             case START_OF_LINE:
253                 if (EncounteredSymbol.COMMENT_SYMBOL.equals(symbol) && symbolMapping.isSkipCommentLines()) {
254                     return COMMENT_LINE;
255                 }
256                 //$FALL-THROUGH$
257             case SEPARATOR:
258                 switch (symbol) {
259                     case SPACE_SYMBOL:
260                         return OUTSIDE_BEFORE_FIELD;
261                     case QUOTE_SYMBOL:
262                         return FIRST_CHAR_INSIDE_QUOTED_FIELD;
263                     case SEPARATOR_SYMBOL:
264                         return SEPARATOR;
265                     case END_OF_FILE_SYMBOL:
266                         return FINISHED;
267                     case EOL_SYMBOL:
268                         return LINE_FINISHED;
269                     default:
270                         return INSIDE_FIELD;
271                 }
272             case OUTSIDE_BEFORE_FIELD:
273                 switch (symbol) {
274                     case SPACE_SYMBOL:
275                         return OUTSIDE_BEFORE_FIELD;
276                     case SEPARATOR_SYMBOL:
277                         return SEPARATOR;
278                     case END_OF_FILE_SYMBOL:
279                         return FINISHED;
280                     case EOL_SYMBOL:
281                         return LINE_FINISHED;
282                     case QUOTE_SYMBOL:
283                         return FIRST_CHAR_INSIDE_QUOTED_FIELD;
284                     default:
285                         return INSIDE_FIELD;
286                 }
287             case OUTSIDE_AFTER_FIELD:
288                 switch (symbol) {
289                     case SPACE_SYMBOL:
290                         return OUTSIDE_AFTER_FIELD;
291                     case SEPARATOR_SYMBOL:
292                         return SEPARATOR;
293                     case END_OF_FILE_SYMBOL:
294                         return FINISHED;
295                     case EOL_SYMBOL:
296                         return LINE_FINISHED;
297                     default:
298                         throw new ParseException(state, symbolCharacter, symbol);
299                 }
300             case INSIDE_FIELD:
301                 switch (symbol) {
302                     case SEPARATOR_SYMBOL:
303                         return SEPARATOR;
304                     case END_OF_FILE_SYMBOL:
305                         return FINISHED;
306                     case EOL_SYMBOL:
307                         return LINE_FINISHED;
308                     case QUOTE_SYMBOL:
309                         throw new ParseException(state, symbolCharacter, symbol);
310                     default:
311                         return INSIDE_FIELD;
312                 }
313             case FIRST_CHAR_INSIDE_QUOTED_FIELD:
314             case INSIDE_QUOTED_FIELD:
315                 switch (symbol) {
316                     case QUOTE_SYMBOL:
317                         return OUTSIDE_AFTER_FIELD;
318                     case ESCAPE_SYMBOL:
319                         return ESCAPING;
320                     case END_OF_FILE_SYMBOL:
321                         throw new ParseException(state, symbolCharacter, symbol);
322                     default:
323                         return INSIDE_QUOTED_FIELD;
324                 }
325             case ESCAPING:
326                 if (symbolMapping.isSameCharactersForEscapeAndQuote()) { // This is the default
327                     switch (symbol) {
328                         case SPACE_SYMBOL:
329                             return OUTSIDE_AFTER_FIELD;
330                         case QUOTE_SYMBOL:
331                             return INSIDE_QUOTED_FIELD;
332                         case EOL_SYMBOL: // Needed when quote/escape are the same: ...abc"\n
333                             return LINE_FINISHED;
334                         case SEPARATOR_SYMBOL: // Needed when quote/escape are the same: ...abc";
335                             return SEPARATOR;
336                         case END_OF_FILE_SYMBOL:
337                             return FINISHED;
338                         default:
339                             throw new ParseException(state, symbolCharacter, symbol);
340                     }
341                 }
342                 // We're lenient -- accept everything
343                 return INSIDE_QUOTED_FIELD;
344             default:
345                 throw new ParseException(state, symbolCharacter, symbol);
346         }
347     }
348 
349     /**
350      * Sets the symbol mapping.
351      *
352      * @param symbolMapping
353      *            the new symbol mapping
354      */
355     public void setSymbolMapping(SymbolMapping symbolMapping) {
356         this.symbolMapping = symbolMapping;
357     }
358 
359     /**
360      * Gets the symbol mapping.
361      *
362      * @return the symbol mapping
363      */
364     public SymbolMapping getSymbolMapping() {
365         return this.symbolMapping;
366     }
367 
368 }