1
2
3
4
5
6
7
8
9
10
11 package org.csveed.token;
12
13 import static org.csveed.token.ParseState.COMMENT_LINE;
14 import static org.csveed.token.ParseState.COMMENT_LINE_FINISHED;
15 import static org.csveed.token.ParseState.ESCAPING;
16 import static org.csveed.token.ParseState.FINISHED;
17 import static org.csveed.token.ParseState.FIRST_CHAR_INSIDE_QUOTED_FIELD;
18 import static org.csveed.token.ParseState.INSIDE_FIELD;
19 import static org.csveed.token.ParseState.INSIDE_QUOTED_FIELD;
20 import static org.csveed.token.ParseState.LINE_FINISHED;
21 import static org.csveed.token.ParseState.OUTSIDE_AFTER_FIELD;
22 import static org.csveed.token.ParseState.OUTSIDE_BEFORE_FIELD;
23 import static org.csveed.token.ParseState.SEPARATOR;
24 import static org.csveed.token.ParseState.SKIP_LINE;
25 import static org.csveed.token.ParseState.SKIP_LINE_FINISHED;
26 import static org.csveed.token.ParseState.START_OF_LINE;
27
28 import org.csveed.common.Column;
29 import org.slf4j.Logger;
30 import org.slf4j.LoggerFactory;
31
32
33
34
35
36
37 public class ParseStateMachine {
38
39
40 private static final Logger LOG = LoggerFactory.getLogger(ParseStateMachine.class);
41
42
43 private ParseState state = START_OF_LINE;
44
45
46 private StringBuilder token = new StringBuilder();
47
48
49 private int charactersRead;
50
51
52 private SymbolMapping symbolMapping = new SymbolMapping();
53
54
55 private TokenState tokenState = TokenState.RESET;
56
57
58 private boolean trim = true;
59
60
61 private boolean trash;
62
63
64 private Column currentColumn = new Column();
65
66
67 private int currentLine = 1;
68
69
70 private int newLine = currentLine;
71
72
73
74
75
76
77 public int getCurrentLine() {
78 return this.currentLine;
79 }
80
81
82
83
84
85
86 public int getCurrentColumn() {
87 return this.currentColumn.getColumnIndex();
88 }
89
90
91
92
93
94
95 public boolean isTrash() {
96 return this.trash;
97 }
98
99
100
101
102
103
104
105
106
107
108
109
110 public String offerSymbol(int symbolCharacter) throws ParseException {
111
112 this.trash = false;
113
114 EncounteredSymbol symbol = symbolMapping.find(symbolCharacter, state);
115
116 if (symbol.isTrash()) {
117 this.trash = true;
118 return null;
119 }
120
121 if (isFinished()) {
122 throw new ParseException(state, symbolCharacter, symbol);
123 }
124
125 if (currentLine != newLine) {
126 state = START_OF_LINE;
127 charactersRead = 0;
128 currentColumn = currentColumn.nextLine();
129 currentLine = newLine;
130 }
131
132 if (currentLine < symbolMapping.getStartLine()) {
133 state = SKIP_LINE;
134 }
135
136 if (tokenState.isStart()) {
137 tokenState = tokenState.next();
138 }
139
140 ParseState newState = determineState(symbolCharacter, symbol);
141 LOG.debug("{} ({}): {} => {}", (char) symbolCharacter, symbol, state, newState);
142
143 if (newState.isTokenize()) {
144 if (tokenState.isReset()) {
145 trim = newState.trim();
146 tokenState = tokenState.next();
147 }
148 token.append((char) symbolCharacter);
149 }
150 String returnToken = null;
151
152 if (newState.isPopToken()) {
153 returnToken = token.toString();
154 if (trim) {
155 returnToken = returnToken.trim();
156 }
157 token = new StringBuilder();
158 tokenState = tokenState.next();
159 currentColumn = currentColumn.nextColumn();
160 }
161
162 if (newState.isLineFinished()) {
163 newLine++;
164 } else {
165 charactersRead++;
166 }
167
168 state = newState;
169
170 return returnToken;
171 }
172
173
174
175
176
177
178 public boolean isTokenStart() {
179 return tokenState.isStart();
180 }
181
182
183
184
185
186
187 public boolean isLineFinished() {
188 return state.isLineFinished();
189 }
190
191
192
193
194
195
196 public boolean isFinished() {
197 return state == FINISHED;
198 }
199
200
201
202
203
204
205 public boolean ignoreLine() {
206 return state.isIgnore() || isEmptyLine();
207 }
208
209
210
211
212
213
214 public boolean isEmptyLine() {
215 return charactersRead == 0;
216 }
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231 protected ParseState determineState(int symbolCharacter, EncounteredSymbol symbol) throws ParseException {
232
233 switch (state) {
234 case SKIP_LINE:
235 switch (symbol) {
236 case EOL_SYMBOL:
237 return SKIP_LINE_FINISHED;
238 case END_OF_FILE_SYMBOL:
239 return FINISHED;
240 default:
241 return SKIP_LINE;
242 }
243 case COMMENT_LINE:
244 switch (symbol) {
245 case EOL_SYMBOL:
246 return COMMENT_LINE_FINISHED;
247 case END_OF_FILE_SYMBOL:
248 return FINISHED;
249 default:
250 return COMMENT_LINE;
251 }
252 case START_OF_LINE:
253 if (EncounteredSymbol.COMMENT_SYMBOL.equals(symbol) && symbolMapping.isSkipCommentLines()) {
254 return COMMENT_LINE;
255 }
256
257 case SEPARATOR:
258 switch (symbol) {
259 case SPACE_SYMBOL:
260 return OUTSIDE_BEFORE_FIELD;
261 case QUOTE_SYMBOL:
262 return FIRST_CHAR_INSIDE_QUOTED_FIELD;
263 case SEPARATOR_SYMBOL:
264 return SEPARATOR;
265 case END_OF_FILE_SYMBOL:
266 return FINISHED;
267 case EOL_SYMBOL:
268 return LINE_FINISHED;
269 default:
270 return INSIDE_FIELD;
271 }
272 case OUTSIDE_BEFORE_FIELD:
273 switch (symbol) {
274 case SPACE_SYMBOL:
275 return OUTSIDE_BEFORE_FIELD;
276 case SEPARATOR_SYMBOL:
277 return SEPARATOR;
278 case END_OF_FILE_SYMBOL:
279 return FINISHED;
280 case EOL_SYMBOL:
281 return LINE_FINISHED;
282 case QUOTE_SYMBOL:
283 return FIRST_CHAR_INSIDE_QUOTED_FIELD;
284 default:
285 return INSIDE_FIELD;
286 }
287 case OUTSIDE_AFTER_FIELD:
288 switch (symbol) {
289 case SPACE_SYMBOL:
290 return OUTSIDE_AFTER_FIELD;
291 case SEPARATOR_SYMBOL:
292 return SEPARATOR;
293 case END_OF_FILE_SYMBOL:
294 return FINISHED;
295 case EOL_SYMBOL:
296 return LINE_FINISHED;
297 default:
298 throw new ParseException(state, symbolCharacter, symbol);
299 }
300 case INSIDE_FIELD:
301 switch (symbol) {
302 case SEPARATOR_SYMBOL:
303 return SEPARATOR;
304 case END_OF_FILE_SYMBOL:
305 return FINISHED;
306 case EOL_SYMBOL:
307 return LINE_FINISHED;
308 case QUOTE_SYMBOL:
309 throw new ParseException(state, symbolCharacter, symbol);
310 default:
311 return INSIDE_FIELD;
312 }
313 case FIRST_CHAR_INSIDE_QUOTED_FIELD:
314 case INSIDE_QUOTED_FIELD:
315 switch (symbol) {
316 case QUOTE_SYMBOL:
317 return OUTSIDE_AFTER_FIELD;
318 case ESCAPE_SYMBOL:
319 return ESCAPING;
320 case END_OF_FILE_SYMBOL:
321 throw new ParseException(state, symbolCharacter, symbol);
322 default:
323 return INSIDE_QUOTED_FIELD;
324 }
325 case ESCAPING:
326 if (symbolMapping.isSameCharactersForEscapeAndQuote()) {
327 switch (symbol) {
328 case SPACE_SYMBOL:
329 return OUTSIDE_AFTER_FIELD;
330 case QUOTE_SYMBOL:
331 return INSIDE_QUOTED_FIELD;
332 case EOL_SYMBOL:
333 return LINE_FINISHED;
334 case SEPARATOR_SYMBOL:
335 return SEPARATOR;
336 case END_OF_FILE_SYMBOL:
337 return FINISHED;
338 default:
339 throw new ParseException(state, symbolCharacter, symbol);
340 }
341 }
342
343 return INSIDE_QUOTED_FIELD;
344 default:
345 throw new ParseException(state, symbolCharacter, symbol);
346 }
347 }
348
349
350
351
352
353
354
355 public void setSymbolMapping(SymbolMapping symbolMapping) {
356 this.symbolMapping = symbolMapping;
357 }
358
359
360
361
362
363
364 public SymbolMapping getSymbolMapping() {
365 return this.symbolMapping;
366 }
367
368 }