View Javadoc
1   /*
2    * CSVeed (https://github.com/42BV/CSVeed)
3    *
4    * Copyright 2013-2023 CSVeed.
5    *
6    * All rights reserved. This program and the accompanying materials
7    * are made available under the terms of The Apache Software License,
8    * Version 2.0 which accompanies this distribution, and is available at
9    * https://www.apache.org/licenses/LICENSE-2.0.txt
10   */
11  package org.csveed.token;
12  
13  import static org.csveed.token.EncounteredSymbol.END_OF_FILE_SYMBOL;
14  import static org.csveed.token.EncounteredSymbol.EOL_SYMBOL;
15  import static org.csveed.token.EncounteredSymbol.EOL_SYMBOL_TRASH;
16  import static org.csveed.token.EncounteredSymbol.ESCAPE_SYMBOL;
17  import static org.csveed.token.EncounteredSymbol.OTHER_SYMBOL;
18  import static org.csveed.token.EncounteredSymbol.QUOTE_SYMBOL;
19  
20  import java.util.Map;
21  import java.util.TreeMap;
22  
23  import org.csveed.report.CsvException;
24  import org.csveed.report.GeneralError;
25  import org.slf4j.Logger;
26  import org.slf4j.LoggerFactory;
27  
28  /**
29   * The Class SymbolMapping.
30   */
31  public class SymbolMapping {
32  
33      /** The Constant LOG. */
34      private static final Logger LOG = LoggerFactory.getLogger(SymbolMapping.class);
35  
36      /** The symbol to chars. */
37      private Map<EncounteredSymbol, char[]> symbolToChars = new TreeMap<>();
38  
39      /** The char to symbol. */
40      private Map<Character, EncounteredSymbol> charToSymbol = new TreeMap<>();
41  
42      /** The escape character. */
43      private Character escapeCharacter;
44  
45      /** The quote character. */
46      private Character quoteCharacter;
47  
48      /** The settings logged. */
49      private boolean settingsLogged;
50  
51      /** The start line. */
52      private int startLine = 1;
53  
54      /** The skip comment lines. */
55      private boolean skipCommentLines = true;
56  
57      /**
58       * The accepted end of line.
59       * <p>
60       * When multiple EOL characters have been given, only the first one encountered will be accepted.
61       */
62      private char acceptedEndOfLine;
63  
64      /**
65       * Instantiates a new symbol mapping.
66       */
67      public SymbolMapping() {
68          initDefaultMapping();
69      }
70  
71      /**
72       * Inits the default mapping.
73       */
74      public void initDefaultMapping() {
75          addMapping(EncounteredSymbol.ESCAPE_SYMBOL, '"');
76          addMapping(EncounteredSymbol.QUOTE_SYMBOL, '"');
77          addMapping(EncounteredSymbol.SEPARATOR_SYMBOL, ';');
78          addMapping(EncounteredSymbol.EOL_SYMBOL, new char[] { '\r', '\n' });
79          addMapping(EncounteredSymbol.SPACE_SYMBOL, ' ');
80          addMapping(EncounteredSymbol.BOM_SYMBOL, '\uFEFF');
81          addMapping(EncounteredSymbol.COMMENT_SYMBOL, '#');
82      }
83  
84      /**
85       * Gets the first mapped character.
86       *
87       * @param encounteredSymbol
88       *            the encountered symbol
89       *
90       * @return the first mapped character
91       */
92      public char getFirstMappedCharacter(EncounteredSymbol encounteredSymbol) {
93          char[] mappedCharacters = getMappedCharacters(encounteredSymbol);
94          return mappedCharacters == null ? 0 : mappedCharacters[0];
95      }
96  
97      /**
98       * Gets the mapped characters.
99       *
100      * @param encounteredSymbol
101      *            the encountered symbol
102      *
103      * @return the mapped characters
104      */
105     public char[] getMappedCharacters(EncounteredSymbol encounteredSymbol) {
106         return symbolToChars.get(encounteredSymbol);
107     }
108 
109     /**
110      * Adds the mapping.
111      *
112      * @param symbol
113      *            the symbol
114      * @param character
115      *            the character
116      */
117     public void addMapping(EncounteredSymbol symbol, Character character) {
118         addMapping(symbol, new char[] { character });
119         if (symbol.isCheckForSimilarEscapeAndQuote()) {
120             storeCharacterForLaterComparison(symbol, character);
121         }
122     }
123 
124     /**
125      * Adds the mapping.
126      *
127      * @param symbol
128      *            the symbol
129      * @param characters
130      *            the characters
131      */
132     public void addMapping(EncounteredSymbol symbol, char[] characters) {
133         while (charToSymbol.values().remove(symbol)) {
134             // Looping until all symbols removed
135         }
136         for (Character character : characters) {
137             charToSymbol.put(character, symbol);
138         }
139         symbolToChars.put(symbol, characters);
140     }
141 
142     /**
143      * Log settings.
144      */
145     public void logSettings() {
146         if (settingsLogged) {
147             return;
148         }
149         LOG.info("- CSV config / skip comment lines? {}", isSkipCommentLines() ? "yes" : "no");
150         LOG.info("- CSV config / start line: {}", startLine);
151         for (Map.Entry<EncounteredSymbol, char[]> entry : symbolToChars.entrySet()) {
152             char[] characters = entry.getValue();
153             if (LOG.isInfoEnabled()) {
154                 LOG.info("- CSV config / Characters for {} {}", entry.getKey(), charactersToString(characters));
155             }
156         }
157         settingsLogged = true;
158     }
159 
160     /**
161      * Characters to string.
162      *
163      * @param characters
164      *            the characters
165      *
166      * @return the string
167      */
168     private String charactersToString(char[] characters) {
169         StringBuilder returnString = new StringBuilder();
170         for (char currentChar : characters) {
171             returnString.append(charToPrintable(currentChar));
172             returnString.append(" ");
173         }
174         return returnString.toString();
175     }
176 
177     /**
178      * Char to printable.
179      *
180      * @param character
181      *            the character
182      *
183      * @return the string
184      */
185     private String charToPrintable(char character) {
186         switch (character) {
187             case '\t':
188                 return "\\t";
189             case '\n':
190                 return "\\n";
191             case '\r':
192                 return "\\r";
193             default:
194                 return Character.toString(character);
195         }
196     }
197 
198     /**
199      * Store character for later comparison.
200      *
201      * @param symbol
202      *            the symbol
203      * @param character
204      *            the character
205      */
206     private void storeCharacterForLaterComparison(EncounteredSymbol symbol, Character character) {
207         if (symbol == ESCAPE_SYMBOL) {
208             escapeCharacter = character;
209         } else if (symbol == QUOTE_SYMBOL) {
210             quoteCharacter = character;
211         }
212     }
213 
214     /**
215      * Checks if is same characters for escape and quote.
216      *
217      * @return true, if is same characters for escape and quote
218      */
219     public boolean isSameCharactersForEscapeAndQuote() {
220         return escapeCharacter != null && quoteCharacter != null && escapeCharacter.equals(quoteCharacter);
221     }
222 
223     /**
224      * Find.
225      *
226      * @param character
227      *            the character
228      * @param parseState
229      *            the parse state
230      *
231      * @return the encountered symbol
232      */
233     public EncounteredSymbol find(int character, ParseState parseState) {
234         if (character == -1) {
235             return END_OF_FILE_SYMBOL;
236         }
237         EncounteredSymbol symbol = charToSymbol.get((char) character);
238         if (symbol == null) {
239             return OTHER_SYMBOL;
240         }
241         if (symbol == EOL_SYMBOL) {
242             if (acceptedEndOfLine == 0) {
243                 LOG.info("- Triggering EOL character: {}", character);
244                 acceptedEndOfLine = (char) character;
245             }
246             if (acceptedEndOfLine != character) {
247                 symbol = EOL_SYMBOL_TRASH;
248             }
249         }
250         if (symbol.isCheckForSimilarEscapeAndQuote() && isSameCharactersForEscapeAndQuote()) {
251             return parseState.isUpgradeQuoteToEscape() ? ESCAPE_SYMBOL : QUOTE_SYMBOL;
252         }
253         return symbol;
254     }
255 
256     /**
257      * Gets the start line.
258      *
259      * @return the start line
260      */
261     public int getStartLine() {
262         return startLine;
263     }
264 
265     /**
266      * Sets the start line.
267      *
268      * @param startLine
269      *            the new start line
270      */
271     public void setStartLine(int startLine) {
272         if (startLine == 0) {
273             throw new CsvException(new GeneralError("Row cannot be set at 0. Rows are 1-based"));
274         }
275         this.startLine = startLine;
276     }
277 
278     /**
279      * Checks if is skip comment lines.
280      *
281      * @return true, if is skip comment lines
282      */
283     public boolean isSkipCommentLines() {
284         return skipCommentLines;
285     }
286 
287     /**
288      * Sets the skip comment lines.
289      *
290      * @param skipCommentLines
291      *            the new skip comment lines
292      */
293     public void setSkipCommentLines(boolean skipCommentLines) {
294         this.skipCommentLines = skipCommentLines;
295     }
296 }