001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.csv;
018    
019    import java.io.IOException;
020    import java.io.Reader;
021    import java.io.InputStreamReader;
022    import java.io.InputStream;
023    import java.util.ArrayList;
024    
025    
026    /**
027     * Parses CSV files according to the specified configuration.
028     *
029     * Because CSV appears in many different dialects, the parser supports many
030     * configuration settings by allowing the specification of a {@link CSVStrategy}.
031     *
032     * <p>Parsing of a csv-string having tabs as separators,
033     * '"' as an optional value encapsulator, and comments starting with '#':</p>
034     * <pre>
035     *  String[][] data =
036     *   (new CSVParser(new StringReader("a\tb\nc\td"), new CSVStrategy('\t','"','#'))).getAllValues();
037     * </pre>
038     *
039     * <p>Parsing of a csv-string in Excel CSV format</p>
040     * <pre>
041     *  String[][] data =
042     *   (new CSVParser(new StringReader("a;b\nc;d"), CSVStrategy.EXCEL_STRATEGY)).getAllValues();
043     * </pre>
044     *
045     * <p>
046     * Internal parser state is completely covered by the strategy
047     * and the reader-state.</p>
048     *
049     * <p>see <a href="package-summary.html">package documentation</a>
050     * for more details</p>
051     */
052    public class CSVParser {
053    
054        /**
055         * length of the initial token (content-)buffer
056         */
057        private static final int INITIAL_TOKEN_LENGTH = 50;
058    
059        // the token types
060        /**
061         * Token has no valid content, i.e. is in its initialized state.
062         */
063        protected static final int TT_INVALID = -1;
064        /**
065         * Token with content, at beginning or in the middle of a line.
066         */
067        protected static final int TT_TOKEN = 0;
068        /**
069         * Token (which can have content) when end of file is reached.
070         */
071        protected static final int TT_EOF = 1;
072        /**
073         * Token with content when end of a line is reached.
074         */
075        protected static final int TT_EORECORD = 2;
076    
077        /**
078         * Immutable empty String array.
079         */
080        private static final String[] EMPTY_STRING_ARRAY = new String[0];
081    
082        // the input stream
083        private final ExtendedBufferedReader in;
084    
085        private final CSVStrategy strategy;
086    
087        // the following objects are shared to reduce garbage
088        /**
089         * A record buffer for getLine(). Grows as necessary and is reused.
090         */
091        private final ArrayList record = new ArrayList();
092        private final Token reusableToken = new Token();
093        private final CharBuffer wsBuf = new CharBuffer();
094        private final CharBuffer code = new CharBuffer(4);
095    
096    
097        /**
098         * Token is an internal token representation.
099         * <p/>
100         * It is used as contract between the lexer and the parser.
101         */
102        static class Token {
103            /**
104             * Token type, see TT_xxx constants.
105             */
106            int type = TT_INVALID;
107            /**
108             * The content buffer.
109             */
110            CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
111            /**
112             * Token ready flag: indicates a valid token with content (ready for the parser).
113             */
114            boolean isReady;
115    
116            Token reset() {
117                content.clear();
118                type = TT_INVALID;
119                isReady = false;
120                return this;
121            }
122        }
123    
124        // ======================================================
125        //  the constructor
126        // ======================================================
127    
128        /**
129         * Default strategy for the parser follows the default {@link CSVStrategy}.
130         *
131         * @param input an InputStream containing "csv-formatted" stream
132         * @deprecated use {@link #CSVParser(Reader)}.
133         */
134        public CSVParser(InputStream input) {
135            this(new InputStreamReader(input));
136        }
137    
138        /**
139         * CSV parser using the default {@link CSVStrategy}.
140         *
141         * @param input a Reader containing "csv-formatted" input
142         */
143        public CSVParser(Reader input) {
144            this(input, (CSVStrategy) CSVStrategy.DEFAULT_STRATEGY.clone());
145        }
146    
147        /**
148         * Customized value delimiter parser.
149         * <p/>
150         * The parser follows the default {@link CSVStrategy}
151         * except for the delimiter setting.
152         *
153         * @param input     a Reader based on "csv-formatted" input
154         * @param delimiter a Char used for value separation
155         * @deprecated use {@link #CSVParser(Reader, CSVStrategy)}.
156         */
157        public CSVParser(Reader input, char delimiter) {
158            this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED);
159        }
160    
161        /**
162         * Customized csv parser.
163         * <p/>
164         * The parser parses according to the given CSV dialect settings.
165         * Leading whitespaces are truncated, unicode escapes are
166         * not interpreted and empty lines are ignored.
167         *
168         * @param input        a Reader based on "csv-formatted" input
169         * @param delimiter    a Char used for value separation
170         * @param encapsulator a Char used as value encapsulation marker
171         * @param commentStart a Char used for comment identification
172         * @deprecated use {@link #CSVParser(Reader, CSVStrategy)}.
173         */
174        public CSVParser(Reader input, char delimiter, char encapsulator, char commentStart) {
175            this(input, new CSVStrategy(delimiter, encapsulator, commentStart));
176        }
177    
178        /**
179         * Customized CSV parser using the given {@link CSVStrategy}
180         *
181         * @param input    a Reader containing "csv-formatted" input
182         * @param strategy the CSVStrategy used for CSV parsing
183         */
184        public CSVParser(Reader input, CSVStrategy strategy) {
185            this.in = new ExtendedBufferedReader(input);
186            this.strategy = strategy;
187        }
188    
189        // ======================================================
190        //  the parser
191        // ======================================================
192    
193        /**
194         * Parses the CSV according to the given strategy
195         * and returns the content as an array of records
196         * (whereas records are arrays of single values).
197         * <p/>
198         * The returned content starts at the current parse-position in
199         * the stream.
200         *
201         * @return matrix of records x values ('null' when end of file)
202         * @throws IOException on parse error or input read-failure
203         */
204        public String[][] getAllValues() throws IOException {
205            ArrayList records = new ArrayList();
206            String[] values;
207            String[][] ret = null;
208            while ((values = getLine()) != null) {
209                records.add(values);
210            }
211            if (records.size() > 0) {
212                ret = new String[records.size()][];
213                records.toArray(ret);
214            }
215            return ret;
216        }
217    
218        /**
219         * Parses the CSV according to the given strategy
220         * and returns the next csv-value as string.
221         *
222         * @return next value in the input stream ('null' when end of file)
223         * @throws IOException on parse error or input read-failure
224         */
225        public String nextValue() throws IOException {
226            Token tkn = nextToken();
227            String ret = null;
228            switch (tkn.type) {
229                case TT_TOKEN:
230                case TT_EORECORD:
231                    ret = tkn.content.toString();
232                    break;
233                case TT_EOF:
234                    ret = null;
235                    break;
236                case TT_INVALID:
237                default:
238                    // error no token available (or error)
239                    throw new IOException(
240                            "(line " + getLineNumber()
241                                    + ") invalid parse sequence");
242                    // unreachable: break;
243            }
244            return ret;
245        }
246    
247        /**
248         * Parses from the current point in the stream til
249         * the end of the current line.
250         *
251         * @return array of values til end of line
252         *         ('null' when end of file has been reached)
253         * @throws IOException on parse error or input read-failure
254         */
255        public String[] getLine() throws IOException {
256            String[] ret = EMPTY_STRING_ARRAY;
257            record.clear();
258            while (true) {
259                reusableToken.reset();
260                nextToken(reusableToken);
261                switch (reusableToken.type) {
262                    case TT_TOKEN:
263                        record.add(reusableToken.content.toString());
264                        break;
265                    case TT_EORECORD:
266                        record.add(reusableToken.content.toString());
267                        break;
268                    case TT_EOF:
269                        if (reusableToken.isReady) {
270                            record.add(reusableToken.content.toString());
271                        } else {
272                            ret = null;
273                        }
274                        break;
275                    case TT_INVALID:
276                    default:
277                        // error: throw IOException
278                        throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
279                        // unreachable: break;
280                }
281                if (reusableToken.type != TT_TOKEN) {
282                    break;
283                }
284            }
285            if (!record.isEmpty()) {
286                ret = (String[]) record.toArray(new String[record.size()]);
287            }
288            return ret;
289        }
290    
291        /**
292         * Returns the current line number in the input stream.
293         * <p/>
294         * ATTENTION: in case your csv has multiline-values the returned
295         * number does not correspond to the record-number
296         *
297         * @return current line number
298         */
299        public int getLineNumber() {
300            return in.getLineNumber();
301        }
302    
303        // ======================================================
304        //  the lexer(s)
305        // ======================================================
306    
307        /**
308         * Convenience method for <code>nextToken(null)</code>.
309         */
310        protected Token nextToken() throws IOException {
311            return nextToken(new Token());
312        }
313    
314        /**
315         * Returns the next token.
316         * <p/>
317         * A token corresponds to a term, a record change or an
318         * end-of-file indicator.
319         *
320         * @param tkn an existing Token object to reuse. The caller is responsible to initialize the
321         *            Token.
322         * @return the next token found
323         * @throws IOException on stream access error
324         */
325        protected Token nextToken(Token tkn) throws IOException {
326            wsBuf.clear(); // reuse
327    
328            // get the last read char (required for empty line detection)
329            int lastChar = in.readAgain();
330    
331            //  read the next char and set eol
332            /* note: unfortunately isEndOfLine may consumes a character silently.
333            *       this has no effect outside of the method. so a simple workaround
334            *       is to call 'readAgain' on the stream...
335            *       uh: might using objects instead of base-types (jdk1.5 autoboxing!)
336            */
337            int c = in.read();
338            boolean eol = isEndOfLine(c);
339            c = in.readAgain();
340    
341            //  empty line detection: eol AND (last char was EOL or beginning)
342            while (strategy.getIgnoreEmptyLines() && eol
343                    && (lastChar == '\n'
344                    || lastChar == '\r'
345                    || lastChar == ExtendedBufferedReader.UNDEFINED)
346                    && !isEndOfFile(lastChar)) {
347                // go on char ahead ...
348                lastChar = c;
349                c = in.read();
350                eol = isEndOfLine(c);
351                c = in.readAgain();
352                // reached end of file without any content (empty line at the end)
353                if (isEndOfFile(c)) {
354                    tkn.type = TT_EOF;
355                    return tkn;
356                }
357            }
358    
359            // did we reach eof during the last iteration already ? TT_EOF
360            if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {
361                tkn.type = TT_EOF;
362                return tkn;
363            }
364    
365            //  important: make sure a new char gets consumed in each iteration
366            while (!tkn.isReady && tkn.type != TT_EOF) {
367                // ignore whitespaces at beginning of a token
368                while (strategy.getIgnoreLeadingWhitespaces() && isWhitespace(c) && !eol) {
369                    wsBuf.append((char) c);
370                    c = in.read();
371                    eol = isEndOfLine(c);
372                }
373                // ok, start of token reached: comment, encapsulated, or token
374                if (c == strategy.getCommentStart()) {
375                    // ignore everything till end of line and continue (incr linecount)
376                    in.readLine();
377                    tkn = nextToken(tkn.reset());
378                } else if (c == strategy.getDelimiter()) {
379                    // empty token return TT_TOKEN("")
380                    tkn.type = TT_TOKEN;
381                    tkn.isReady = true;
382                } else if (eol) {
383                    // empty token return TT_EORECORD("")
384                    //noop: tkn.content.append("");
385                    tkn.type = TT_EORECORD;
386                    tkn.isReady = true;
387                } else if (c == strategy.getEncapsulator()) {
388                    // consume encapsulated token
389                    encapsulatedTokenLexer(tkn, c);
390                } else if (isEndOfFile(c)) {
391                    // end of file return TT_EOF()
392                    //noop: tkn.content.append("");
393                    tkn.type = TT_EOF;
394                    tkn.isReady = true;
395                } else {
396                    // next token must be a simple token
397                    // add removed blanks when not ignoring whitespace chars...
398                    if (!strategy.getIgnoreLeadingWhitespaces()) {
399                        tkn.content.append(wsBuf);
400                    }
401                    simpleTokenLexer(tkn, c);
402                }
403            }
404            return tkn;
405        }
406    
407        /**
408         * A simple token lexer
409         * <p/>
410         * Simple token are tokens which are not surrounded by encapsulators.
411         * A simple token might contain escaped delimiters (as \, or \;). The
412         * token is finished when one of the following conditions become true:
413         * <ul>
414         * <li>end of line has been reached (TT_EORECORD)</li>
415         * <li>end of stream has been reached (TT_EOF)</li>
416         * <li>an unescaped delimiter has been reached (TT_TOKEN)</li>
417         * </ul>
418         *
419         * @param tkn the current token
420         * @param c   the current character
421         * @return the filled token
422         * @throws IOException on stream access error
423         */
424        private Token simpleTokenLexer(Token tkn, int c) throws IOException {
425            for (; ;) {
426                if (isEndOfLine(c)) {
427                    // end of record
428                    tkn.type = TT_EORECORD;
429                    tkn.isReady = true;
430                    break;
431                } else if (isEndOfFile(c)) {
432                    // end of file
433                    tkn.type = TT_EOF;
434                    tkn.isReady = true;
435                    break;
436                } else if (c == strategy.getDelimiter()) {
437                    // end of token
438                    tkn.type = TT_TOKEN;
439                    tkn.isReady = true;
440                    break;
441                } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
442                    // interpret unicode escaped chars (like \u0070 -> p)
443                    tkn.content.append((char) unicodeEscapeLexer(c));
444                } else if (c == strategy.getEscape()) {
445                    tkn.content.append((char) readEscape(c));
446                } else {
447                    tkn.content.append((char) c);
448                }
449    
450                c = in.read();
451            }
452    
453            if (strategy.getIgnoreTrailingWhitespaces()) {
454                tkn.content.trimTrailingWhitespace();
455            }
456    
457            return tkn;
458        }
459    
460    
461        /**
462         * An encapsulated token lexer
463         * <p/>
464         * Encapsulated tokens are surrounded by the given encapsulating-string.
465         * The encapsulator itself might be included in the token using a
466         * doubling syntax (as "", '') or using escaping (as in \", \').
467         * Whitespaces before and after an encapsulated token are ignored.
468         *
469         * @param tkn the current token
470         * @param c   the current character
471         * @return a valid token object
472         * @throws IOException on invalid state
473         */
474        private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
475            // save current line
476            int startLineNumber = getLineNumber();
477            // ignore the given delimiter
478            // assert c == delimiter;
479            for (; ;) {
480                c = in.read();
481    
482                if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
483                    tkn.content.append((char) unicodeEscapeLexer(c));
484                } else if (c == strategy.getEscape()) {
485                    tkn.content.append((char) readEscape(c));
486                } else if (c == strategy.getEncapsulator()) {
487                    if (in.lookAhead() == strategy.getEncapsulator()) {
488                        // double or escaped encapsulator -> add single encapsulator to token
489                        c = in.read();
490                        tkn.content.append((char) c);
491                    } else {
492                        // token finish mark (encapsulator) reached: ignore whitespace till delimiter
493                        for (; ;) {
494                            c = in.read();
495                            if (c == strategy.getDelimiter()) {
496                                tkn.type = TT_TOKEN;
497                                tkn.isReady = true;
498                                return tkn;
499                            } else if (isEndOfFile(c)) {
500                                tkn.type = TT_EOF;
501                                tkn.isReady = true;
502                                return tkn;
503                            } else if (isEndOfLine(c)) {
504                                // ok eo token reached
505                                tkn.type = TT_EORECORD;
506                                tkn.isReady = true;
507                                return tkn;
508                            } else if (!isWhitespace(c)) {
509                                // error invalid char between token and next delimiter
510                                throw new IOException(
511                                        "(line " + getLineNumber()
512                                                + ") invalid char between encapsulated token end delimiter"
513                                );
514                            }
515                        }
516                    }
517                } else if (isEndOfFile(c)) {
518                    // error condition (end of file before end of token)
519                    throw new IOException(
520                            "(startline " + startLineNumber + ")"
521                                    + "eof reached before encapsulated token finished"
522                    );
523                } else {
524                    // consume character
525                    tkn.content.append((char) c);
526                }
527            }
528        }
529    
530    
531        /**
532         * Decodes Unicode escapes.
533         * <p/>
534         * Interpretation of "\\uXXXX" escape sequences
535         * where XXXX is a hex-number.
536         *
537         * @param c current char which is discarded because it's the "\\" of "\\uXXXX"
538         * @return the decoded character
539         * @throws IOException on wrong unicode escape sequence or read error
540         */
541        protected int unicodeEscapeLexer(int c) throws IOException {
542            int ret = 0;
543            // ignore 'u' (assume c==\ now) and read 4 hex digits
544            c = in.read();
545            code.clear();
546            try {
547                for (int i = 0; i < 4; i++) {
548                    c = in.read();
549                    if (isEndOfFile(c) || isEndOfLine(c)) {
550                        throw new NumberFormatException("number too short");
551                    }
552                    code.append((char) c);
553                }
554                ret = Integer.parseInt(code.toString(), 16);
555            } catch (NumberFormatException e) {
556                throw new IOException(
557                        "(line " + getLineNumber() + ") Wrong unicode escape sequence found '"
558                                + code.toString() + "'" + e.toString());
559            }
560            return ret;
561        }
562    
563        private int readEscape(int c) throws IOException {
564            // assume c is the escape char (normally a backslash)
565            c = in.read();
566            int out;
567            switch (c) {
568                case 'r':
569                    out = '\r';
570                    break;
571                case 'n':
572                    out = '\n';
573                    break;
574                case 't':
575                    out = '\t';
576                    break;
577                case 'b':
578                    out = '\b';
579                    break;
580                case 'f':
581                    out = '\f';
582                    break;
583                default:
584                    out = c;
585            }
586            return out;
587        }
588    
589        // ======================================================
590        //  strategies
591        // ======================================================
592    
593        /**
594         * Obtain the specified CSV Strategy.  This should not be modified.
595         *
596         * @return strategy currently being used
597         */
598        public CSVStrategy getStrategy() {
599            return this.strategy;
600        }
601    
602        // ======================================================
603        //  Character class checker
604        // ======================================================
605    
606        /**
607         * @return true if the given char is a whitespace character
608         */
609        private boolean isWhitespace(int c) {
610            return Character.isWhitespace((char) c) && (c != strategy.getDelimiter());
611        }
612    
613        /**
614         * Greedy - accepts \n, \r and \r\n
615         * This checker consumes silently the second control-character...
616         *
617         * @return true if the given character is a line-terminator
618         */
619        private boolean isEndOfLine(int c) throws IOException {
620            // check if we have \r\n...
621            if (c == '\r') {
622                if (in.lookAhead() == '\n') {
623                    // note: does not change c outside of this method !!
624                    c = in.read();
625                }
626            }
627            return (c == '\n' || c == '\r');
628        }
629    
630        /**
631         * @return true if the given character indicates end of file
632         */
633        private boolean isEndOfFile(int c) {
634            return c == ExtendedBufferedReader.END_OF_STREAM;
635        }
636    }