001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.csv; 018 019 import java.io.IOException; 020 import java.io.Reader; 021 import java.io.InputStreamReader; 022 import java.io.InputStream; 023 import java.util.ArrayList; 024 025 026 /** 027 * Parses CSV files according to the specified configuration. 028 * 029 * Because CSV appears in many different dialects, the parser supports many 030 * configuration settings by allowing the specification of a {@link CSVStrategy}. 031 * 032 * <p>Parsing of a csv-string having tabs as separators, 033 * '"' as an optional value encapsulator, and comments starting with '#':</p> 034 * <pre> 035 * String[][] data = 036 * (new CSVParser(new StringReader("a\tb\nc\td"), new CSVStrategy('\t','"','#'))).getAllValues(); 037 * </pre> 038 * 039 * <p>Parsing of a csv-string in Excel CSV format</p> 040 * <pre> 041 * String[][] data = 042 * (new CSVParser(new StringReader("a;b\nc;d"), CSVStrategy.EXCEL_STRATEGY)).getAllValues(); 043 * </pre> 044 * 045 * <p> 046 * Internal parser state is completely covered by the strategy 047 * and the reader-state.</p> 048 * 049 * <p>see <a href="package-summary.html">package documentation</a> 050 * for more details</p> 051 */ 052 public class CSVParser { 053 054 /** 055 * length of the initial token (content-)buffer 056 */ 057 private static final int INITIAL_TOKEN_LENGTH = 50; 058 059 // the token types 060 /** 061 * Token has no valid content, i.e. is in its initialized state. 062 */ 063 protected static final int TT_INVALID = -1; 064 /** 065 * Token with content, at beginning or in the middle of a line. 066 */ 067 protected static final int TT_TOKEN = 0; 068 /** 069 * Token (which can have content) when end of file is reached. 070 */ 071 protected static final int TT_EOF = 1; 072 /** 073 * Token with content when end of a line is reached. 074 */ 075 protected static final int TT_EORECORD = 2; 076 077 /** 078 * Immutable empty String array. 079 */ 080 private static final String[] EMPTY_STRING_ARRAY = new String[0]; 081 082 // the input stream 083 private final ExtendedBufferedReader in; 084 085 private final CSVStrategy strategy; 086 087 // the following objects are shared to reduce garbage 088 /** 089 * A record buffer for getLine(). Grows as necessary and is reused. 090 */ 091 private final ArrayList record = new ArrayList(); 092 private final Token reusableToken = new Token(); 093 private final CharBuffer wsBuf = new CharBuffer(); 094 private final CharBuffer code = new CharBuffer(4); 095 096 097 /** 098 * Token is an internal token representation. 099 * <p/> 100 * It is used as contract between the lexer and the parser. 101 */ 102 static class Token { 103 /** 104 * Token type, see TT_xxx constants. 105 */ 106 int type = TT_INVALID; 107 /** 108 * The content buffer. 109 */ 110 CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH); 111 /** 112 * Token ready flag: indicates a valid token with content (ready for the parser). 113 */ 114 boolean isReady; 115 116 Token reset() { 117 content.clear(); 118 type = TT_INVALID; 119 isReady = false; 120 return this; 121 } 122 } 123 124 // ====================================================== 125 // the constructor 126 // ====================================================== 127 128 /** 129 * Default strategy for the parser follows the default {@link CSVStrategy}. 130 * 131 * @param input an InputStream containing "csv-formatted" stream 132 * @deprecated use {@link #CSVParser(Reader)}. 133 */ 134 public CSVParser(InputStream input) { 135 this(new InputStreamReader(input)); 136 } 137 138 /** 139 * CSV parser using the default {@link CSVStrategy}. 140 * 141 * @param input a Reader containing "csv-formatted" input 142 */ 143 public CSVParser(Reader input) { 144 this(input, (CSVStrategy) CSVStrategy.DEFAULT_STRATEGY.clone()); 145 } 146 147 /** 148 * Customized value delimiter parser. 149 * <p/> 150 * The parser follows the default {@link CSVStrategy} 151 * except for the delimiter setting. 152 * 153 * @param input a Reader based on "csv-formatted" input 154 * @param delimiter a Char used for value separation 155 * @deprecated use {@link #CSVParser(Reader, CSVStrategy)}. 156 */ 157 public CSVParser(Reader input, char delimiter) { 158 this(input, delimiter, '"', CSVStrategy.COMMENTS_DISABLED); 159 } 160 161 /** 162 * Customized csv parser. 163 * <p/> 164 * The parser parses according to the given CSV dialect settings. 165 * Leading whitespaces are truncated, unicode escapes are 166 * not interpreted and empty lines are ignored. 167 * 168 * @param input a Reader based on "csv-formatted" input 169 * @param delimiter a Char used for value separation 170 * @param encapsulator a Char used as value encapsulation marker 171 * @param commentStart a Char used for comment identification 172 * @deprecated use {@link #CSVParser(Reader, CSVStrategy)}. 173 */ 174 public CSVParser(Reader input, char delimiter, char encapsulator, char commentStart) { 175 this(input, new CSVStrategy(delimiter, encapsulator, commentStart)); 176 } 177 178 /** 179 * Customized CSV parser using the given {@link CSVStrategy} 180 * 181 * @param input a Reader containing "csv-formatted" input 182 * @param strategy the CSVStrategy used for CSV parsing 183 */ 184 public CSVParser(Reader input, CSVStrategy strategy) { 185 this.in = new ExtendedBufferedReader(input); 186 this.strategy = strategy; 187 } 188 189 // ====================================================== 190 // the parser 191 // ====================================================== 192 193 /** 194 * Parses the CSV according to the given strategy 195 * and returns the content as an array of records 196 * (whereas records are arrays of single values). 197 * <p/> 198 * The returned content starts at the current parse-position in 199 * the stream. 200 * 201 * @return matrix of records x values ('null' when end of file) 202 * @throws IOException on parse error or input read-failure 203 */ 204 public String[][] getAllValues() throws IOException { 205 ArrayList records = new ArrayList(); 206 String[] values; 207 String[][] ret = null; 208 while ((values = getLine()) != null) { 209 records.add(values); 210 } 211 if (records.size() > 0) { 212 ret = new String[records.size()][]; 213 records.toArray(ret); 214 } 215 return ret; 216 } 217 218 /** 219 * Parses the CSV according to the given strategy 220 * and returns the next csv-value as string. 221 * 222 * @return next value in the input stream ('null' when end of file) 223 * @throws IOException on parse error or input read-failure 224 */ 225 public String nextValue() throws IOException { 226 Token tkn = nextToken(); 227 String ret = null; 228 switch (tkn.type) { 229 case TT_TOKEN: 230 case TT_EORECORD: 231 ret = tkn.content.toString(); 232 break; 233 case TT_EOF: 234 ret = null; 235 break; 236 case TT_INVALID: 237 default: 238 // error no token available (or error) 239 throw new IOException( 240 "(line " + getLineNumber() 241 + ") invalid parse sequence"); 242 // unreachable: break; 243 } 244 return ret; 245 } 246 247 /** 248 * Parses from the current point in the stream til 249 * the end of the current line. 250 * 251 * @return array of values til end of line 252 * ('null' when end of file has been reached) 253 * @throws IOException on parse error or input read-failure 254 */ 255 public String[] getLine() throws IOException { 256 String[] ret = EMPTY_STRING_ARRAY; 257 record.clear(); 258 while (true) { 259 reusableToken.reset(); 260 nextToken(reusableToken); 261 switch (reusableToken.type) { 262 case TT_TOKEN: 263 record.add(reusableToken.content.toString()); 264 break; 265 case TT_EORECORD: 266 record.add(reusableToken.content.toString()); 267 break; 268 case TT_EOF: 269 if (reusableToken.isReady) { 270 record.add(reusableToken.content.toString()); 271 } else { 272 ret = null; 273 } 274 break; 275 case TT_INVALID: 276 default: 277 // error: throw IOException 278 throw new IOException("(line " + getLineNumber() + ") invalid parse sequence"); 279 // unreachable: break; 280 } 281 if (reusableToken.type != TT_TOKEN) { 282 break; 283 } 284 } 285 if (!record.isEmpty()) { 286 ret = (String[]) record.toArray(new String[record.size()]); 287 } 288 return ret; 289 } 290 291 /** 292 * Returns the current line number in the input stream. 293 * <p/> 294 * ATTENTION: in case your csv has multiline-values the returned 295 * number does not correspond to the record-number 296 * 297 * @return current line number 298 */ 299 public int getLineNumber() { 300 return in.getLineNumber(); 301 } 302 303 // ====================================================== 304 // the lexer(s) 305 // ====================================================== 306 307 /** 308 * Convenience method for <code>nextToken(null)</code>. 309 */ 310 protected Token nextToken() throws IOException { 311 return nextToken(new Token()); 312 } 313 314 /** 315 * Returns the next token. 316 * <p/> 317 * A token corresponds to a term, a record change or an 318 * end-of-file indicator. 319 * 320 * @param tkn an existing Token object to reuse. The caller is responsible to initialize the 321 * Token. 322 * @return the next token found 323 * @throws IOException on stream access error 324 */ 325 protected Token nextToken(Token tkn) throws IOException { 326 wsBuf.clear(); // reuse 327 328 // get the last read char (required for empty line detection) 329 int lastChar = in.readAgain(); 330 331 // read the next char and set eol 332 /* note: unfortunately isEndOfLine may consumes a character silently. 333 * this has no effect outside of the method. so a simple workaround 334 * is to call 'readAgain' on the stream... 335 * uh: might using objects instead of base-types (jdk1.5 autoboxing!) 336 */ 337 int c = in.read(); 338 boolean eol = isEndOfLine(c); 339 c = in.readAgain(); 340 341 // empty line detection: eol AND (last char was EOL or beginning) 342 while (strategy.getIgnoreEmptyLines() && eol 343 && (lastChar == '\n' 344 || lastChar == '\r' 345 || lastChar == ExtendedBufferedReader.UNDEFINED) 346 && !isEndOfFile(lastChar)) { 347 // go on char ahead ... 348 lastChar = c; 349 c = in.read(); 350 eol = isEndOfLine(c); 351 c = in.readAgain(); 352 // reached end of file without any content (empty line at the end) 353 if (isEndOfFile(c)) { 354 tkn.type = TT_EOF; 355 return tkn; 356 } 357 } 358 359 // did we reach eof during the last iteration already ? TT_EOF 360 if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) { 361 tkn.type = TT_EOF; 362 return tkn; 363 } 364 365 // important: make sure a new char gets consumed in each iteration 366 while (!tkn.isReady && tkn.type != TT_EOF) { 367 // ignore whitespaces at beginning of a token 368 while (strategy.getIgnoreLeadingWhitespaces() && isWhitespace(c) && !eol) { 369 wsBuf.append((char) c); 370 c = in.read(); 371 eol = isEndOfLine(c); 372 } 373 // ok, start of token reached: comment, encapsulated, or token 374 if (c == strategy.getCommentStart()) { 375 // ignore everything till end of line and continue (incr linecount) 376 in.readLine(); 377 tkn = nextToken(tkn.reset()); 378 } else if (c == strategy.getDelimiter()) { 379 // empty token return TT_TOKEN("") 380 tkn.type = TT_TOKEN; 381 tkn.isReady = true; 382 } else if (eol) { 383 // empty token return TT_EORECORD("") 384 //noop: tkn.content.append(""); 385 tkn.type = TT_EORECORD; 386 tkn.isReady = true; 387 } else if (c == strategy.getEncapsulator()) { 388 // consume encapsulated token 389 encapsulatedTokenLexer(tkn, c); 390 } else if (isEndOfFile(c)) { 391 // end of file return TT_EOF() 392 //noop: tkn.content.append(""); 393 tkn.type = TT_EOF; 394 tkn.isReady = true; 395 } else { 396 // next token must be a simple token 397 // add removed blanks when not ignoring whitespace chars... 398 if (!strategy.getIgnoreLeadingWhitespaces()) { 399 tkn.content.append(wsBuf); 400 } 401 simpleTokenLexer(tkn, c); 402 } 403 } 404 return tkn; 405 } 406 407 /** 408 * A simple token lexer 409 * <p/> 410 * Simple token are tokens which are not surrounded by encapsulators. 411 * A simple token might contain escaped delimiters (as \, or \;). The 412 * token is finished when one of the following conditions become true: 413 * <ul> 414 * <li>end of line has been reached (TT_EORECORD)</li> 415 * <li>end of stream has been reached (TT_EOF)</li> 416 * <li>an unescaped delimiter has been reached (TT_TOKEN)</li> 417 * </ul> 418 * 419 * @param tkn the current token 420 * @param c the current character 421 * @return the filled token 422 * @throws IOException on stream access error 423 */ 424 private Token simpleTokenLexer(Token tkn, int c) throws IOException { 425 for (; ;) { 426 if (isEndOfLine(c)) { 427 // end of record 428 tkn.type = TT_EORECORD; 429 tkn.isReady = true; 430 break; 431 } else if (isEndOfFile(c)) { 432 // end of file 433 tkn.type = TT_EOF; 434 tkn.isReady = true; 435 break; 436 } else if (c == strategy.getDelimiter()) { 437 // end of token 438 tkn.type = TT_TOKEN; 439 tkn.isReady = true; 440 break; 441 } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') { 442 // interpret unicode escaped chars (like \u0070 -> p) 443 tkn.content.append((char) unicodeEscapeLexer(c)); 444 } else if (c == strategy.getEscape()) { 445 tkn.content.append((char) readEscape(c)); 446 } else { 447 tkn.content.append((char) c); 448 } 449 450 c = in.read(); 451 } 452 453 if (strategy.getIgnoreTrailingWhitespaces()) { 454 tkn.content.trimTrailingWhitespace(); 455 } 456 457 return tkn; 458 } 459 460 461 /** 462 * An encapsulated token lexer 463 * <p/> 464 * Encapsulated tokens are surrounded by the given encapsulating-string. 465 * The encapsulator itself might be included in the token using a 466 * doubling syntax (as "", '') or using escaping (as in \", \'). 467 * Whitespaces before and after an encapsulated token are ignored. 468 * 469 * @param tkn the current token 470 * @param c the current character 471 * @return a valid token object 472 * @throws IOException on invalid state 473 */ 474 private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException { 475 // save current line 476 int startLineNumber = getLineNumber(); 477 // ignore the given delimiter 478 // assert c == delimiter; 479 for (; ;) { 480 c = in.read(); 481 482 if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') { 483 tkn.content.append((char) unicodeEscapeLexer(c)); 484 } else if (c == strategy.getEscape()) { 485 tkn.content.append((char) readEscape(c)); 486 } else if (c == strategy.getEncapsulator()) { 487 if (in.lookAhead() == strategy.getEncapsulator()) { 488 // double or escaped encapsulator -> add single encapsulator to token 489 c = in.read(); 490 tkn.content.append((char) c); 491 } else { 492 // token finish mark (encapsulator) reached: ignore whitespace till delimiter 493 for (; ;) { 494 c = in.read(); 495 if (c == strategy.getDelimiter()) { 496 tkn.type = TT_TOKEN; 497 tkn.isReady = true; 498 return tkn; 499 } else if (isEndOfFile(c)) { 500 tkn.type = TT_EOF; 501 tkn.isReady = true; 502 return tkn; 503 } else if (isEndOfLine(c)) { 504 // ok eo token reached 505 tkn.type = TT_EORECORD; 506 tkn.isReady = true; 507 return tkn; 508 } else if (!isWhitespace(c)) { 509 // error invalid char between token and next delimiter 510 throw new IOException( 511 "(line " + getLineNumber() 512 + ") invalid char between encapsulated token end delimiter" 513 ); 514 } 515 } 516 } 517 } else if (isEndOfFile(c)) { 518 // error condition (end of file before end of token) 519 throw new IOException( 520 "(startline " + startLineNumber + ")" 521 + "eof reached before encapsulated token finished" 522 ); 523 } else { 524 // consume character 525 tkn.content.append((char) c); 526 } 527 } 528 } 529 530 531 /** 532 * Decodes Unicode escapes. 533 * <p/> 534 * Interpretation of "\\uXXXX" escape sequences 535 * where XXXX is a hex-number. 536 * 537 * @param c current char which is discarded because it's the "\\" of "\\uXXXX" 538 * @return the decoded character 539 * @throws IOException on wrong unicode escape sequence or read error 540 */ 541 protected int unicodeEscapeLexer(int c) throws IOException { 542 int ret = 0; 543 // ignore 'u' (assume c==\ now) and read 4 hex digits 544 c = in.read(); 545 code.clear(); 546 try { 547 for (int i = 0; i < 4; i++) { 548 c = in.read(); 549 if (isEndOfFile(c) || isEndOfLine(c)) { 550 throw new NumberFormatException("number too short"); 551 } 552 code.append((char) c); 553 } 554 ret = Integer.parseInt(code.toString(), 16); 555 } catch (NumberFormatException e) { 556 throw new IOException( 557 "(line " + getLineNumber() + ") Wrong unicode escape sequence found '" 558 + code.toString() + "'" + e.toString()); 559 } 560 return ret; 561 } 562 563 private int readEscape(int c) throws IOException { 564 // assume c is the escape char (normally a backslash) 565 c = in.read(); 566 int out; 567 switch (c) { 568 case 'r': 569 out = '\r'; 570 break; 571 case 'n': 572 out = '\n'; 573 break; 574 case 't': 575 out = '\t'; 576 break; 577 case 'b': 578 out = '\b'; 579 break; 580 case 'f': 581 out = '\f'; 582 break; 583 default: 584 out = c; 585 } 586 return out; 587 } 588 589 // ====================================================== 590 // strategies 591 // ====================================================== 592 593 /** 594 * Obtain the specified CSV Strategy. This should not be modified. 595 * 596 * @return strategy currently being used 597 */ 598 public CSVStrategy getStrategy() { 599 return this.strategy; 600 } 601 602 // ====================================================== 603 // Character class checker 604 // ====================================================== 605 606 /** 607 * @return true if the given char is a whitespace character 608 */ 609 private boolean isWhitespace(int c) { 610 return Character.isWhitespace((char) c) && (c != strategy.getDelimiter()); 611 } 612 613 /** 614 * Greedy - accepts \n, \r and \r\n 615 * This checker consumes silently the second control-character... 616 * 617 * @return true if the given character is a line-terminator 618 */ 619 private boolean isEndOfLine(int c) throws IOException { 620 // check if we have \r\n... 621 if (c == '\r') { 622 if (in.lookAhead() == '\n') { 623 // note: does not change c outside of this method !! 624 c = in.read(); 625 } 626 } 627 return (c == '\n' || c == '\r'); 628 } 629 630 /** 631 * @return true if the given character indicates end of file 632 */ 633 private boolean isEndOfFile(int c) { 634 return c == ExtendedBufferedReader.END_OF_STREAM; 635 } 636 }