001 /* 002 * $Id: CharsetToolkit.java 4112 2006-10-13 13:21:25Z blackdrag $ 003 * 004 * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved. 005 * 006 * Redistribution and use of this software and associated documentation 007 * ("Software"), with or without modification, are permitted provided that the 008 * following conditions are met: 009 * 1. Redistributions of source code must retain copyright statements and 010 * notices. Redistributions must also contain a copy of this document. 011 * 2. Redistributions in binary form must reproduce the above copyright 012 * notice, this list of conditions and the following disclaimer in the 013 * documentation and/or other materials provided with the distribution. 014 * 3. The name "groovy" must not be used to endorse or promote products 015 * derived from this Software without prior written permission of The Codehaus. 016 * For written permission, please contact info@codehaus.org. 017 * 4. Products derived from this Software may not be called "groovy" nor may 018 * "groovy" appear in their names without prior written permission of The 019 * Codehaus. "groovy" is a registered trademark of The Codehaus. 020 * 5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/ 021 * 022 * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY 023 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 024 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 025 * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR 026 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 027 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 028 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 029 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 030 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 031 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 032 * DAMAGE. 033 * 034 */ 035 036 package groovy.util; 037 038 import java.io.*; 039 import java.nio.charset.Charset; 040 import java.util.*; 041 042 /** 043 * <p>Utility class to guess the encoding of a given text file.</p> 044 * 045 * <p>Unicode files encoded in UTF-16 (low or big endian) or UTF-8 files 046 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer 047 * is wide enough, the charset should also be discovered.</p> 048 * 049 * <p>A byte buffer of 4KB is usually sufficient to be able to guess the encoding.</p> 050 * 051 * <p>Usage:</p> 052 * <pre> 053 * // guess the encoding 054 * Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096); 055 * 056 * // create a reader with the correct charset 057 * CharsetToolkit toolkit = new CharsetToolkit(file); 058 * BufferedReader reader = toolkit.getReader(); 059 * 060 * // read the file content 061 * String line; 062 * while ((line = br.readLine())!= null) 063 * { 064 * System.out.println(line); 065 * } 066 * </pre> 067 * 068 * @author Guillaume Laforge 069 */ 070 public class CharsetToolkit { 071 private byte[] buffer; 072 private Charset defaultCharset; 073 private Charset charset; 074 private boolean enforce8Bit = true; 075 private File file; 076 077 /** 078 * Constructor of the <code>CharsetToolkit</code> utility class. 079 * 080 * @param file of which we want to know the encoding. 081 */ 082 public CharsetToolkit(File file) throws IOException { 083 this.file = file; 084 this.defaultCharset = getDefaultSystemCharset(); 085 this.charset = null; 086 InputStream input = new FileInputStream(file); 087 try { 088 byte[] bytes = new byte[4096]; 089 int bytesRead = input.read(bytes); 090 if (bytesRead == -1) { 091 this.buffer = new byte[0]; 092 } 093 else if (bytesRead < 4096) { 094 byte[] bytesToGuess = new byte[bytesRead]; 095 System.arraycopy(bytes, 0, bytesToGuess, 0, bytesRead); 096 this.buffer = bytesToGuess; 097 } 098 else { 099 this.buffer = bytes; 100 } 101 } finally { 102 try {input.close();} catch (IOException e){} 103 } 104 } 105 106 /** 107 * Defines the default <code>Charset</code> used in case the buffer represents 108 * an 8-bit <code>Charset</code>. 109 * 110 * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code> 111 * if an 8-bit <code>Charset</code> is encountered. 112 */ 113 public void setDefaultCharset(Charset defaultCharset) { 114 if (defaultCharset != null) 115 this.defaultCharset = defaultCharset; 116 else 117 this.defaultCharset = getDefaultSystemCharset(); 118 } 119 120 public Charset getCharset() { 121 if (this.charset == null) 122 this.charset = guessEncoding(); 123 return charset; 124 } 125 126 /** 127 * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII. 128 * It might be a file without any special character in the range 128-255, but that may be or become 129 * a file encoded with the default <code>charset</code> rather than US-ASCII. 130 * 131 * @param enforce a boolean specifying the use or not of US-ASCII. 132 */ 133 public void setEnforce8Bit(boolean enforce) { 134 this.enforce8Bit = enforce; 135 } 136 137 /** 138 * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding. 139 * 140 * @return a boolean representing the flag of use of US-ASCII. 141 */ 142 public boolean getEnforce8Bit() { 143 return this.enforce8Bit; 144 } 145 146 /** 147 * Retrieves the default Charset 148 */ 149 public Charset getDefaultCharset() { 150 return defaultCharset; 151 } 152 153 /** 154 * <p>Guess the encoding of the provided buffer.</p> 155 * If Byte Order Markers are encountered at the beginning of the buffer, we immidiately 156 * return the charset implied by this BOM. Otherwise, the file would not be a human 157 * readable text file.</p> 158 * 159 * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not. 160 * If it is not UTF-8, we assume the encoding is the default system encoding 161 * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p> 162 * 163 * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p> 164 * <pre> 165 * UCS-4 range (hex.) UTF-8 octet sequence (binary) 166 * 0000 0000-0000 007F 0xxxxxxx 167 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 168 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 169 * 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 170 * 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 171 * 0400 0000-7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 172 * </pre> 173 * <p>With UTF-8, 0xFE and 0xFF never appear.</p> 174 * 175 * @return the Charset recognized. 176 */ 177 private Charset guessEncoding() { 178 // if the file has a Byte Order Marker, we can assume the file is in UTF-xx 179 // otherwise, the file would not be human readable 180 if (hasUTF8Bom()) 181 return Charset.forName("UTF-8"); 182 if (hasUTF16LEBom()) 183 return Charset.forName("UTF-16LE"); 184 if (hasUTF16BEBom()) 185 return Charset.forName("UTF-16BE"); 186 187 // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding 188 // otherwise, the file is in US-ASCII 189 boolean highOrderBit = false; 190 191 // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid 192 // if it's not the case, we can assume the encoding is the default encoding of the system 193 boolean validU8Char = true; 194 195 // TODO the buffer is not read up to the end, but up to length - 6 196 197 int length = buffer.length; 198 int i = 0; 199 while (i < length - 6) { 200 byte b0 = buffer[i]; 201 byte b1 = buffer[i + 1]; 202 byte b2 = buffer[i + 2]; 203 byte b3 = buffer[i + 3]; 204 byte b4 = buffer[i + 4]; 205 byte b5 = buffer[i + 5]; 206 if (b0 < 0) { 207 // a high order bit was encountered, thus the encoding is not US-ASCII 208 // it may be either an 8-bit encoding or UTF-8 209 highOrderBit = true; 210 // a two-bytes sequence was encoutered 211 if (isTwoBytesSequence(b0)) { 212 // there must be one continuation byte of the form 10xxxxxx, 213 // otherwise the following characteris is not a valid UTF-8 construct 214 if (!isContinuationChar(b1)) 215 validU8Char = false; 216 else 217 i++; 218 } 219 // a three-bytes sequence was encoutered 220 else if (isThreeBytesSequence(b0)) { 221 // there must be two continuation bytes of the form 10xxxxxx, 222 // otherwise the following characteris is not a valid UTF-8 construct 223 if (!(isContinuationChar(b1) && isContinuationChar(b2))) 224 validU8Char = false; 225 else 226 i += 2; 227 } 228 // a four-bytes sequence was encoutered 229 else if (isFourBytesSequence(b0)) { 230 // there must be three continuation bytes of the form 10xxxxxx, 231 // otherwise the following characteris is not a valid UTF-8 construct 232 if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3))) 233 validU8Char = false; 234 else 235 i += 3; 236 } 237 // a five-bytes sequence was encoutered 238 else if (isFiveBytesSequence(b0)) { 239 // there must be four continuation bytes of the form 10xxxxxx, 240 // otherwise the following characteris is not a valid UTF-8 construct 241 if (!(isContinuationChar(b1) 242 && isContinuationChar(b2) 243 && isContinuationChar(b3) 244 && isContinuationChar(b4))) 245 validU8Char = false; 246 else 247 i += 4; 248 } 249 // a six-bytes sequence was encoutered 250 else if (isSixBytesSequence(b0)) { 251 // there must be five continuation bytes of the form 10xxxxxx, 252 // otherwise the following characteris is not a valid UTF-8 construct 253 if (!(isContinuationChar(b1) 254 && isContinuationChar(b2) 255 && isContinuationChar(b3) 256 && isContinuationChar(b4) 257 && isContinuationChar(b5))) 258 validU8Char = false; 259 else 260 i += 5; 261 } 262 else 263 validU8Char = false; 264 } 265 if (!validU8Char) 266 break; 267 i++; 268 } 269 // if no byte with an high order bit set, the encoding is US-ASCII 270 // (it might have been UTF-7, but this encoding is usually internally used only by mail systems) 271 if (!highOrderBit) { 272 // returns the default charset rather than US-ASCII if the enforce8Bit flag is set. 273 if (this.enforce8Bit) 274 return this.defaultCharset; 275 else 276 return Charset.forName("US-ASCII"); 277 } 278 // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8, 279 // otherwise the file would not be human readable 280 if (validU8Char) 281 return Charset.forName("UTF-8"); 282 // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding 283 return this.defaultCharset; 284 } 285 286 /** 287 * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character; 288 * 289 * @param b a byte. 290 * @return true if it's a continuation char. 291 */ 292 private static boolean isContinuationChar(byte b) { 293 return -128 <= b && b <= -65; 294 } 295 296 /** 297 * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character. 298 * 299 * @param b a byte. 300 * @return true if it's the first byte of a two-bytes sequence. 301 */ 302 private static boolean isTwoBytesSequence(byte b) { 303 return -64 <= b && b <= -33; 304 } 305 306 /** 307 * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character. 308 * 309 * @param b a byte. 310 * @return true if it's the first byte of a three-bytes sequence. 311 */ 312 private static boolean isThreeBytesSequence(byte b) { 313 return -32 <= b && b <= -17; 314 } 315 316 /** 317 * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character. 318 * 319 * @param b a byte. 320 * @return true if it's the first byte of a four-bytes sequence. 321 */ 322 private static boolean isFourBytesSequence(byte b) { 323 return -16 <= b && b <= -9; 324 } 325 326 /** 327 * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character. 328 * 329 * @param b a byte. 330 * @return true if it's the first byte of a five-bytes sequence. 331 */ 332 private static boolean isFiveBytesSequence(byte b) { 333 return -8 <= b && b <= -5; 334 } 335 336 /** 337 * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character. 338 * 339 * @param b a byte. 340 * @return true if it's the first byte of a six-bytes sequence. 341 */ 342 private static boolean isSixBytesSequence(byte b) { 343 return -4 <= b && b <= -3; 344 } 345 346 /** 347 * Retrieve the default charset of the system. 348 * 349 * @return the default <code>Charset</code>. 350 */ 351 public static Charset getDefaultSystemCharset() { 352 return Charset.forName(System.getProperty("file.encoding")); 353 } 354 355 /** 356 * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors). 357 * 358 * @return true if the buffer has a BOM for UTF8. 359 */ 360 public boolean hasUTF8Bom() { 361 if (buffer.length >= 3) 362 return (buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65); 363 else 364 return false; 365 } 366 367 /** 368 * Has a Byte Order Marker for UTF-16 Low Endian 369 * (ucs-2le, ucs-4le, and ucs-16le). 370 * 371 * @return true if the buffer has a BOM for UTF-16 Low Endian. 372 */ 373 public boolean hasUTF16LEBom() { 374 if (buffer.length >= 2) 375 return (buffer[0] == -1 && buffer[1] == -2); 376 else 377 return false; 378 } 379 380 /** 381 * Has a Byte Order Marker for UTF-16 Big Endian 382 * (utf-16 and ucs-2). 383 * 384 * @return true if the buffer has a BOM for UTF-16 Big Endian. 385 */ 386 public boolean hasUTF16BEBom() { 387 if (buffer.length >= 2) 388 return (buffer[0] == -2 && buffer[1] == -1); 389 else 390 return false; 391 } 392 393 /** 394 * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code> 395 * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the 396 * method <code>guessEncoding()</code>. 397 * 398 * @return a <code>BufferedReader</code> 399 * @throws FileNotFoundException if the file is not found. 400 */ 401 public BufferedReader getReader() throws FileNotFoundException { 402 LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset())); 403 if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) { 404 try { 405 reader.read(); 406 } 407 catch (IOException e) { 408 // should never happen, as a file with no content 409 // but with a BOM has at least one char 410 } 411 } 412 return reader; 413 } 414 415 /** 416 * Retrieves all the available <code>Charset</code>s on the platform, 417 * among which the default <code>charset</code>. 418 * 419 * @return an array of <code>Charset</code>s. 420 */ 421 public static Charset[] getAvailableCharsets() { 422 Collection collection = Charset.availableCharsets().values(); 423 return (Charset[]) collection.toArray(new Charset[collection.size()]); 424 } 425 }