/*
 * $Id: CharsetToolkit.java,v 1.2 2004/07/11 19:41:25 glaforge Exp $
 *
 * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved.
 *
 * Redistribution and use of this software and associated documentation
 * ("Software"), with or without modification, are permitted provided that the
 * following conditions are met:
 * 1. Redistributions of source code must retain copyright statements and
 * notices. Redistributions must also contain a copy of this document.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. The name "groovy" must not be used to endorse or promote products
 * derived from this Software without prior written permission of The Codehaus.
 * For written permission, please contact info@codehaus.org.
 * 4. Products derived from this Software may not be called "groovy" nor may
 * "groovy" appear in their names without prior written permission of The
 * Codehaus. "groovy" is a registered trademark of The Codehaus.
 * 5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/
 *
 * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

package groovy.util;

import java.io.*;
import java.nio.charset.Charset;
import java.util.*;

/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM,
 * the guess is based on the first 4 KB of the file: if every byte with its high
 * order bit set belongs to a well-formed UTF-8 multi-byte sequence, the file is
 * taken to be UTF-8.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // sample the file and guess its encoding
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the guessed charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null)
 * {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Maximum number of leading bytes of the file that are sampled to guess its encoding. */
    private static final int BUFFER_SIZE = 4096;

    private final byte[] buffer;
    private Charset defaultCharset;
    private Charset charset;
    private boolean enforce8Bit = true;
    private final File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * Reads up to the first 4096 bytes of the file; the stream used for that is
     * closed before the constructor returns.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.buffer = readSample(file);
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;
    }

    /**
     * Reads the leading bytes of the file, up to <code>BUFFER_SIZE</code>.
     *
     * @param file the file to sample.
     * @return an array holding exactly the bytes read (empty for an empty file).
     * @throws IOException if the file cannot be opened or read.
     */
    private static byte[] readSample(File file) throws IOException {
        InputStream input = new FileInputStream(file);
        try {
            byte[] bytes = new byte[BUFFER_SIZE];
            int total = 0;
            // a single read() may legally return fewer bytes than are available,
            // so keep reading until the sample is full or the file is exhausted
            while (total < BUFFER_SIZE) {
                int count = input.read(bytes, total, BUFFER_SIZE - total);
                if (count == -1) {
                    break;
                }
                total += count;
            }
            if (total == BUFFER_SIZE) {
                return bytes;
            }
            byte[] bytesToGuess = new byte[total];
            System.arraycopy(bytes, 0, bytesToGuess, 0, total);
            return bytesToGuess;
        }
        finally {
            input.close();
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>.
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>getCharset()</code>
     * if an 8-bit <code>Charset</code> is encountered; <code>null</code> selects the
     * default system charset.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        }
        else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * Gets the guessed <code>Charset</code> of the file. The guess is computed on the
     * first call and cached, so later changes made through <code>setDefaultCharset()</code>
     * or <code>setEnforce8Bit()</code> do not affect an already computed result.
     *
     * @return the guessed <code>Charset</code>.
     */
    public Charset getCharset() {
        if (this.charset == null) {
            this.charset = guessEncoding();
        }
        return charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     *
     * @param enforce <code>true</code> to return the default charset instead of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return <code>true</code> if the default charset is returned instead of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default <code>Charset</code>.
     *
     * @return the <code>Charset</code> returned when the file is neither recognized as
     * Unicode nor (unless enforce8Bit is off) as US-ASCII.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the sampled buffer.</p>
     *
     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * <p>The whole buffer is examined. A multi-byte sequence that is cut off by the end of
     * the buffer is tolerated only when the buffer is full (the character may continue in the
     * unsampled part of the file); in a shorter buffer it is treated as invalid UTF-8.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom()) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom()) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom()) {
            return Charset.forName("UTF-16BE");
        }

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding
        boolean validU8Char = true;

        int length = buffer.length;
        // a full buffer may stop in the middle of a multi-byte character
        boolean sampleMayBeCut = (length == BUFFER_SIZE);

        int i = 0;
        while (i < length) {
            byte lead = buffer[i];
            if (lead >= 0) {
                // plain 7-bit character
                i++;
                continue;
            }

            // a high order bit was encountered, thus the encoding is not US-ASCII
            // it may be either an 8-bit encoding or UTF-8
            highOrderBit = true;

            int expected = continuationBytesFor(lead);
            if (expected == 0) {
                // neither a valid lead byte nor allowed at this position (stray 10xxxxxx, 0xFE, 0xFF)
                validU8Char = false;
                break;
            }

            // check the continuation bytes that are actually present in the buffer
            int present = Math.min(expected, length - i - 1);
            for (int k = 1; k <= present; k++) {
                if (!isContinuationChar(buffer[i + k])) {
                    validU8Char = false;
                    break;
                }
            }
            if (!validU8Char) {
                break;
            }

            if (present < expected) {
                // the sequence runs past the end of the buffer: only acceptable
                // when the buffer is a truncated sample of a longer file
                if (!sampleMayBeCut) {
                    validU8Char = false;
                }
                break;
            }

            i += expected + 1;
        }

        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }
        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char) {
            return Charset.forName("UTF-8");
        }
        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * Gives the number of continuation bytes that must follow a UTF-8 lead byte.
     *
     * @param lead a byte with its high order bit set.
     * @return a value from 1 to 5 for a lead byte, or 0 if the byte cannot start a multi-byte sequence.
     */
    private static int continuationBytesFor(byte lead) {
        if (isTwoBytesSequence(lead)) {
            return 1;
        }
        if (isThreeBytesSequence(lead)) {
            return 2;
        }
        if (isFourBytesSequence(lead)) {
            return 3;
        }
        if (isFiveBytesSequence(lead)) {
            return 4;
        }
        if (isSixBytesSequence(lead)) {
            return 5;
        }
        return 0;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return -128 <= b && b <= -65;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return -64 <= b && b <= -33;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return -32 <= b && b <= -17;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return -16 <= b && b <= -9;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return -8 <= b && b <= -5;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return -4 <= b && b <= -3;
    }

    /**
     * Retrieve the default charset of the system, as named by the
     * <code>file.encoding</code> system property.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        return Charset.forName(System.getProperty("file.encoding"));
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer starts with the bytes EF BB BF.
     */
    public boolean hasUTF8Bom() {
        return buffer.length >= 3
                && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Little Endian.
     *
     * @return true if the buffer starts with the bytes FF FE.
     */
    public boolean hasUTF16LEBom() {
        return buffer.length >= 2 && buffer[0] == -1 && buffer[1] == -2;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian.
     *
     * @return true if the buffer starts with the bytes FE FF.
     */
    public boolean hasUTF16BEBom() {
        return buffer.length >= 2 && buffer[0] == -2 && buffer[1] == -1;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset returned by
     * <code>getCharset()</code>. When a BOM was detected in the sampled buffer, the first
     * character is consumed so that the caller does not see the BOM.
     * The caller is responsible for closing the returned reader.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        LineNumberReader reader =
                new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // skip the BOM, which is decoded as a single character
                reader.read();
            }
            catch (IOException ignored) {
                // Deliberately best effort: the signature only allows FileNotFoundException,
                // and a file with a BOM has at least one character to read. The reader is
                // still handed back to the caller.
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        return (Charset[]) Charset.availableCharsets().values().toArray(new Charset[0]);
    }
}