001    /*
002     * $Id: CharsetToolkit.java 4112 2006-10-13 13:21:25Z blackdrag $
003     *
004     * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved.
005     *
006     * Redistribution and use of this software and associated documentation
007     * ("Software"), with or without modification, are permitted provided that the
008     * following conditions are met:
009     *  1. Redistributions of source code must retain copyright statements and
010     * notices. Redistributions must also contain a copy of this document.
011     *  2. Redistributions in binary form must reproduce the above copyright
012     * notice, this list of conditions and the following disclaimer in the
013     * documentation and/or other materials provided with the distribution.
014     *  3. The name "groovy" must not be used to endorse or promote products
015     * derived from this Software without prior written permission of The Codehaus.
016     * For written permission, please contact info@codehaus.org.
017     *  4. Products derived from this Software may not be called "groovy" nor may
018     * "groovy" appear in their names without prior written permission of The
019     * Codehaus. "groovy" is a registered trademark of The Codehaus.
020     *  5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/
021     *
022     * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY
023     * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
024     * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
025     * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR
026     * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
027     * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
028     * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
029     * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
030     * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
031     * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
032     * DAMAGE.
033     *
034     */
035    
036    package groovy.util;
037    
038    import java.io.*;
039    import java.nio.charset.Charset;
040    import java.util.*;
041    
/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 * is wide enough, the charset should also be discovered.</p>
 *
 * <p>The guess is based on the first 4KB of the file, which is usually sufficient.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // create a toolkit for the file and guess its encoding
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the correct charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null)
 * {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Maximum number of bytes sampled from the beginning of the file. */
    private static final int BUFFER_SIZE = 4096;

    private byte[] buffer;
    private Charset defaultCharset;
    private Charset charset;
    private boolean enforce8Bit = true;
    private File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * Reads up to the first 4KB of the file into an internal buffer, which is
     * later used to guess the encoding.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;
        InputStream input = new FileInputStream(file);
        try {
            byte[] bytes = new byte[BUFFER_SIZE];
            int total = 0;
            // a single read() call may return fewer bytes than are available,
            // so keep reading until the sample buffer is full or the end of file is reached
            while (total < bytes.length) {
                int count = input.read(bytes, total, bytes.length - total);
                if (count < 0) {
                    break;
                }
                total += count;
            }
            if (total == bytes.length) {
                this.buffer = bytes;
            } else {
                // keep only the bytes actually read (possibly none for an empty file)
                byte[] bytesToGuess = new byte[total];
                System.arraycopy(bytes, 0, bytesToGuess, 0, total);
                this.buffer = bytesToGuess;
            }
        } finally {
            try {
                input.close();
            } catch (IOException ignored) {
                // best effort: the sample has already been read, a failure to close is not actionable
            }
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>.
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code>
     * if an 8-bit <code>Charset</code> is encountered. If <code>null</code>, the default
     * system charset is used instead.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        } else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * Gets the charset guessed for the file. The guess is computed on the first call
     * and cached for subsequent calls.
     *
     * @return the guessed <code>Charset</code>.
     */
    public Charset getCharset() {
        if (this.charset == null) {
            this.charset = guessEncoding();
        }
        return charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default Charset.
     *
     * @return the <code>Charset</code> returned when an 8-bit encoding is assumed.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the provided buffer.</p>
     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default system encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * <p>The whole buffer is inspected. A multi-byte sequence cut off by the end of the buffer
     * is tolerated only when the buffer is full, i.e. when the cut may be caused by the 4KB
     * sampling limit rather than by the file itself.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom()) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom()) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom()) {
            return Charset.forName("UTF-16BE");
        }

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding of the system
        boolean validU8Char = true;

        final int length = buffer.length;
        // a full buffer means the file may continue beyond the sample,
        // so a sequence cut at the very end is not evidence against UTF-8
        final boolean mayBeTruncated = length >= BUFFER_SIZE;

        int i = 0;
        while (i < length) {
            byte lead = buffer[i];
            if (lead < 0) {
                // a high order bit was encountered, thus the encoding is not US-ASCII
                // it may be either an 8-bit encoding or UTF-8
                highOrderBit = true;

                int expected = expectedContinuationBytes(lead);
                if (expected == 0) {
                    // stray continuation byte, 0xFE or 0xFF: not a valid UTF-8 lead byte
                    validU8Char = false;
                    break;
                }

                // every continuation byte present in the buffer must have the form 10xxxxxx
                int available = Math.min(expected, length - i - 1);
                for (int k = 1; k <= available; k++) {
                    if (!isContinuationChar(buffer[i + k])) {
                        validU8Char = false;
                        break;
                    }
                }
                if (!validU8Char) {
                    break;
                }
                if (available < expected) {
                    // the sequence runs past the end of the buffer
                    if (!mayBeTruncated) {
                        validU8Char = false;
                    }
                    break;
                }
                // skip the continuation bytes that were just validated
                i += expected;
            }
            i++;
        }

        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }
        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char) {
            return Charset.forName("UTF-8");
        }
        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * Gives the number of continuation bytes that must follow the given lead byte.
     *
     * @param lead a byte with its high order bit set.
     * @return a number between 1 and 5, or 0 if the byte cannot start a multi-byte sequence.
     */
    private static int expectedContinuationBytes(byte lead) {
        if (isTwoBytesSequence(lead)) {
            return 1;
        }
        if (isThreeBytesSequence(lead)) {
            return 2;
        }
        if (isFourBytesSequence(lead)) {
            return 3;
        }
        if (isFiveBytesSequence(lead)) {
            return 4;
        }
        if (isSixBytesSequence(lead)) {
            return 5;
        }
        return 0;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return -128 <= b && b <= -65;
    }

    /**
     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a two-bytes sequence.
     */
    private static boolean isTwoBytesSequence(byte b) {
        return -64 <= b && b <= -33;
    }

    /**
     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a three-bytes sequence.
     */
    private static boolean isThreeBytesSequence(byte b) {
        return -32 <= b && b <= -17;
    }

    /**
     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a four-bytes sequence.
     */
    private static boolean isFourBytesSequence(byte b) {
        return -16 <= b && b <= -9;
    }

    /**
     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a five-bytes sequence.
     */
    private static boolean isFiveBytesSequence(byte b) {
        return -8 <= b && b <= -5;
    }

    /**
     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
     *
     * @param b a byte.
     * @return true if it's the first byte of a six-bytes sequence.
     */
    private static boolean isSixBytesSequence(byte b) {
        return -4 <= b && b <= -3;
    }

    /**
     * Retrieve the default charset of the system, as named by the
     * <code>file.encoding</code> system property.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        return Charset.forName(System.getProperty("file.encoding"));
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        // 0xEF 0xBB 0xBF expressed as signed bytes
        return buffer.length >= 3
            && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Low Endian
     * (ucs-2le, ucs-4le, and ucs-16le).
     *
     * @return true if the buffer has a BOM for UTF-16 Low Endian.
     */
    public boolean hasUTF16LEBom() {
        // 0xFF 0xFE expressed as signed bytes
        return buffer.length >= 2
            && buffer[0] == -1 && buffer[1] == -2;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian
     * (utf-16 and ucs-2).
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        // 0xFE 0xFF expressed as signed bytes
        return buffer.length >= 2
            && buffer[0] == -2 && buffer[1] == -1;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
     * method <code>guessEncoding()</code>. If the file starts with a Byte Order Marker, the marker
     * is consumed so that the first character read by the caller is real content.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // the BOM decodes to a single character, skip it
                reader.read();
            } catch (IOException ignored) {
                // should never happen, as a file with no content
                // but with a BOM has at least one char
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        Collection collection = Charset.availableCharsets().values();
        return (Charset[]) collection.toArray(new Charset[collection.size()]);
    }
}