View Javadoc

1   /**
2   *******************************************************************************
3   * Copyright (C) 2005-2009, International Business Machines Corporation and    *
4   * others. All Rights Reserved.                                                *
5   *******************************************************************************
6   */
7   package org.apache.tika.parser.txt;
8   
9   import java.io.InputStream;
10  import java.io.Reader;
11  import java.io.IOException;
12  import java.nio.charset.Charset;
13  import java.util.ArrayList;
14  import java.util.Collections;
15  import java.util.Arrays;
16  
17  
18  /**
19   * <code>CharsetDetector</code> provides a facility for detecting the
20   * charset or encoding of character data in an unknown format.
21   * The input data can either be from an input stream or an array of bytes.
22   * The result of the detection operation is a list of possibly matching
23   * charsets, or, for simple use, you can just ask for a Java Reader that
24   * will work over the input data.
25   * <p/>
26   * Character set detection is at best an imprecise operation.  The detection
27   * process will attempt to identify the charset that best matches the characteristics
28   * of the byte data, but the process is partly statistical in nature, and
29   * the results can not be guaranteed to always be correct.
30   * <p/>
31   * For best accuracy in charset detection, the input data should be primarily
32   * in a single language, and a minimum of a few hundred bytes worth of plain text
33   * in the language are needed.  The detection process will attempt to
34   * ignore html or xml style markup that could otherwise obscure the content.
35   * <p/>
36   * @stable ICU 3.4
37   */
38  public class CharsetDetector {
39  
40  //   Question: Should we have getters corresponding to the setters for input text
41  //   and declared encoding?
42  
43  //   A thought: If we were to create our own type of Java Reader, we could defer
44  //   figuring out an actual charset for data that starts out with too much English
45  //   only ASCII until the user actually read through to something that didn't look
46  //   like 7 bit English.  If  nothing else ever appeared, we would never need to
47  //   actually choose the "real" charset.  All assuming that the application just
48  //   wants the data, and doesn't care about a char set name.
49  
50      /**
51       *   Constructor
52       * 
53       * @stable ICU 3.4
54       */
55      public CharsetDetector() {
56      }
57  
58      /**
59       * Set the declared encoding for charset detection.
60       *  The declared encoding of an input text is an encoding obtained
61       *  from an http header or xml declaration or similar source that
62       *  can be provided as additional information to the charset detector.  
63       *  A match between a declared encoding and a possible detected encoding
64       *  will raise the quality of that detected encoding by a small delta,
65       *  and will also appear as a "reason" for the match.
66       * <p/>
67       * A declared encoding that is incompatible with the input data being
68       * analyzed will not be added to the list of possible encodings.
69       * 
70       *  @param encoding The declared encoding 
71       *
72       * @stable ICU 3.4
73       */
74      public CharsetDetector setDeclaredEncoding(String encoding) {
75          setCanonicalDeclaredEncoding(encoding);
76          return this;
77      }
78      
79      /**
80       * Set the input text (byte) data whose charset is to be detected.
81       * 
82       * @param in the input text of unknown encoding
83       * 
84       * @return This CharsetDetector
85       *
86       * @stable ICU 3.4
87       */
88      public CharsetDetector setText(byte [] in) {
89          fRawInput  = in;
90          fRawLength = in.length;
91          
92          MungeInput();
93          
94          return this;
95      }
96      
97      private static final int kBufSize = 8000;
98  
99      private static final int MAX_CONFIDENCE = 100;
100 
101     /**
102      * Set the input text (byte) data whose charset is to be detected.
103      *  <p/>
104      *   The input stream that supplies the character data must have markSupported()
105      *   == true; the charset detection process will read a small amount of data,
106      *   then return the stream to its original position via
107      *   the InputStream.reset() operation.  The exact amount that will
108      *   be read depends on the characteristics of the data itself.
109      *
110      * @param in the input text of unknown encoding
111      * 
112      * @return This CharsetDetector
113      *
114      * @stable ICU 3.4
115      */
116     
117     public CharsetDetector setText(InputStream in) throws IOException {
118         fInputStream = in;
119         fInputStream.mark(kBufSize);
120         fRawInput = new byte[kBufSize];   // Always make a new buffer because the
121                                           //   previous one may have come from the caller,
122                                           //   in which case we can't touch it.
123         fRawLength = 0;
124         int remainingLength = kBufSize;
125         while (remainingLength > 0 ) {
126             // read() may give data in smallish chunks, esp. for remote sources.  Hence, this loop.
127             int  bytesRead = fInputStream.read(fRawInput, fRawLength, remainingLength);
128             if (bytesRead <= 0) {
129                  break;
130             }
131             fRawLength += bytesRead;
132             remainingLength -= bytesRead;
133         }
134         fInputStream.reset();
135         
136         MungeInput();                     // Strip html markup, collect byte stats.
137         return this;
138     }
139 
140   
141     /**
142      * Return the charset that best matches the supplied input data.
143      * 
144      * Note though, that because the detection 
145      * only looks at the start of the input data,
146      * there is a possibility that the returned charset will fail to handle
147      * the full set of input data.
148      * <p/>
149      * Raise an exception if 
150      *  <ul>
151      *    <li>no charset appears to match the data.</li>
152      *    <li>no input text has been provided</li>
153      *  </ul>
154      *
155      * @return a CharsetMatch object representing the best matching charset, or
156      *         <code>null</code> if there are no matches.
157      *
158      * @stable ICU 3.4
159      */
160     public CharsetMatch detect() {
161 //   TODO:  A better implementation would be to copy the detect loop from
162 //          detectAll(), and cut it short as soon as a match with a high confidence
163 //          is found.  This is something to be done later, after things are otherwise
164 //          working.
165         CharsetMatch matches[] = detectAll();
166         
167         if (matches == null || matches.length == 0) {
168             return null;
169         }
170         
171         return matches[0];
172      }
173     
174     /**
175      *  Return an array of all charsets that appear to be plausible
176      *  matches with the input data.  The array is ordered with the
177      *  best quality match first.
178      * <p/>
179      * Raise an exception if 
180      *  <ul>
181      *    <li>no charsets appear to match the input data.</li>
182      *    <li>no input text has been provided</li>
183      *  </ul>
184      * 
185      * @return An array of CharsetMatch objects representing possibly matching charsets.
186      *
187      * @stable ICU 3.4
188      */
189     public CharsetMatch[] detectAll() {
190         CharsetRecognizer csr;
191         int               i;
192         int               detectResults;
193         int               confidence;
194         ArrayList<CharsetMatch> matches = new ArrayList<CharsetMatch>();
195         
196         //  Iterate over all possible charsets, remember all that
197         //    give a match quality > 0.
198         for (i=0; i<fCSRecognizers.size(); i++) {
199             csr = fCSRecognizers.get(i);
200             detectResults = csr.match(this);
201             confidence = detectResults & 0x000000ff;
202             if (confidence > 0) {
203                 // Just to be safe, constrain
204                 confidence = Math.min(confidence, MAX_CONFIDENCE);
205                 
206                 // Apply charset hint.
207                 if ((fDeclaredEncoding != null) && (fDeclaredEncoding.equalsIgnoreCase(csr.getName()))) {
208                     // Reduce lack of confidence (delta between "sure" and current) by 50%.
209                     confidence += (MAX_CONFIDENCE - confidence)/2;
210                 }
211                 
212                 CharsetMatch  m = new CharsetMatch(this, csr, confidence);
213                 matches.add(m);
214             }
215         }
216         
217         Collections.sort(matches);      // CharsetMatch compares on confidence
218         Collections.reverse(matches);   //  Put best match first.
219         CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
220         resultArray = (CharsetMatch[]) matches.toArray(resultArray);
221         return resultArray;
222     }
223 
224     
225     /**
226      * Autodetect the charset of an inputStream, and return a Java Reader
227      * to access the converted input data.
228      * <p/>
229      * This is a convenience method that is equivalent to
230      *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
231      * <p/>
232      *   For the input stream that supplies the character data, markSupported()
233      *   must be true; the  charset detection will read a small amount of data,
234      *   then return the stream to its original position via
235      *   the InputStream.reset() operation.  The exact amount that will
236      *    be read depends on the characteristics of the data itself.
237      *<p/>
238      * Raise an exception if no charsets appear to match the input data.
239      * 
240      * @param in The source of the byte data in the unknown charset.
241      *
242      * @param declaredEncoding  A declared encoding for the data, if available,
243      *           or null or an empty string if none is available.
244      *
245      * @stable ICU 3.4
246      */
247     public Reader getReader(InputStream in, String declaredEncoding) {
248         setCanonicalDeclaredEncoding(declaredEncoding);
249         
250         try {
251             setText(in);
252             
253             CharsetMatch match = detect();
254             
255             if (match == null) {
256                 return null;
257             }
258             
259             return match.getReader();
260         } catch (IOException e) {
261             return null;
262         }
263     }
264 
265     /**
266      * Autodetect the charset of an inputStream, and return a String
267      * containing the converted input data.
268      * <p/>
269      * This is a convenience method that is equivalent to
270      *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
271      *<p/>
272      * Raise an exception if no charsets appear to match the input data.
273      * 
274      * @param in The source of the byte data in the unknown charset.
275      *
276      * @param declaredEncoding  A declared encoding for the data, if available,
277      *           or null or an empty string if none is available.
278      *
279      * @stable ICU 3.4
280      */
281     public String getString(byte[] in, String declaredEncoding) {
282         setCanonicalDeclaredEncoding(declaredEncoding);
283        
284         try {
285             setText(in);
286             
287             CharsetMatch match = detect();
288             
289             if (match == null) {
290                 return null;
291             }
292             
293             return match.getString(-1);
294         } catch (IOException e) {
295             return null;
296         }
297     }
298 
299  
300     /**
301      * Get the names of all char sets that can be recognized by the char set detector.
302      *
303      * @return an array of the names of all charsets that can be recognized
304      * by the charset detector.
305      *
306      * @stable ICU 3.4
307      */
308     public static String[] getAllDetectableCharsets() {
309         return fCharsetNames;
310     }
311     
312     /**
313      * Test whether or not input filtering is enabled.
314      * 
315      * @return <code>true</code> if input text will be filtered.
316      * 
317      * @see #enableInputFilter
318      *
319      * @stable ICU 3.4
320      */
321     public boolean inputFilterEnabled()
322     {
323         return fStripTags;
324     }
325     
326     /**
327      * Enable filtering of input text. If filtering is enabled,
328      * text within angle brackets ("<" and ">") will be removed
329      * before detection.
330      * 
331      * @param filter <code>true</code> to enable input text filtering.
332      * 
333      * @return The previous setting.
334      *
335      * @stable ICU 3.4
336      */
337     public boolean enableInputFilter(boolean filter)
338     {
339         boolean previous = fStripTags;
340         
341         fStripTags = filter;
342         
343         return previous;
344     }
345     
346     /**
347      * Try to set fDeclaredEncoding to the canonical name for <encoding>, if it exists.
348      * 
349      * @param encoding - name of character encoding
350      */
351     private void setCanonicalDeclaredEncoding(String encoding) {
352         Charset cs = Charset.forName(encoding);
353         if (cs != null) {
354             fDeclaredEncoding = cs.name();
355         }
356     }
357     
358     /*
359      *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
360      *               it by removing what appears to be html markup.
361      */
362     private void MungeInput() {
363         int srci = 0;
364         int dsti = 0;
365         byte b;
366         boolean  inMarkup = false;
367         int      openTags = 0;
368         int      badTags  = 0;
369         
370         //
371         //  html / xml markup stripping.
372         //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
373         //     discard everything within < brackets >
374         //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
375         //     guess as to whether the input was actually marked up at all.
376         if (fStripTags) {
377             for (srci = 0; srci < fRawLength && dsti < fInputBytes.length; srci++) {
378                 b = fRawInput[srci];
379                 if (b == (byte)'<') {
380                     if (inMarkup) {
381                         badTags++;
382                     }
383                     inMarkup = true;
384                     openTags++;
385                 }
386                 
387                 if (! inMarkup) {
388                     fInputBytes[dsti++] = b;
389                 }
390                 
391                 if (b == (byte)'>') {
392                     inMarkup = false;
393                 }        
394             }
395             
396             fInputLen = dsti;
397         }
398         
399         //
400         //  If it looks like this input wasn't marked up, or if it looks like it's
401         //    essentially nothing but markup abandon the markup stripping.
402         //    Detection will have to work on the unstripped input.
403         //
404         if (openTags<5 || openTags/5 < badTags || 
405                 (fInputLen < 100 && fRawLength>600)) {
406             int limit = fRawLength;
407             
408             if (limit > kBufSize) {
409                 limit = kBufSize;
410             }
411             
412             for (srci=0; srci<limit; srci++) {
413                 fInputBytes[srci] = fRawInput[srci];
414             }
415             fInputLen = srci;
416         }
417         
418         //
419         // Tally up the byte occurence statistics.
420         //   These are available for use by the various detectors.
421         //
422         Arrays.fill(fByteStats, (short)0);
423         for (srci=0; srci<fInputLen; srci++) {
424             int val = fInputBytes[srci] & 0x00ff;
425             fByteStats[val]++;
426         }
427         
428         fC1Bytes = false;
429         for (int i = 0x80; i <= 0x9F; i += 1) {
430             if (fByteStats[i] != 0) {
431                 fC1Bytes = true;
432                 break;
433             }
434         }
435      }
436 
437     /*
438      *  The following items are accessed by individual CharsetRecongizers during
439      *     the recognition process
440      * 
441      */
442     byte[]      fInputBytes =       // The text to be checked.  Markup will have been
443                    new byte[kBufSize];  //   removed if appropriate.
444     
445     int         fInputLen;          // Length of the byte data in fInputText.
446     
447     short       fByteStats[] =      // byte frequency statistics for the input text.
448                    new short[256];  //   Value is percent, not absolute.
449                                     //   Value is rounded up, so zero really means zero occurences.
450     
451     boolean     fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
452                    false;
453     
454     String      fDeclaredEncoding;
455     
456     
457 
458     //
459     //  Stuff private to CharsetDetector
460     //
461     byte[]               fRawInput;     // Original, untouched input bytes.
462                                         //  If user gave us a byte array, this is it.
463                                         //  If user gave us a stream, it's read to a 
464                                         //  buffer here.
465     int                  fRawLength;    // Length of data in fRawInput array.
466     
467     InputStream          fInputStream;  // User's input stream, or null if the user
468                                         //   gave us a byte array.
469      
470     boolean              fStripTags =   // If true, setText() will strip tags from input text.
471                            false;
472     
473     
474     /*
475      * List of recognizers for all charsets known to the implementation.
476      */
477     private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
478     private static String [] fCharsetNames;
479     
480     /*
481      * Create the singleton instances of the CharsetRecognizer classes
482      */
483     private static ArrayList<CharsetRecognizer> createRecognizers() {
484         ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
485         
486         recognizers.add(new CharsetRecog_UTF8());
487         
488         recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
489         recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
490         recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
491         recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
492         
493         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
494         recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
495         recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
496         recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
497         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
498         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
499         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
500         recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
501         
502         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_da());
503         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_de());
504         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_en());
505         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_es());
506         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_fr());
507         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_it());
508         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_nl());
509         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_no());
510         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_pt());
511         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1_sv());
512         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_cs());
513         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_hu());
514         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_pl());
515         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2_ro());
516         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
517         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
518         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
519         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
520         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
521         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
522         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
523         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
524         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
525         
526         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
527         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
528         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
529         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
530         
531         // Create an array of all charset names, as a side effect.
532         // Needed for the getAllDetectableCharsets() API.
533         String[] charsetNames = new String [recognizers.size()];
534         int out = 0;
535         
536         for (int i = 0; i < recognizers.size(); i++) {
537             String name = ((CharsetRecognizer)recognizers.get(i)).getName();
538             
539             if (out == 0 || ! name.equals(charsetNames[out - 1])) {
540                 charsetNames[out++] = name;
541             }
542         }
543         
544         fCharsetNames = new String[out];
545         System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
546         
547         return recognizers;
548     }
549 }