View Javadoc

1   /**
2   *******************************************************************************
3   * Copyright (C) 2005-2007, International Business Machines Corporation and    *
4   * others. All Rights Reserved.                                                *
5   *******************************************************************************
6   */
7   package org.apache.tika.parser.txt;
8   
9   import java.io.ByteArrayInputStream;
10  import java.io.IOException;
11  import java.io.InputStream;
12  import java.io.InputStreamReader;
13  import java.io.Reader;
14  
15  
16  /**
17   * This class represents a charset that has been identified by a CharsetDetector
18   * as a possible encoding for a set of input data.  From an instance of this
19   * class, you can ask for a confidence level in the charset identification,
20   * or for Java Reader or String to access the original byte data in Unicode form.
21   * <p/>
22   * Instances of this class are created only by CharsetDetectors.
23   * <p/>
24   * Note:  this class has a natural ordering that is inconsistent with equals.
25   *        The natural ordering is based on the match confidence value.
26   *
27   * @stable ICU 3.4
28   */
29  public class CharsetMatch implements Comparable<CharsetMatch> {
30  
31      
32      /**
33       * Create a java.io.Reader for reading the Unicode character data corresponding
34       * to the original byte data supplied to the Charset detect operation.
35       * <p/>
36       * CAUTION:  if the source of the byte data was an InputStream, a Reader
37       * can be created for only one matching char set using this method.  If more 
38       * than one charset needs to be tried, the caller will need to reset
39       * the InputStream and create InputStreamReaders itself, based on the charset name.
40       *
41       * @return the Reader for the Unicode character data.
42       *
43       * @stable ICU 3.4
44       */
45      public Reader getReader() {
46          InputStream inputStream = fInputStream;
47          
48          if (inputStream == null) {
49              inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
50          }
51          
52          try {
53              inputStream.reset();
54              return new InputStreamReader(inputStream, getName());
55          } catch (IOException e) {
56              return null;
57          }
58      }
59  
60      /**
61       * Create a Java String from Unicode character data corresponding
62       * to the original byte data supplied to the Charset detect operation.
63       *
64       * @return a String created from the converted input data.
65       *
66       * @stable ICU 3.4
67       */
68      public String getString()  throws java.io.IOException {
69          return getString(-1);
70  
71      }
72  
73      /**
74       * Create a Java String from Unicode character data corresponding
75       * to the original byte data supplied to the Charset detect operation.
76       * The length of the returned string is limited to the specified size;
77       * the string will be trunctated to this length if necessary.  A limit value of
78       * zero or less is ignored, and treated as no limit.
79       *
80       * @param maxLength The maximium length of the String to be created when the
81       *                  source of the data is an input stream, or -1 for
82       *                  unlimited length.
83       * @return a String created from the converted input data.
84       *
85       * @stable ICU 3.4
86       */
87      public String getString(int maxLength) throws java.io.IOException {
88          String result = null;
89          if (fInputStream != null) {
90              StringBuffer sb = new StringBuffer();
91              char[] buffer = new char[1024];
92              Reader reader = getReader();
93              int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
94              int bytesRead = 0;
95              
96              while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
97                  sb.append(buffer, 0, bytesRead);
98                  max -= bytesRead;
99              }
100             
101             reader.close();
102             
103             return sb.toString();
104         } else {
105             result = new String(fRawInput, getName());            
106         }
107         return result;
108 
109     }
110     
111     /**
112      * Get an indication of the confidence in the charset detected.
113      * Confidence values range from 0-100, with larger numbers indicating
114      * a better match of the input data to the characteristics of the
115      * charset.
116      *
117      * @return the confidence in the charset match
118      *
119      * @stable ICU 3.4
120      */
121     public int getConfidence() {
122         return fConfidence;
123     }
124     
125 
126     /**
127      * Bit flag indicating the match is based on the the encoding scheme.
128      *
129      * @see #getMatchType
130      * @stable ICU 3.4
131      */
132     static public final int ENCODING_SCHEME    = 1;
133     
134     /**
135      * Bit flag indicating the match is based on the presence of a BOM.
136      * 
137      * @see #getMatchType
138      * @stable ICU 3.4
139      */
140     static public final int BOM                = 2;
141     
142     /**
143      * Bit flag indicating he match is based on the declared encoding.
144      * 
145      * @see #getMatchType
146      * @stable ICU 3.4
147      */
148     static public final int DECLARED_ENCODING  = 4;
149     
150     /**
151      * Bit flag indicating the match is based on language statistics.
152      *
153      * @see #getMatchType
154      * @stable ICU 3.4
155      */
156     static public final int LANG_STATISTICS    = 8;
157     
158     /**
159      * Return flags indicating what it was about the input data 
160      * that caused this charset to be considered as a possible match.
161      * The result is a bitfield containing zero or more of the flags
162      * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
163      * A result of zero means no information is available.
164      * <p>
165      * Note: currently, this method always returns zero.
166      * <p>
167      *
168      * @return the type of match found for this charset.
169      *
170      * @draft ICU 3.4
171      * @provisional This API might change or be removed in a future release.
172      */
173     public int getMatchType() {
174 //      TODO: create a list of enum-like constants for common combinations of types of matches.
175         return 0;
176     }
177 
178     /**
179      * Get the name of the detected charset.  
180      * The name will be one that can be used with other APIs on the
181      * platform that accept charset names.  It is the "Canonical name"
182      * as defined by the class java.nio.charset.Charset; for
183      * charsets that are registered with the IANA charset registry,
184      * this is the MIME-preferred registerd name.
185      *
186      * @see java.nio.charset.Charset
187      * @see java.io.InputStreamReader
188      *
189      * @return The name of the charset.
190      *
191      * @stable ICU 3.4
192      */
193     public String getName() {
194         return fRecognizer.getName();
195     }
196     
197     /**
198      * Get the ISO code for the language of the detected charset.  
199      *
200      * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
201      *
202      * @stable ICU 3.4
203      */
204     public String getLanguage() {
205         return fRecognizer.getLanguage();
206     }
207 
208     /**
209      * Compare to other CharsetMatch objects.
210      * Comparison is based on the match confidence value, which 
211      *   allows CharsetDetector.detectAll() to order its results. 
212      *
213      * @param o the CharsetMatch object to compare against.
214      * @return  a negative integer, zero, or a positive integer as the 
215      *          confidence level of this CharsetMatch
216      *          is less than, equal to, or greater than that of
217      *          the argument.
218      * @throws ClassCastException if the argument is not a CharsetMatch.
219      * @stable ICU 3.4
220      */
221     public int compareTo(CharsetMatch other) {
222         int compareResult = 0;
223         if (this.fConfidence > other.fConfidence) {
224             compareResult = 1;
225         } else if (this.fConfidence < other.fConfidence) {
226             compareResult = -1;
227         }
228         return compareResult;
229     }
230     
231     /*
232      *  Constructor.  Implementation internal
233      */
234     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
235         fRecognizer = rec;
236         fConfidence = conf;
237         
238         // The references to the original aplication input data must be copied out
239         //   of the charset recognizer to here, in case the application resets the
240         //   recognizer before using this CharsetMatch.
241         if (det.fInputStream == null) {
242             // We only want the existing input byte data if it came straight from the user,
243             //   not if is just the head of a stream.
244             fRawInput    = det.fRawInput;
245             fRawLength   = det.fRawLength;
246         }
247         fInputStream = det.fInputStream;
248     }
249 
250     
251     //
252     //   Private Data
253     //
254     private int                 fConfidence;
255     private CharsetRecognizer   fRecognizer;
256     private byte[]              fRawInput = null;     // Original, untouched input bytes.
257                                                       //  If user gave us a byte array, this is it.
258     private int                 fRawLength;           // Length of data in fRawInput array.
259 
260     private InputStream         fInputStream = null;  // User's input stream, or null if the user
261                                                       //   gave us a byte array.
262 
263 }