1 /** 2 ******************************************************************************* 3 * Copyright (C) 2005-2007, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 package org.apache.tika.parser.txt; 8 9 import java.io.ByteArrayInputStream; 10 import java.io.IOException; 11 import java.io.InputStream; 12 import java.io.InputStreamReader; 13 import java.io.Reader; 14 15 16 /** 17 * This class represents a charset that has been identified by a CharsetDetector 18 * as a possible encoding for a set of input data. From an instance of this 19 * class, you can ask for a confidence level in the charset identification, 20 * or for Java Reader or String to access the original byte data in Unicode form. 21 * <p/> 22 * Instances of this class are created only by CharsetDetectors. 23 * <p/> 24 * Note: this class has a natural ordering that is inconsistent with equals. 25 * The natural ordering is based on the match confidence value. 26 * 27 * @stable ICU 3.4 28 */ 29 public class CharsetMatch implements Comparable<CharsetMatch> { 30 31 32 /** 33 * Create a java.io.Reader for reading the Unicode character data corresponding 34 * to the original byte data supplied to the Charset detect operation. 35 * <p/> 36 * CAUTION: if the source of the byte data was an InputStream, a Reader 37 * can be created for only one matching char set using this method. If more 38 * than one charset needs to be tried, the caller will need to reset 39 * the InputStream and create InputStreamReaders itself, based on the charset name. 40 * 41 * @return the Reader for the Unicode character data. 42 * 43 * @stable ICU 3.4 44 */ 45 public Reader getReader() { 46 InputStream inputStream = fInputStream; 47 48 if (inputStream == null) { 49 inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength); 50 } 51 52 try { 53 inputStream.reset(); 54 return new InputStreamReader(inputStream, getName()); 55 } catch (IOException e) { 56 return null; 57 } 58 } 59 60 /** 61 * Create a Java String from Unicode character data corresponding 62 * to the original byte data supplied to the Charset detect operation. 63 * 64 * @return a String created from the converted input data. 65 * 66 * @stable ICU 3.4 67 */ 68 public String getString() throws java.io.IOException { 69 return getString(-1); 70 71 } 72 73 /** 74 * Create a Java String from Unicode character data corresponding 75 * to the original byte data supplied to the Charset detect operation. 76 * The length of the returned string is limited to the specified size; 77 * the string will be trunctated to this length if necessary. A limit value of 78 * zero or less is ignored, and treated as no limit. 79 * 80 * @param maxLength The maximium length of the String to be created when the 81 * source of the data is an input stream, or -1 for 82 * unlimited length. 83 * @return a String created from the converted input data. 84 * 85 * @stable ICU 3.4 86 */ 87 public String getString(int maxLength) throws java.io.IOException { 88 String result = null; 89 if (fInputStream != null) { 90 StringBuffer sb = new StringBuffer(); 91 char[] buffer = new char[1024]; 92 Reader reader = getReader(); 93 int max = maxLength < 0? Integer.MAX_VALUE : maxLength; 94 int bytesRead = 0; 95 96 while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) { 97 sb.append(buffer, 0, bytesRead); 98 max -= bytesRead; 99 } 100 101 reader.close(); 102 103 return sb.toString(); 104 } else { 105 result = new String(fRawInput, getName()); 106 } 107 return result; 108 109 } 110 111 /** 112 * Get an indication of the confidence in the charset detected. 113 * Confidence values range from 0-100, with larger numbers indicating 114 * a better match of the input data to the characteristics of the 115 * charset. 116 * 117 * @return the confidence in the charset match 118 * 119 * @stable ICU 3.4 120 */ 121 public int getConfidence() { 122 return fConfidence; 123 } 124 125 126 /** 127 * Bit flag indicating the match is based on the the encoding scheme. 128 * 129 * @see #getMatchType 130 * @stable ICU 3.4 131 */ 132 static public final int ENCODING_SCHEME = 1; 133 134 /** 135 * Bit flag indicating the match is based on the presence of a BOM. 136 * 137 * @see #getMatchType 138 * @stable ICU 3.4 139 */ 140 static public final int BOM = 2; 141 142 /** 143 * Bit flag indicating he match is based on the declared encoding. 144 * 145 * @see #getMatchType 146 * @stable ICU 3.4 147 */ 148 static public final int DECLARED_ENCODING = 4; 149 150 /** 151 * Bit flag indicating the match is based on language statistics. 152 * 153 * @see #getMatchType 154 * @stable ICU 3.4 155 */ 156 static public final int LANG_STATISTICS = 8; 157 158 /** 159 * Return flags indicating what it was about the input data 160 * that caused this charset to be considered as a possible match. 161 * The result is a bitfield containing zero or more of the flags 162 * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS. 163 * A result of zero means no information is available. 164 * <p> 165 * Note: currently, this method always returns zero. 166 * <p> 167 * 168 * @return the type of match found for this charset. 169 * 170 * @draft ICU 3.4 171 * @provisional This API might change or be removed in a future release. 172 */ 173 public int getMatchType() { 174 // TODO: create a list of enum-like constants for common combinations of types of matches. 175 return 0; 176 } 177 178 /** 179 * Get the name of the detected charset. 180 * The name will be one that can be used with other APIs on the 181 * platform that accept charset names. It is the "Canonical name" 182 * as defined by the class java.nio.charset.Charset; for 183 * charsets that are registered with the IANA charset registry, 184 * this is the MIME-preferred registerd name. 185 * 186 * @see java.nio.charset.Charset 187 * @see java.io.InputStreamReader 188 * 189 * @return The name of the charset. 190 * 191 * @stable ICU 3.4 192 */ 193 public String getName() { 194 return fRecognizer.getName(); 195 } 196 197 /** 198 * Get the ISO code for the language of the detected charset. 199 * 200 * @return The ISO code for the language or <code>null</code> if the language cannot be determined. 201 * 202 * @stable ICU 3.4 203 */ 204 public String getLanguage() { 205 return fRecognizer.getLanguage(); 206 } 207 208 /** 209 * Compare to other CharsetMatch objects. 210 * Comparison is based on the match confidence value, which 211 * allows CharsetDetector.detectAll() to order its results. 212 * 213 * @param o the CharsetMatch object to compare against. 214 * @return a negative integer, zero, or a positive integer as the 215 * confidence level of this CharsetMatch 216 * is less than, equal to, or greater than that of 217 * the argument. 218 * @throws ClassCastException if the argument is not a CharsetMatch. 219 * @stable ICU 3.4 220 */ 221 public int compareTo(CharsetMatch other) { 222 int compareResult = 0; 223 if (this.fConfidence > other.fConfidence) { 224 compareResult = 1; 225 } else if (this.fConfidence < other.fConfidence) { 226 compareResult = -1; 227 } 228 return compareResult; 229 } 230 231 /* 232 * Constructor. Implementation internal 233 */ 234 CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) { 235 fRecognizer = rec; 236 fConfidence = conf; 237 238 // The references to the original aplication input data must be copied out 239 // of the charset recognizer to here, in case the application resets the 240 // recognizer before using this CharsetMatch. 241 if (det.fInputStream == null) { 242 // We only want the existing input byte data if it came straight from the user, 243 // not if is just the head of a stream. 244 fRawInput = det.fRawInput; 245 fRawLength = det.fRawLength; 246 } 247 fInputStream = det.fInputStream; 248 } 249 250 251 // 252 // Private Data 253 // 254 private int fConfidence; 255 private CharsetRecognizer fRecognizer; 256 private byte[] fRawInput = null; // Original, untouched input bytes. 257 // If user gave us a byte array, this is it. 258 private int fRawLength; // Length of data in fRawInput array. 259 260 private InputStream fInputStream = null; // User's input stream, or null if the user 261 // gave us a byte array. 262 263 }