1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.tika; 18 19 import java.io.BufferedInputStream; 20 import java.io.File; 21 import java.io.IOException; 22 import java.io.InputStream; 23 import java.io.Reader; 24 import java.net.URL; 25 26 import org.apache.tika.config.TikaConfig; 27 import org.apache.tika.detect.Detector; 28 import org.apache.tika.exception.TikaException; 29 import org.apache.tika.metadata.Metadata; 30 import org.apache.tika.metadata.MetadataHelper; 31 import org.apache.tika.parser.AutoDetectParser; 32 import org.apache.tika.parser.ParseContext; 33 import org.apache.tika.parser.Parser; 34 import org.apache.tika.parser.ParsingReader; 35 import org.apache.tika.sax.BodyContentHandler; 36 import org.apache.tika.sax.WriteOutContentHandler; 37 import org.xml.sax.SAXException; 38 39 /** 40 * Facade class for accessing Tika functionality. This class hides much of 41 * the underlying complexity of the lower level Tika classes and provides 42 * simple methods for many common parsing and type detection operations. 43 * 44 * @since Apache Tika 0.5 45 * @see Parser 46 * @see Detector 47 */ 48 public class Tika { 49 50 /** 51 * The detector instance used by this facade. 52 */ 53 private final Detector detector; 54 55 /** 56 * The parser instance used by this facade. 57 */ 58 private final Parser parser; 59 60 /** 61 * Maximum length of the strings returned by the parseToString methods. 62 * Used to prevent out of memory problems with huge input documents. 63 * The default setting is 100k characters. 64 */ 65 private int maxStringLength = 100 * 1000; 66 67 /** 68 * Creates a Tika facade using the given configuration. 69 * 70 * @param config Tika configuration 71 */ 72 public Tika(TikaConfig config) { 73 this.detector = config.getMimeRepository(); 74 this.parser = new AutoDetectParser(config); 75 } 76 77 /** 78 * Creates a Tika facade using the default configuration. 79 */ 80 public Tika() { 81 this(TikaConfig.getDefaultConfig()); 82 } 83 84 /** 85 * Detects the media type of the given document. The type detection is 86 * based on the content of the given document stream and any given 87 * document metadata. The document stream can be <code>null</code>, 88 * in which case only the given document metadata is used for type 89 * detection. 90 * <p> 91 * If the document stream supports the 92 * {@link InputStream#markSupported() mark feature}, then the stream is 93 * marked and reset to the original position before this method returns. 94 * Only a limited number of bytes are read from the stream. 95 * <p> 96 * The given document stream is <em>not</em> closed by this method. 97 * <p> 98 * Unlike in the {@link #parse(InputStream, Metadata)} method, the 99 * given document metadata is <em>not</em> modified by this method. 100 * 101 * @param stream the document stream, or <code>null</code> 102 * @param metadata document metadata 103 * @return detected media type 104 * @throws IOException if the stream can not be read 105 */ 106 public String detect(InputStream stream, Metadata metadata) 107 throws IOException { 108 if (stream == null || stream.markSupported()) { 109 return detector.detect(stream, metadata).toString(); 110 } else { 111 return detector.detect( 112 new BufferedInputStream(stream), metadata).toString(); 113 } 114 } 115 116 /** 117 * Detects the media type of the given document. The type detection is 118 * based on the content of the given document stream. 119 * <p> 120 * If the document stream supports the 121 * {@link InputStream#markSupported() mark feature}, then the stream is 122 * marked and reset to the original position before this method returns. 123 * Only a limited number of bytes are read from the stream. 124 * <p> 125 * The given document stream is <em>not</em> closed by this method. 126 * 127 * @param stream the document stream 128 * @return detected media type 129 * @throws IOException if the stream can not be read 130 */ 131 public String detect(InputStream stream) throws IOException { 132 return detect(stream, new Metadata()); 133 } 134 135 /** 136 * Detects the media type of the given file. The type detection is 137 * based on the document content and a potential known file extension. 138 * <p> 139 * Use the {@link #detect(String)} method when you want to detect the 140 * type of the document without actually accessing the file. 141 * 142 * @param file the file 143 * @return detected media type 144 * @throws IOException if the file can not be read 145 */ 146 public String detect(File file) throws IOException { 147 return detect(file.toURI().toURL()); 148 } 149 150 /** 151 * Detects the media type of the resource at the given URL. The type 152 * detection is based on the document content and a potential known 153 * file extension included in the URL. 154 * <p> 155 * Use the {@link #detect(String)} method when you want to detect the 156 * type of the document without actually accessing the URL. 157 * 158 * @param url the URL of the resource 159 * @return detected media type 160 * @throws IOException if the resource can not be read 161 */ 162 public String detect(URL url) throws IOException { 163 Metadata metadata = new Metadata(); 164 InputStream stream = MetadataHelper.getInputStream(url, metadata); 165 try { 166 return detect(stream, metadata); 167 } finally { 168 stream.close(); 169 } 170 } 171 172 /** 173 * Detects the media type of a document with the given file name. 174 * The type detection is based on known file name extensions. 175 * <p> 176 * The given name can also be a URL or a full file path. In such cases 177 * only the file name part of the string is used for type detection. 178 * 179 * @param name the file name of the document 180 * @return detected media type 181 */ 182 public String detect(String name) { 183 Metadata metadata = new Metadata(); 184 metadata.set(Metadata.RESOURCE_NAME_KEY, name); 185 try { 186 return detect(null, metadata); 187 } catch (IOException e) { 188 throw new IllegalStateException("Unexpected IOException", e); 189 } 190 } 191 192 /** 193 * Parses the given document and returns the extracted text content. 194 * Input metadata like a file name or a content type hint can be passed 195 * in the given metadata instance. Metadata information extracted from 196 * the document is returned in that same metadata instance. 197 * 198 * @param stream the document to be parsed 199 * @return extracted text content 200 * @throws IOException if the document can not be read or parsed 201 */ 202 public Reader parse(InputStream stream, Metadata metadata) 203 throws IOException { 204 ParseContext context = new ParseContext(); 205 context.set(Parser.class, parser); 206 return new ParsingReader(parser, stream, metadata, context); 207 } 208 209 /** 210 * Parses the given document and returns the extracted text content. 211 * 212 * @param stream the document to be parsed 213 * @return extracted text content 214 * @throws IOException if the document can not be read or parsed 215 */ 216 public Reader parse(InputStream stream) throws IOException { 217 return parse(stream, new Metadata()); 218 } 219 220 /** 221 * Parses the given file and returns the extracted text content. 222 * 223 * @param file the file to be parsed 224 * @return extracted text content 225 * @throws IOException if the file can not be read or parsed 226 */ 227 public Reader parse(File file) throws IOException { 228 return parse(file.toURI().toURL()); 229 } 230 231 /** 232 * Parses the resource at the given URL and returns the extracted 233 * text content. 234 * 235 * @param url the URL of the resource to be parsed 236 * @return extracted text content 237 * @throws IOException if the resource can not be read or parsed 238 */ 239 public Reader parse(URL url) throws IOException { 240 Metadata metadata = new Metadata(); 241 InputStream stream = MetadataHelper.getInputStream(url, metadata); 242 return parse(stream, metadata); 243 } 244 245 /** 246 * Parses the given document and returns the extracted text content. 247 * The given input stream is closed by this method. 248 * <p> 249 * To avoid unpredictable excess memory use, the returned string contains 250 * only up to {@link #getMaxStringLength()} first characters extracted 251 * from the input document. Use the {@link #setMaxStringLength(int)} 252 * method to adjust this limitation. 253 * 254 * @param stream the document to be parsed 255 * @param metadata document metadata 256 * @return extracted text content 257 * @throws IOException if the document can not be read 258 * @throws TikaException if the document can not be parsed 259 */ 260 public String parseToString(InputStream stream, Metadata metadata) 261 throws IOException, TikaException { 262 WriteOutContentHandler handler = 263 new WriteOutContentHandler(maxStringLength); 264 try { 265 ParseContext context = new ParseContext(); 266 context.set(Parser.class, parser); 267 parser.parse( 268 stream, new BodyContentHandler(handler), metadata, context); 269 } catch (SAXException e) { 270 if (!handler.isWriteLimitReached(e)) { 271 // This should never happen with BodyContentHandler... 272 throw new TikaException("Unexpected SAX processing failure", e); 273 } 274 } finally { 275 stream.close(); 276 } 277 return handler.toString(); 278 } 279 280 /** 281 * Parses the given document and returns the extracted text content. 282 * The given input stream is closed by this method. 283 * <p> 284 * To avoid unpredictable excess memory use, the returned string contains 285 * only up to {@link #getMaxStringLength()} first characters extracted 286 * from the input document. Use the {@link #setMaxStringLength(int)} 287 * method to adjust this limitation. 288 * 289 * @param stream the document to be parsed 290 * @return extracted text content 291 * @throws IOException if the document can not be read 292 * @throws TikaException if the document can not be parsed 293 */ 294 public String parseToString(InputStream stream) 295 throws IOException, TikaException { 296 return parseToString(stream, new Metadata()); 297 } 298 299 /** 300 * Parses the given file and returns the extracted text content. 301 * <p> 302 * To avoid unpredictable excess memory use, the returned string contains 303 * only up to {@link #getMaxStringLength()} first characters extracted 304 * from the input document. Use the {@link #setMaxStringLength(int)} 305 * method to adjust this limitation. 306 * 307 * @param file the file to be parsed 308 * @return extracted text content 309 * @throws IOException if the file can not be read 310 * @throws TikaException if the file can not be parsed 311 */ 312 public String parseToString(File file) throws IOException, TikaException { 313 return parseToString(file.toURI().toURL()); 314 } 315 316 /** 317 * Parses the resource at the given URL and returns the extracted 318 * text content. 319 * <p> 320 * To avoid unpredictable excess memory use, the returned string contains 321 * only up to {@link #getMaxStringLength()} first characters extracted 322 * from the input document. Use the {@link #setMaxStringLength(int)} 323 * method to adjust this limitation. 324 * 325 * @param url the URL of the resource to be parsed 326 * @return extracted text content 327 * @throws IOException if the resource can not be read 328 * @throws TikaException if the resource can not be parsed 329 */ 330 public String parseToString(URL url) throws IOException, TikaException { 331 Metadata metadata = new Metadata(); 332 InputStream stream = MetadataHelper.getInputStream(url, metadata); 333 return parseToString(stream, metadata); 334 } 335 336 /** 337 * Returns the maximum length of strings returned by the 338 * parseToString methods. 339 * 340 * @since Apache Tika 0.7 341 * @return maximum string length, or -1 if the limit has been disabled 342 */ 343 public int getMaxStringLength() { 344 return maxStringLength; 345 } 346 347 /** 348 * Sets the maximum length of strings returned by the parseToString 349 * methods. 350 * 351 * @since Apache Tika 0.7 352 * @param maxStringLength maximum string length, 353 * or -1 to disable this limit 354 */ 355 public void setMaxStringLength(int maxStringLength) { 356 this.maxStringLength = maxStringLength; 357 } 358 359 }