View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika;
18  
19  import java.io.BufferedInputStream;
20  import java.io.File;
21  import java.io.IOException;
22  import java.io.InputStream;
23  import java.io.Reader;
24  import java.net.URL;
25  
26  import org.apache.tika.config.TikaConfig;
27  import org.apache.tika.detect.Detector;
28  import org.apache.tika.exception.TikaException;
29  import org.apache.tika.metadata.Metadata;
30  import org.apache.tika.metadata.MetadataHelper;
31  import org.apache.tika.parser.AutoDetectParser;
32  import org.apache.tika.parser.ParseContext;
33  import org.apache.tika.parser.Parser;
34  import org.apache.tika.parser.ParsingReader;
35  import org.apache.tika.sax.BodyContentHandler;
36  import org.apache.tika.sax.WriteOutContentHandler;
37  import org.xml.sax.SAXException;
38  
39  /**
40   * Facade class for accessing Tika functionality. This class hides much of
41   * the underlying complexity of the lower level Tika classes and provides
42   * simple methods for many common parsing and type detection operations.
43   *
44   * @since Apache Tika 0.5
45   * @see Parser
46   * @see Detector
47   */
48  public class Tika {
49  
50      /**
51       * The detector instance used by this facade.
52       */
53      private final Detector detector;
54  
55      /**
56       * The parser instance used by this facade.
57       */
58      private final Parser parser;
59  
60      /**
61       * Maximum length of the strings returned by the parseToString methods.
62       * Used to prevent out of memory problems with huge input documents.
63       * The default setting is 100k characters.
64       */
65      private int maxStringLength = 100 * 1000;
66  
67      /**
68       * Creates a Tika facade using the given configuration.
69       *
70       * @param config Tika configuration
71       */
72      public Tika(TikaConfig config) {
73          this.detector = config.getMimeRepository();
74          this.parser = new AutoDetectParser(config);
75      }
76  
77      /**
78       * Creates a Tika facade using the default configuration.
79       */
80      public Tika() {
81          this(TikaConfig.getDefaultConfig());
82      }
83  
84      /**
85       * Detects the media type of the given document. The type detection is
86       * based on the content of the given document stream and any given
87       * document metadata. The document stream can be <code>null</code>,
88       * in which case only the given document metadata is used for type
89       * detection.
90       * <p>
91       * If the document stream supports the
92       * {@link InputStream#markSupported() mark feature}, then the stream is
93       * marked and reset to the original position before this method returns.
94       * Only a limited number of bytes are read from the stream.
95       * <p>
96       * The given document stream is <em>not</em> closed by this method.
97       * <p>
98       * Unlike in the {@link #parse(InputStream, Metadata)} method, the
99       * given document metadata is <em>not</em> modified by this method.
100      *
101      * @param stream the document stream, or <code>null</code>
102      * @param metadata document metadata
103      * @return detected media type
104      * @throws IOException if the stream can not be read
105      */
106     public String detect(InputStream stream, Metadata metadata)
107             throws IOException {
108         if (stream == null || stream.markSupported()) {
109             return detector.detect(stream, metadata).toString();
110         } else {
111             return detector.detect(
112                     new BufferedInputStream(stream), metadata).toString();
113         }
114     }
115 
116     /**
117      * Detects the media type of the given document. The type detection is
118      * based on the content of the given document stream.
119      * <p>
120      * If the document stream supports the
121      * {@link InputStream#markSupported() mark feature}, then the stream is
122      * marked and reset to the original position before this method returns.
123      * Only a limited number of bytes are read from the stream.
124      * <p>
125      * The given document stream is <em>not</em> closed by this method.
126      *
127      * @param stream the document stream
128      * @return detected media type
129      * @throws IOException if the stream can not be read
130      */
131     public String detect(InputStream stream) throws IOException {
132         return detect(stream, new Metadata());
133     }
134 
135     /**
136      * Detects the media type of the given file. The type detection is
137      * based on the document content and a potential known file extension.
138      * <p>
139      * Use the {@link #detect(String)} method when you want to detect the
140      * type of the document without actually accessing the file.
141      *
142      * @param file the file
143      * @return detected media type
144      * @throws IOException if the file can not be read
145      */
146     public String detect(File file) throws IOException {
147         return detect(file.toURI().toURL());
148     }
149 
150     /**
151      * Detects the media type of the resource at the given URL. The type
152      * detection is based on the document content and a potential known
153      * file extension included in the URL.
154      * <p>
155      * Use the {@link #detect(String)} method when you want to detect the
156      * type of the document without actually accessing the URL.
157      *
158      * @param url the URL of the resource
159      * @return detected media type
160      * @throws IOException if the resource can not be read
161      */
162     public String detect(URL url) throws IOException {
163         Metadata metadata = new Metadata();
164         InputStream stream = MetadataHelper.getInputStream(url, metadata);
165         try {
166             return detect(stream, metadata);
167         } finally {
168             stream.close();
169         }
170     }
171 
172     /**
173      * Detects the media type of a document with the given file name.
174      * The type detection is based on known file name extensions.
175      * <p>
176      * The given name can also be a URL or a full file path. In such cases
177      * only the file name part of the string is used for type detection. 
178      *
179      * @param name the file name of the document
180      * @return detected media type
181      */
182     public String detect(String name) {
183         Metadata metadata = new Metadata();
184         metadata.set(Metadata.RESOURCE_NAME_KEY, name);
185         try {
186             return detect(null, metadata);
187         } catch (IOException e) {
188             throw new IllegalStateException("Unexpected IOException", e);
189         }
190     }
191 
192     /**
193      * Parses the given document and returns the extracted text content.
194      * Input metadata like a file name or a content type hint can be passed
195      * in the given metadata instance. Metadata information extracted from
196      * the document is returned in that same metadata instance.
197      *
198      * @param stream the document to be parsed
199      * @return extracted text content
200      * @throws IOException if the document can not be read or parsed
201      */
202     public Reader parse(InputStream stream, Metadata metadata)
203             throws IOException {
204         ParseContext context = new ParseContext();
205         context.set(Parser.class, parser);
206         return new ParsingReader(parser, stream, metadata, context);
207     }
208 
209     /**
210      * Parses the given document and returns the extracted text content.
211      *
212      * @param stream the document to be parsed
213      * @return extracted text content
214      * @throws IOException if the document can not be read or parsed
215      */
216     public Reader parse(InputStream stream) throws IOException {
217         return parse(stream, new Metadata());
218     }
219 
220     /**
221      * Parses the given file and returns the extracted text content.
222      *
223      * @param file the file to be parsed
224      * @return extracted text content
225      * @throws IOException if the file can not be read or parsed
226      */
227     public Reader parse(File file) throws IOException {
228         return parse(file.toURI().toURL());
229     }
230 
231     /**
232      * Parses the resource at the given URL and returns the extracted
233      * text content.
234      *
235      * @param url the URL of the resource to be parsed
236      * @return extracted text content
237      * @throws IOException if the resource can not be read or parsed
238      */
239     public Reader parse(URL url) throws IOException {
240         Metadata metadata = new Metadata();
241         InputStream stream = MetadataHelper.getInputStream(url, metadata);
242         return parse(stream, metadata);
243     }
244 
245     /**
246      * Parses the given document and returns the extracted text content.
247      * The given input stream is closed by this method.
248      * <p>
249      * To avoid unpredictable excess memory use, the returned string contains
250      * only up to {@link #getMaxStringLength()} first characters extracted
251      * from the input document. Use the {@link #setMaxStringLength(int)}
252      * method to adjust this limitation.
253      *
254      * @param stream the document to be parsed
255      * @param metadata document metadata
256      * @return extracted text content
257      * @throws IOException if the document can not be read
258      * @throws TikaException if the document can not be parsed
259      */
260     public String parseToString(InputStream stream, Metadata metadata)
261             throws IOException, TikaException {
262         WriteOutContentHandler handler =
263             new WriteOutContentHandler(maxStringLength);
264         try {
265             ParseContext context = new ParseContext();
266             context.set(Parser.class, parser);
267             parser.parse(
268                     stream, new BodyContentHandler(handler), metadata, context);
269         } catch (SAXException e) {
270             if (!handler.isWriteLimitReached(e)) {
271                 // This should never happen with BodyContentHandler...
272                 throw new TikaException("Unexpected SAX processing failure", e);
273             }
274         } finally {
275             stream.close();
276         }
277         return handler.toString();
278     }
279 
280     /**
281      * Parses the given document and returns the extracted text content.
282      * The given input stream is closed by this method.
283      * <p>
284      * To avoid unpredictable excess memory use, the returned string contains
285      * only up to {@link #getMaxStringLength()} first characters extracted
286      * from the input document. Use the {@link #setMaxStringLength(int)}
287      * method to adjust this limitation.
288      *
289      * @param stream the document to be parsed
290      * @return extracted text content
291      * @throws IOException if the document can not be read
292      * @throws TikaException if the document can not be parsed
293      */
294     public String parseToString(InputStream stream)
295             throws IOException, TikaException {
296         return parseToString(stream, new Metadata());
297     }
298 
299     /**
300      * Parses the given file and returns the extracted text content.
301      * <p>
302      * To avoid unpredictable excess memory use, the returned string contains
303      * only up to {@link #getMaxStringLength()} first characters extracted
304      * from the input document. Use the {@link #setMaxStringLength(int)}
305      * method to adjust this limitation.
306      *
307      * @param file the file to be parsed
308      * @return extracted text content
309      * @throws IOException if the file can not be read
310      * @throws TikaException if the file can not be parsed
311      */
312     public String parseToString(File file) throws IOException, TikaException {
313         return parseToString(file.toURI().toURL());
314     }
315 
316     /**
317      * Parses the resource at the given URL and returns the extracted
318      * text content.
319      * <p>
320      * To avoid unpredictable excess memory use, the returned string contains
321      * only up to {@link #getMaxStringLength()} first characters extracted
322      * from the input document. Use the {@link #setMaxStringLength(int)}
323      * method to adjust this limitation.
324      *
325      * @param url the URL of the resource to be parsed
326      * @return extracted text content
327      * @throws IOException if the resource can not be read
328      * @throws TikaException if the resource can not be parsed
329      */
330     public String parseToString(URL url) throws IOException, TikaException {
331         Metadata metadata = new Metadata();
332         InputStream stream = MetadataHelper.getInputStream(url, metadata);
333         return parseToString(stream, metadata);
334     }
335 
336     /**
337      * Returns the maximum length of strings returned by the
338      * parseToString methods.
339      *
340      * @since Apache Tika 0.7
341      * @return maximum string length, or -1 if the limit has been disabled
342      */
343     public int getMaxStringLength() {
344         return maxStringLength;
345     }
346 
347     /**
348      * Sets the maximum length of strings returned by the parseToString
349      * methods.
350      *
351      * @since Apache Tika 0.7
352      * @param maxStringLength maximum string length,
353      *                        or -1 to disable this limit
354      */
355     public void setMaxStringLength(int maxStringLength) {
356         this.maxStringLength = maxStringLength;
357     }
358 
359 }