View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.utils;
18  
19  //JDK imports
20  import java.io.BufferedInputStream;
21  import java.io.File;
22  import java.io.FileInputStream;
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.net.URL;
26  
27  import org.apache.tika.config.TikaConfig;
28  import org.apache.tika.exception.TikaException;
29  import org.apache.tika.metadata.Metadata;
30  import org.apache.tika.metadata.TikaMimeKeys;
31  import org.apache.tika.parser.Parser;
32  import org.apache.tika.sax.BodyContentHandler;
33  import org.xml.sax.ContentHandler;
34  import org.xml.sax.SAXException;
35  
36  /**
37   * Contains utility methods for parsing documents. Intended to provide simple
38   * entry points into the Tika framework.
39   */
40  public class ParseUtils implements TikaMimeKeys {
41  
42      /**
43       * Returns a parser that can handle the specified MIME type, and is set to
44       * receive input from a stream opened from the specified URL. NB: Close the
45       * input stream when it is no longer needed!
46       * 
47       * @param config
48       * @param mimeType
49       *            the document's MIME type
50       * @return a parser appropriate to this MIME type
51       * @throws TikaException
52       */
53      public static Parser getParser(String mimeType, TikaConfig config)
54              throws TikaException {
55          return config.getParser(mimeType);
56      }
57  
58      /**
59       * Returns a parser that can handle the specified MIME type, and is set to
60       * receive input from a stream opened from the specified URL. The MIME type
61       * is determined automatically. NB: Close the input stream when it is no
62       * longer needed!
63       * 
64       * @param documentUrl
65       *            URL pointing to the document to parse
66       * @param config
67       * @return a parser appropriate to this MIME type and ready to read input
68       *         from the specified document
69       * @throws TikaException
70       */
71      public static Parser getParser(URL documentUrl, TikaConfig config)
72              throws TikaException {
73          String mimetype = config.getMimeRepository().getMimeType(documentUrl)
74          .getName();
75          return getParser(mimetype, config);
76      }
77  
78      /**
79       * Returns a parser that can handle the specified MIME type, and is set to
80       * receive input from a stream opened from the specified URL. NB: Close the
81       * input stream when it is no longer needed!
82       * 
83       * @param documentFile
84       *            File object pointing to the document to parse
85       * @param config
86       * @return a parser appropriate to this MIME type and ready to read input
87       *         from the specified document
88       * @throws TikaException
89       */
90      public static Parser getParser(File documentFile, TikaConfig config)
91              throws TikaException {
92          String mimetype = config.getMimeRepository().getMimeType(documentFile)
93          .getName();
94          return getParser(mimetype, config);
95      }
96  
97      /**
98       * Gets the string content of a document read from an input stream.
99       * 
100      * @param stream the stream from which to read document data
101      * @param config
102      * @param mimeType MIME type of the data
103      * @return the string content parsed from the document
104      */
105     public static String getStringContent(
106             InputStream stream, TikaConfig config, String mimeType)
107             throws TikaException, IOException {
108         try {
109             Parser parser = config.getParser(mimeType);
110             ContentHandler handler = new BodyContentHandler();
111             parser.parse(stream, handler, new Metadata());
112             return handler.toString();
113         } catch (SAXException e) {
114             throw new TikaException("Unexpected SAX error", e);
115         }
116     }
117 
118     /**
119      * Gets the string content of a document read from an input stream.
120      * 
121      * @param documentUrl
122      *            URL pointing to the document to parse
123      * @param config
124      * @return the string content parsed from the document
125      */
126     public static String getStringContent(URL documentUrl, TikaConfig config)
127             throws TikaException, IOException {
128         String mime = config.getMimeRepository().getMimeType(documentUrl)
129         .getName();
130         return getStringContent(documentUrl, config, mime);
131     }
132 
133     /**
134      * Gets the string content of a document read from an input stream.
135      * 
136      * @param documentUrl
137      *            URL pointing to the document to parse
138      * @param config
139      * @param mimeType
140      *            MIME type of the data
141      * @return the string content parsed from the document
142      */
143     public static String getStringContent(
144             URL documentUrl, TikaConfig config, String mimeType)
145             throws TikaException, IOException {
146         InputStream stream = documentUrl.openStream();
147         try {
148             return getStringContent(stream, config, mimeType);
149         } finally {
150             stream.close();
151         }
152     }
153 
154     /**
155      * Gets the string content of a document read from an input stream.
156      * 
157      * @param documentFile
158      *            File object pointing to the document to parse
159      * @param config
160      * @param mimeType
161      *            MIME type of the data
162      * @return the string content parsed from the document
163      */
164     public static String getStringContent(
165             File documentFile, TikaConfig config, String mimeType)
166             throws TikaException, IOException {
167         InputStream stream = new BufferedInputStream(new FileInputStream(
168                 documentFile));
169         try {
170             return getStringContent(stream, config, mimeType);
171         } finally {
172             stream.close();
173         }
174     }
175 
176     /**
177      * Gets the string content of a document read from an input stream.
178      * 
179      * @param documentFile
180      *            File object pointing to the document to parse
181      * @param config
182      * @return the string content parsed from the document
183      */
184     public static String getStringContent(File documentFile, TikaConfig config)
185             throws TikaException, IOException {
186         String mime =
187             config.getMimeRepository().getMimeType(documentFile).getName();
188         return getStringContent(documentFile, config, mime);
189     }
190 
191 }