1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.tika.utils;
18
19 //JDK imports
20 import java.io.BufferedInputStream;
21 import java.io.File;
22 import java.io.FileInputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.net.URL;
26
27 import org.apache.tika.config.TikaConfig;
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.metadata.Metadata;
30 import org.apache.tika.metadata.TikaMimeKeys;
31 import org.apache.tika.parser.Parser;
32 import org.apache.tika.sax.BodyContentHandler;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35
36 /**
37 * Contains utility methods for parsing documents. Intended to provide simple
38 * entry points into the Tika framework.
39 */
40 public class ParseUtils implements TikaMimeKeys {
41
42 /**
43 * Returns a parser that can handle the specified MIME type, and is set to
44 * receive input from a stream opened from the specified URL. NB: Close the
45 * input stream when it is no longer needed!
46 *
47 * @param config
48 * @param mimeType
49 * the document's MIME type
50 * @return a parser appropriate to this MIME type
51 * @throws TikaException
52 */
53 public static Parser getParser(String mimeType, TikaConfig config)
54 throws TikaException {
55 return config.getParser(mimeType);
56 }
57
58 /**
59 * Returns a parser that can handle the specified MIME type, and is set to
60 * receive input from a stream opened from the specified URL. The MIME type
61 * is determined automatically. NB: Close the input stream when it is no
62 * longer needed!
63 *
64 * @param documentUrl
65 * URL pointing to the document to parse
66 * @param config
67 * @return a parser appropriate to this MIME type and ready to read input
68 * from the specified document
69 * @throws TikaException
70 */
71 public static Parser getParser(URL documentUrl, TikaConfig config)
72 throws TikaException {
73 String mimetype = config.getMimeRepository().getMimeType(documentUrl)
74 .getName();
75 return getParser(mimetype, config);
76 }
77
78 /**
79 * Returns a parser that can handle the specified MIME type, and is set to
80 * receive input from a stream opened from the specified URL. NB: Close the
81 * input stream when it is no longer needed!
82 *
83 * @param documentFile
84 * File object pointing to the document to parse
85 * @param config
86 * @return a parser appropriate to this MIME type and ready to read input
87 * from the specified document
88 * @throws TikaException
89 */
90 public static Parser getParser(File documentFile, TikaConfig config)
91 throws TikaException {
92 String mimetype = config.getMimeRepository().getMimeType(documentFile)
93 .getName();
94 return getParser(mimetype, config);
95 }
96
97 /**
98 * Gets the string content of a document read from an input stream.
99 *
100 * @param stream the stream from which to read document data
101 * @param config
102 * @param mimeType MIME type of the data
103 * @return the string content parsed from the document
104 */
105 public static String getStringContent(
106 InputStream stream, TikaConfig config, String mimeType)
107 throws TikaException, IOException {
108 try {
109 Parser parser = config.getParser(mimeType);
110 ContentHandler handler = new BodyContentHandler();
111 parser.parse(stream, handler, new Metadata());
112 return handler.toString();
113 } catch (SAXException e) {
114 throw new TikaException("Unexpected SAX error", e);
115 }
116 }
117
118 /**
119 * Gets the string content of a document read from an input stream.
120 *
121 * @param documentUrl
122 * URL pointing to the document to parse
123 * @param config
124 * @return the string content parsed from the document
125 */
126 public static String getStringContent(URL documentUrl, TikaConfig config)
127 throws TikaException, IOException {
128 String mime = config.getMimeRepository().getMimeType(documentUrl)
129 .getName();
130 return getStringContent(documentUrl, config, mime);
131 }
132
133 /**
134 * Gets the string content of a document read from an input stream.
135 *
136 * @param documentUrl
137 * URL pointing to the document to parse
138 * @param config
139 * @param mimeType
140 * MIME type of the data
141 * @return the string content parsed from the document
142 */
143 public static String getStringContent(
144 URL documentUrl, TikaConfig config, String mimeType)
145 throws TikaException, IOException {
146 InputStream stream = documentUrl.openStream();
147 try {
148 return getStringContent(stream, config, mimeType);
149 } finally {
150 stream.close();
151 }
152 }
153
154 /**
155 * Gets the string content of a document read from an input stream.
156 *
157 * @param documentFile
158 * File object pointing to the document to parse
159 * @param config
160 * @param mimeType
161 * MIME type of the data
162 * @return the string content parsed from the document
163 */
164 public static String getStringContent(
165 File documentFile, TikaConfig config, String mimeType)
166 throws TikaException, IOException {
167 InputStream stream = new BufferedInputStream(new FileInputStream(
168 documentFile));
169 try {
170 return getStringContent(stream, config, mimeType);
171 } finally {
172 stream.close();
173 }
174 }
175
176 /**
177 * Gets the string content of a document read from an input stream.
178 *
179 * @param documentFile
180 * File object pointing to the document to parse
181 * @param config
182 * @return the string content parsed from the document
183 */
184 public static String getStringContent(File documentFile, TikaConfig config)
185 throws TikaException, IOException {
186 String mime =
187 config.getMimeRepository().getMimeType(documentFile).getName();
188 return getStringContent(documentFile, config, mime);
189 }
190
191 }