1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.config;
18
19 import java.io.File;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.net.URL;
23 import java.util.HashMap;
24 import java.util.Iterator;
25 import java.util.Map;
26
27 import javax.imageio.spi.ServiceRegistry;
28 import javax.xml.parsers.DocumentBuilder;
29 import javax.xml.parsers.DocumentBuilderFactory;
30 import javax.xml.parsers.ParserConfigurationException;
31
32 import org.apache.tika.exception.TikaException;
33 import org.apache.tika.mime.MediaType;
34 import org.apache.tika.mime.MimeTypeException;
35 import org.apache.tika.mime.MimeTypes;
36 import org.apache.tika.mime.MimeTypesFactory;
37 import org.apache.tika.parser.ParseContext;
38 import org.apache.tika.parser.Parser;
39 import org.w3c.dom.Document;
40 import org.w3c.dom.Element;
41 import org.w3c.dom.Node;
42 import org.w3c.dom.NodeList;
43 import org.xml.sax.SAXException;
44
45
46
47
48 public class TikaConfig {
49
50 private final Map<String, Parser> parsers = new HashMap<String, Parser>();
51
52 private final MimeTypes mimeTypes;
53
54 public TikaConfig(String file)
55 throws TikaException, IOException, SAXException {
56 this(new File(file));
57 }
58
59 public TikaConfig(File file)
60 throws TikaException, IOException, SAXException {
61 this(getBuilder().parse(file));
62 }
63
64 public TikaConfig(URL url)
65 throws TikaException, IOException, SAXException {
66 this(getBuilder().parse(url.toString()));
67 }
68
69 public TikaConfig(InputStream stream)
70 throws TikaException, IOException, SAXException {
71 this(getBuilder().parse(stream));
72 }
73
74
75
76
77
78 public TikaConfig(InputStream stream, Parser delegate)
79 throws TikaException, IOException, SAXException {
80 this(stream);
81 }
82
83 public TikaConfig(Document document) throws TikaException, IOException {
84 this(document.getDocumentElement());
85 }
86
87
88
89
90
91 public TikaConfig(Document document, Parser delegate)
92 throws TikaException, IOException {
93 this(document);
94 }
95
96 public TikaConfig(Element element) throws TikaException, IOException {
97 Element mtr = getChild(element, "mimeTypeRepository");
98 if (mtr != null && mtr.hasAttribute("resource")) {
99 mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
100 } else {
101 mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
102 }
103
104 NodeList nodes = element.getElementsByTagName("parser");
105 for (int i = 0; i < nodes.getLength(); i++) {
106 Element node = (Element) nodes.item(i);
107 String name = node.getAttribute("class");
108
109 try {
110 Class<?> parserClass = Class.forName(name);
111 Object instance = parserClass.newInstance();
112 if (!(instance instanceof Parser)) {
113 throw new TikaException(
114 "Configured class is not a Tika Parser: " + name);
115 }
116 Parser parser = (Parser) instance;
117
118 NodeList mimes = node.getElementsByTagName("mime");
119 if (mimes.getLength() > 0) {
120 for (int j = 0; j < mimes.getLength(); j++) {
121 parsers.put(getText(mimes.item(j)).trim(), parser);
122 }
123 } else {
124 ParseContext context = new ParseContext();
125 for (MediaType type : parser.getSupportedTypes(context)) {
126 parsers.put(type.toString(), parser);
127 }
128 }
129 } catch (ClassNotFoundException e) {
130 throw new TikaException(
131 "Configured parser class not found: " + name, e);
132 } catch (IllegalAccessException e) {
133 throw new TikaException(
134 "Unable to access a parser class: " + name, e);
135 } catch (InstantiationException e) {
136 throw new TikaException(
137 "Unable to instantiate a parser class: " + name, e);
138 }
139 }
140 }
141
142 public TikaConfig() throws MimeTypeException, IOException {
143 ParseContext context = new ParseContext();
144 Iterator<Parser> iterator =
145 ServiceRegistry.lookupProviders(Parser.class);
146 while (iterator.hasNext()) {
147 Parser parser = iterator.next();
148 for (MediaType type : parser.getSupportedTypes(context)) {
149 parsers.put(type.toString(), parser);
150 }
151 }
152 mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
153 }
154
155
156
157
158
159 public TikaConfig(Element element, Parser delegate)
160 throws TikaException, IOException {
161 this(element);
162 }
163
164 private String getText(Node node) {
165 if (node.getNodeType() == Node.TEXT_NODE) {
166 return node.getNodeValue();
167 } else if (node.getNodeType() == Node.ELEMENT_NODE) {
168 StringBuilder builder = new StringBuilder();
169 NodeList list = node.getChildNodes();
170 for (int i = 0; i < list.getLength(); i++) {
171 builder.append(getText(list.item(i)));
172 }
173 return builder.toString();
174 } else {
175 return "";
176 }
177 }
178
179
180
181
182
183
184
185
186 public Parser getParser(String mimeType) {
187 return parsers.get(mimeType);
188 }
189
190 public Map<String, Parser> getParsers() {
191 return parsers;
192 }
193
194 public MimeTypes getMimeRepository(){
195 return mimeTypes;
196 }
197
198
199
200
201
202
203
204
205 public static TikaConfig getDefaultConfig() {
206 try {
207 return new TikaConfig();
208 } catch (IOException e) {
209 throw new RuntimeException(
210 "Unable to read default configuration", e);
211 } catch (TikaException e) {
212 throw new RuntimeException(
213 "Unable to access default configuration", e);
214 }
215 }
216
217
218
219
220
221 public static TikaConfig getDefaultConfig(Parser delegate)
222 throws TikaException {
223 return getDefaultConfig();
224 }
225
226 private static DocumentBuilder getBuilder() throws TikaException {
227 try {
228 return DocumentBuilderFactory.newInstance().newDocumentBuilder();
229 } catch (ParserConfigurationException e) {
230 throw new TikaException("XML parser not available", e);
231 }
232 }
233
234 private static Element getChild(Element element, String name) {
235 Node child = element.getFirstChild();
236 while (child != null) {
237 if (child.getNodeType() == Node.ELEMENT_NODE
238 && name.equals(child.getNodeName())) {
239 return (Element) child;
240 }
241 child = child.getNextSibling();
242 }
243 return null;
244 }
245
246 }