View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.config;
18  
19  import java.io.File;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.net.URL;
23  import java.util.HashMap;
24  import java.util.Iterator;
25  import java.util.Map;
26  
27  import javax.imageio.spi.ServiceRegistry;
28  import javax.xml.parsers.DocumentBuilder;
29  import javax.xml.parsers.DocumentBuilderFactory;
30  import javax.xml.parsers.ParserConfigurationException;
31  
32  import org.apache.tika.exception.TikaException;
33  import org.apache.tika.mime.MediaType;
34  import org.apache.tika.mime.MimeTypeException;
35  import org.apache.tika.mime.MimeTypes;
36  import org.apache.tika.mime.MimeTypesFactory;
37  import org.apache.tika.parser.ParseContext;
38  import org.apache.tika.parser.Parser;
39  import org.w3c.dom.Document;
40  import org.w3c.dom.Element;
41  import org.w3c.dom.Node;
42  import org.w3c.dom.NodeList;
43  import org.xml.sax.SAXException;
44  
45  /**
46   * Parse xml config file.
47   */
48  public class TikaConfig {
49  
50      private final Map<String, Parser> parsers = new HashMap<String, Parser>();
51  
52      private final MimeTypes mimeTypes;
53  
54      public TikaConfig(String file)
55              throws TikaException, IOException, SAXException {
56          this(new File(file));
57      }
58  
59      public TikaConfig(File file)
60              throws TikaException, IOException, SAXException {
61          this(getBuilder().parse(file));
62      }
63  
64      public TikaConfig(URL url)
65              throws TikaException, IOException, SAXException {
66          this(getBuilder().parse(url.toString()));
67      }
68  
69      public TikaConfig(InputStream stream)
70              throws TikaException, IOException, SAXException {
71          this(getBuilder().parse(stream));
72      }
73  
74      /**
75       * @deprecated This method will be removed in Apache Tika 1.0
76       * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
77       */
78      public TikaConfig(InputStream stream, Parser delegate)
79              throws TikaException, IOException, SAXException {
80          this(stream);
81      }
82  
83      public TikaConfig(Document document) throws TikaException, IOException {
84          this(document.getDocumentElement());
85      }
86  
87      /**
88       * @deprecated This method will be removed in Apache Tika 1.0
89       * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
90       */
91      public TikaConfig(Document document, Parser delegate)
92              throws TikaException, IOException {
93          this(document);
94      }
95  
96      public TikaConfig(Element element) throws TikaException, IOException {
97          Element mtr = getChild(element, "mimeTypeRepository");
98          if (mtr != null && mtr.hasAttribute("resource")) {
99              mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
100         } else {
101             mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
102         }
103 
104         NodeList nodes = element.getElementsByTagName("parser");
105         for (int i = 0; i < nodes.getLength(); i++) {
106             Element node = (Element) nodes.item(i);
107             String name = node.getAttribute("class");
108 
109             try {
110                 Class<?> parserClass = Class.forName(name);
111                 Object instance = parserClass.newInstance();
112                 if (!(instance instanceof Parser)) {
113                     throw new TikaException(
114                             "Configured class is not a Tika Parser: " + name);
115                 }
116                 Parser parser = (Parser) instance;
117 
118                 NodeList mimes = node.getElementsByTagName("mime");
119                 if (mimes.getLength() > 0) {
120                     for (int j = 0; j < mimes.getLength(); j++) {
121                         parsers.put(getText(mimes.item(j)).trim(), parser);
122                     }
123                 } else {
124                     ParseContext context = new ParseContext();
125                     for (MediaType type : parser.getSupportedTypes(context)) {
126                         parsers.put(type.toString(), parser);
127                     }
128                 }
129             } catch (ClassNotFoundException e) {
130                 throw new TikaException(
131                         "Configured parser class not found: " + name, e);
132             } catch (IllegalAccessException e) {
133                 throw new TikaException(
134                         "Unable to access a parser class: " + name, e);
135             } catch (InstantiationException e) {
136                 throw new TikaException(
137                         "Unable to instantiate a parser class: " + name, e);
138             }
139         }
140     }
141 
142     public TikaConfig() throws MimeTypeException, IOException {
143         ParseContext context = new ParseContext();
144         Iterator<Parser> iterator =
145             ServiceRegistry.lookupProviders(Parser.class);
146         while (iterator.hasNext()) {
147             Parser parser = iterator.next();
148             for (MediaType type : parser.getSupportedTypes(context)) {
149                 parsers.put(type.toString(), parser);
150             }
151         }
152         mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
153     }
154 
155     /**
156      * @deprecated This method will be removed in Apache Tika 1.0
157      * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
158      */
159     public TikaConfig(Element element, Parser delegate)
160             throws TikaException, IOException {
161         this(element);
162     }
163 
164     private String getText(Node node) {
165         if (node.getNodeType() == Node.TEXT_NODE) {
166             return node.getNodeValue();
167         } else if (node.getNodeType() == Node.ELEMENT_NODE) {
168             StringBuilder builder = new StringBuilder();
169             NodeList list = node.getChildNodes();
170             for (int i = 0; i < list.getLength(); i++) {
171                 builder.append(getText(list.item(i)));
172             }
173             return builder.toString();
174         } else {
175             return "";
176         }
177     }
178 
179     /**
180      * Returns the parser instance configured for the given MIME type.
181      * Returns <code>null</code> if the given MIME type is unknown.
182      *
183      * @param mimeType MIME type
184      * @return configured Parser instance, or <code>null</code>
185      */
186     public Parser getParser(String mimeType) {
187         return parsers.get(mimeType);
188     }
189 
190     public Map<String, Parser> getParsers() {
191         return parsers;
192     }
193 
194     public MimeTypes getMimeRepository(){
195         return mimeTypes;
196     }
197 
198     /**
199      * Provides a default configuration (TikaConfig).  Currently creates a
200      * new instance each time it's called; we may be able to have it
201      * return a shared instance once it is completely immutable.
202      *
203      * @return default configuration
204      */
205     public static TikaConfig getDefaultConfig() {
206         try {
207             return new TikaConfig();
208         } catch (IOException e) {
209             throw new RuntimeException(
210                     "Unable to read default configuration", e);
211         } catch (TikaException e) {
212             throw new RuntimeException(
213                     "Unable to access default configuration", e);
214         }
215     }
216 
217     /**
218      * @deprecated This method will be removed in Apache Tika 1.0
219      * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
220      */
221     public static TikaConfig getDefaultConfig(Parser delegate)
222             throws TikaException {
223         return getDefaultConfig();
224     }
225 
226     private static DocumentBuilder getBuilder() throws TikaException {
227         try {
228             return DocumentBuilderFactory.newInstance().newDocumentBuilder();
229         } catch (ParserConfigurationException e) {
230             throw new TikaException("XML parser not available", e);
231         }
232     }
233 
234     private static Element getChild(Element element, String name) {
235         Node child = element.getFirstChild();
236         while (child != null) {
237             if (child.getNodeType() == Node.ELEMENT_NODE
238                     && name.equals(child.getNodeName())) {
239                 return (Element) child;
240             }
241             child = child.getNextSibling();
242         }
243         return null;
244     }
245 
246 }