View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.mime;
18  
19  import org.apache.tika.detect.MagicDetector;
20  import org.w3c.dom.Attr;
21  import org.w3c.dom.Node;
22  import org.w3c.dom.Element;
23  import org.w3c.dom.Document;
24  import org.w3c.dom.NodeList;
25  import org.w3c.dom.NamedNodeMap;
26  import org.xml.sax.InputSource;
27  import org.xml.sax.SAXException;
28  
29  import java.io.ByteArrayOutputStream;
30  import java.io.IOException;
31  import java.io.InputStream;
32  import java.util.ArrayList;
33  import java.util.List;
34  
35  import javax.xml.parsers.DocumentBuilder;
36  import javax.xml.parsers.DocumentBuilderFactory;
37  import javax.xml.parsers.ParserConfigurationException;
38  
39  /**
40   * A reader for XML files compliant with the freedesktop MIME-info DTD.
41   * 
42   * <pre>
43   *  &lt;!DOCTYPE mime-info [
44   *    &lt;!ELEMENT mime-info (mime-type)+&gt;
45   *    &lt;!ATTLIST mime-info xmlns CDATA #FIXED &quot;http://www.freedesktop.org/standards/shared-mime-info&quot;&gt;
46   * 
47   *    &lt;!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*&gt;
48   *    &lt;!ATTLIST mime-type type CDATA #REQUIRED&gt;
49   * 
50   *    &lt;!-- a comment describing a document with the respective MIME type. Example: &quot;WMV video&quot; --&gt;
51   *    &lt;!ELEMENT comment (#PCDATA)&gt;
52   *    &lt;!ATTLIST comment xml:lang CDATA #IMPLIED&gt;
53   * 
54   *    &lt;!-- a comment describing a the respective unexpanded MIME type acronym. Example: &quot;WMV&quot; --&gt;
55   *    &lt;!ELEMENT acronym (#PCDATA)&gt;
56   *    &lt;!ATTLIST acronym xml:lang CDATA #IMPLIED&gt;
57   * 
58   *    &lt;!-- a comment describing a the respective unexpanded MIME type acronym. Example: &quot;Windows Media Video&quot; --&gt;
59   *    &lt;!ELEMENT expanded-acronym (#PCDATA)&gt;
60   *    &lt;!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED&gt;
61   * 
62   *    &lt;!ELEMENT glob EMPTY&gt;
63   *    &lt;!ATTLIST glob pattern CDATA #REQUIRED&gt;
64   *    &lt;!ATTLIST glob isregex CDATA #IMPLIED&gt;
65   * 
66   *    &lt;!ELEMENT magic (match)+&gt;
67   *    &lt;!ATTLIST magic priority CDATA #IMPLIED&gt;
68   * 
69   *    &lt;!ELEMENT match (match)*&gt;
70   *    &lt;!ATTLIST match offset CDATA #REQUIRED&gt;
71   *    &lt;!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED&gt;
72   *    &lt;!ATTLIST match value CDATA #REQUIRED&gt;
73   *    &lt;!ATTLIST match mask CDATA #IMPLIED&gt;
74   * 
75   *    &lt;!ELEMENT root-XML EMPTY&gt;
76   *    &lt;!ATTLIST root-XML
77   *          namespaceURI CDATA #REQUIRED
78   *          localName CDATA #REQUIRED&gt;
79   * 
80   *    &lt;!ELEMENT alias EMPTY&gt;
81   *    &lt;!ATTLIST alias
82   *          type CDATA #REQUIRED&gt;
83   * 
84   *   &lt;!ELEMENT sub-class-of EMPTY&gt;
85   *   &lt;!ATTLIST sub-class-of
86   *         type CDATA #REQUIRED&gt;
87   *  ]&gt;
88   * </pre>
89   * 
90   * 
91   * @see http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec
92   * 
93   */
94  final class MimeTypesReader implements MimeTypesReaderMetKeys {
95  
96      private final MimeTypes types;
97  
98      MimeTypesReader(MimeTypes types) {
99          this.types = types;
100     }
101 
102     void read(String filepath) throws IOException, MimeTypeException {
103         read(MimeTypesReader.class.getClassLoader().getResourceAsStream(filepath));
104     }
105 
106     void read(InputStream stream) throws IOException, MimeTypeException {
107         try {
108             DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
109             DocumentBuilder builder = factory.newDocumentBuilder();
110             Document document = builder.parse(new InputSource(stream));
111             read(document);
112         } catch (ParserConfigurationException e) {
113             throw new MimeTypeException("Unable to create an XML parser", e);
114         } catch (SAXException e) {
115             throw new MimeTypeException("Invalid type configuration", e);
116         }
117     }
118 
119     void read(Document document) throws MimeTypeException {
120         Element element = document.getDocumentElement();
121         if (element != null && element.getTagName().equals(MIME_INFO_TAG)) {
122             NodeList nodes = element.getChildNodes();
123             for (int i = 0; i < nodes.getLength(); i++) {
124                 Node node = nodes.item(i);
125                 if (node.getNodeType() == Node.ELEMENT_NODE) {
126                     Element child = (Element) node;
127                     if (child.getTagName().equals(MIME_TYPE_TAG)) {
128                         readMimeType(child);
129                     }
130                 }
131             }
132         } else {
133             throw new MimeTypeException(
134                     "Not a <" + MIME_INFO_TAG + "/> configuration document: "
135                     + element.getTagName());
136         }
137     }
138 
139     /** Read Element named mime-type. */
140     private void readMimeType(Element element) throws MimeTypeException {
141         String name = element.getAttribute(MIME_TYPE_TYPE_ATTR);
142         MimeType type = types.forName(name);
143 
144         NodeList nodes = element.getChildNodes();
145         for (int i = 0; i < nodes.getLength(); i++) {
146             Node node = nodes.item(i);
147             if (node.getNodeType() == Node.ELEMENT_NODE) {
148                 Element nodeElement = (Element) node;
149                 if (nodeElement.getTagName().equals(COMMENT_TAG)) {
150                     type.setDescription(
151                             nodeElement.getFirstChild().getNodeValue());
152                 } else if (nodeElement.getTagName().equals(GLOB_TAG)) {
153                     boolean useRegex = Boolean.valueOf(nodeElement.getAttribute(ISREGEX_ATTR));
154                     types.addPattern(type, nodeElement.getAttribute(PATTERN_ATTR), useRegex);
155                 } else if (nodeElement.getTagName().equals(MAGIC_TAG)) {
156                     readMagic(nodeElement, type);
157                 } else if (nodeElement.getTagName().equals(ALIAS_TAG)) {
158                     String alias = nodeElement.getAttribute(ALIAS_TYPE_ATTR);
159                     type.addAlias(alias);
160                 } else if (nodeElement.getTagName().equals(ROOT_XML_TAG)) {
161                     readRootXML(nodeElement, type);
162                 } else if (nodeElement.getTagName().equals(SUB_CLASS_OF_TAG)) {
163                     String parent = nodeElement.getAttribute(SUB_CLASS_TYPE_ATTR);
164                     type.setSuperType(types.forName(parent));
165                 }
166             }
167         }
168 
169         types.add(type);
170     }
171 
172     /**
173      * Read Element named magic. 
174      * @throws MimeTypeException if the configuration is invalid
175      */
176     private void readMagic(Element element, MimeType mimeType)
177             throws MimeTypeException {
178         int priority = 50;
179         String value = element.getAttribute(MAGIC_PRIORITY_ATTR);
180         if (value != null && value.length() > 0) {
181             priority = Integer.parseInt(value);
182         }
183 
184         for (Clause clause : readMatches(element)) {
185             Magic magic = new Magic(mimeType);
186             magic.setPriority(priority);
187             magic.setClause(clause);
188             mimeType.addMagic(magic);
189         }
190     }
191 
192     private List<Clause> readMatches(Element element) throws MimeTypeException {
193         List<Clause> clauses = new ArrayList<Clause>();
194         NodeList nodes = element.getChildNodes();
195         for (int i = 0; i < nodes.getLength(); i++) {
196             Node node = nodes.item(i);
197             if (node.getNodeType() == Node.ELEMENT_NODE) {
198                 Element nodeElement = (Element) node;
199                 if (nodeElement.getTagName().equals(MATCH_TAG)) {
200                     clauses.add(readMatch(nodeElement));
201                 }
202             }
203         }
204         return clauses;
205     }
206 
207     /** Read Element named match. */
208     private Clause readMatch(Element element) throws MimeTypeException {
209         String type = "string";
210         int start = 0;
211         int end = 0;
212         String value = null;
213         String mask = null;
214 
215         NamedNodeMap attrs = element.getAttributes();
216         for (int i = 0; i < attrs.getLength(); i++) {
217             Attr attr = (Attr) attrs.item(i);
218             if (attr.getName().equals(MATCH_OFFSET_ATTR)) {
219                 String offset = attr.getValue();
220                 int colon = offset.indexOf(':');
221                 if (colon == -1) {
222                     start = Integer.parseInt(offset);
223                     end = start;
224                 } else {
225                     start = Integer.parseInt(offset.substring(0, colon));
226                     end = Integer.parseInt(offset.substring(colon + 1));
227                 }
228             } else if (attr.getName().equals(MATCH_TYPE_ATTR)) {
229                 type = attr.getValue();
230             } else if (attr.getName().equals(MATCH_VALUE_ATTR)) {
231                 value = attr.getValue();
232             } else if (attr.getName().equals(MATCH_MASK_ATTR)) {
233                 mask = attr.getValue();
234             }
235         }
236 
237         if (value == null) {
238             throw new MimeTypeException("Missing magic byte pattern");
239         } else if (start < 0 || end < start) {
240             throw new MimeTypeException(
241                     "Invalid offset range: [" + start + "," + end + "]");
242         }
243 
244         byte[] patternBytes = decodeValue(type, value);
245         int length = patternBytes.length;
246         byte[] maskBytes = null;
247         if (mask != null) {
248             maskBytes = decodeValue(type, mask);
249             length = Math.max(patternBytes.length, maskBytes.length);
250         }
251 
252         MagicDetector detector = new MagicDetector(
253                 MediaType.TEXT_PLAIN, patternBytes, maskBytes, start, end);
254         Clause clause = new MagicMatch(detector, length);
255 
256         List<Clause> subClauses = readMatches(element);
257         if (subClauses.size() == 0) {
258             return clause;
259         } else if (subClauses.size() == 1) {
260             return new AndClause(clause, subClauses.get(0));
261         } else {
262             return new AndClause(clause, new OrClause(subClauses));
263         }
264     }
265 
266     private byte[] decodeValue(String type, String value)
267             throws MimeTypeException {
268         // Preliminary check
269         if ((value == null) || (type == null)) {
270             return null;
271         }
272 
273         byte[] decoded = null;
274         String tmpVal = null;
275         int radix = 8;
276 
277         // hex
278         if (value.startsWith("0x")) {
279             tmpVal = value.substring(2);
280             radix = 16;
281         } else {
282             tmpVal = value;
283             radix = 8;
284         }
285 
286         if (type.equals("string")) {
287             decoded = decodeString(value);
288 
289         } else if (type.equals("byte")) {
290             decoded = tmpVal.getBytes();
291 
292         } else if (type.equals("host16") || type.equals("little16")) {
293             int i = Integer.parseInt(tmpVal, radix);
294             decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
295 
296         } else if (type.equals("big16")) {
297             int i = Integer.parseInt(tmpVal, radix);
298             decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
299 
300         } else if (type.equals("host32") || type.equals("little32")) {
301             long i = Long.parseLong(tmpVal, radix);
302             decoded = new byte[] { (byte) ((i & 0x000000FF)),
303                     (byte) ((i & 0x0000FF00) >> 8),
304                     (byte) ((i & 0x00FF0000) >> 16),
305                     (byte) ((i & 0xFF000000) >> 24) };
306 
307         } else if (type.equals("big32")) {
308             long i = Long.parseLong(tmpVal, radix);
309             decoded = new byte[] { (byte) ((i & 0xFF000000) >> 24),
310                     (byte) ((i & 0x00FF0000) >> 16),
311                     (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF)) };
312         }
313         return decoded;
314     }
315 
316     private byte[] decodeString(String value) throws MimeTypeException {
317         if (value.startsWith("0x")) {
318             byte[] bytes = new byte[(value.length() - 2) / 2];
319             for (int i = 0; i < bytes.length; i++) {
320                 bytes[i] = (byte)
321                 Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
322             }
323             return bytes;
324         }
325 
326         try {
327             ByteArrayOutputStream decoded = new ByteArrayOutputStream();
328 
329             for (int i = 0; i < value.length(); i++) {
330                 if (value.charAt(i) == '\\') {
331                     if (value.charAt(i + 1) == '\\') {
332                         decoded.write('\\');
333                         i++;
334                     } else if (value.charAt(i + 1) == 'x') {
335                         decoded.write(Integer.parseInt(
336                                 value.substring(i + 2, i + 4), 16));
337                         i += 3;
338                     } else {
339                         int j = i + 1;
340                         while ((j < i + 4) && (j < value.length())
341                                 && (Character.isDigit(value.charAt(j)))) {
342                             j++;
343                         }
344                         decoded.write(Short.decode(
345                                 "0" + value.substring(i + 1, j)).byteValue());
346                         i = j - 1;
347                     }
348                 } else {
349                     decoded.write(value.charAt(i));
350                 }
351             }
352             return decoded.toByteArray();
353         } catch (NumberFormatException e) {
354             throw new MimeTypeException("Invalid string value: " + value, e);
355         }
356     }
357 
358     /** Read Element named root-XML. */
359     private void readRootXML(Element element, MimeType mimeType) {
360         mimeType.addRootXML(element.getAttribute(NS_URI_ATTR), element
361                 .getAttribute(LOCAL_NAME_ATTR));
362     }
363 
364 }