1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.mime;
18
19 import org.apache.tika.detect.MagicDetector;
20 import org.w3c.dom.Attr;
21 import org.w3c.dom.Node;
22 import org.w3c.dom.Element;
23 import org.w3c.dom.Document;
24 import org.w3c.dom.NodeList;
25 import org.w3c.dom.NamedNodeMap;
26 import org.xml.sax.InputSource;
27 import org.xml.sax.SAXException;
28
29 import java.io.ByteArrayOutputStream;
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.util.ArrayList;
33 import java.util.List;
34
35 import javax.xml.parsers.DocumentBuilder;
36 import javax.xml.parsers.DocumentBuilderFactory;
37 import javax.xml.parsers.ParserConfigurationException;
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94 final class MimeTypesReader implements MimeTypesReaderMetKeys {
95
96 private final MimeTypes types;
97
98 MimeTypesReader(MimeTypes types) {
99 this.types = types;
100 }
101
102 void read(String filepath) throws IOException, MimeTypeException {
103 read(MimeTypesReader.class.getClassLoader().getResourceAsStream(filepath));
104 }
105
106 void read(InputStream stream) throws IOException, MimeTypeException {
107 try {
108 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
109 DocumentBuilder builder = factory.newDocumentBuilder();
110 Document document = builder.parse(new InputSource(stream));
111 read(document);
112 } catch (ParserConfigurationException e) {
113 throw new MimeTypeException("Unable to create an XML parser", e);
114 } catch (SAXException e) {
115 throw new MimeTypeException("Invalid type configuration", e);
116 }
117 }
118
119 void read(Document document) throws MimeTypeException {
120 Element element = document.getDocumentElement();
121 if (element != null && element.getTagName().equals(MIME_INFO_TAG)) {
122 NodeList nodes = element.getChildNodes();
123 for (int i = 0; i < nodes.getLength(); i++) {
124 Node node = nodes.item(i);
125 if (node.getNodeType() == Node.ELEMENT_NODE) {
126 Element child = (Element) node;
127 if (child.getTagName().equals(MIME_TYPE_TAG)) {
128 readMimeType(child);
129 }
130 }
131 }
132 } else {
133 throw new MimeTypeException(
134 "Not a <" + MIME_INFO_TAG + "/> configuration document: "
135 + element.getTagName());
136 }
137 }
138
139
140 private void readMimeType(Element element) throws MimeTypeException {
141 String name = element.getAttribute(MIME_TYPE_TYPE_ATTR);
142 MimeType type = types.forName(name);
143
144 NodeList nodes = element.getChildNodes();
145 for (int i = 0; i < nodes.getLength(); i++) {
146 Node node = nodes.item(i);
147 if (node.getNodeType() == Node.ELEMENT_NODE) {
148 Element nodeElement = (Element) node;
149 if (nodeElement.getTagName().equals(COMMENT_TAG)) {
150 type.setDescription(
151 nodeElement.getFirstChild().getNodeValue());
152 } else if (nodeElement.getTagName().equals(GLOB_TAG)) {
153 boolean useRegex = Boolean.valueOf(nodeElement.getAttribute(ISREGEX_ATTR));
154 types.addPattern(type, nodeElement.getAttribute(PATTERN_ATTR), useRegex);
155 } else if (nodeElement.getTagName().equals(MAGIC_TAG)) {
156 readMagic(nodeElement, type);
157 } else if (nodeElement.getTagName().equals(ALIAS_TAG)) {
158 String alias = nodeElement.getAttribute(ALIAS_TYPE_ATTR);
159 type.addAlias(alias);
160 } else if (nodeElement.getTagName().equals(ROOT_XML_TAG)) {
161 readRootXML(nodeElement, type);
162 } else if (nodeElement.getTagName().equals(SUB_CLASS_OF_TAG)) {
163 String parent = nodeElement.getAttribute(SUB_CLASS_TYPE_ATTR);
164 type.setSuperType(types.forName(parent));
165 }
166 }
167 }
168
169 types.add(type);
170 }
171
172
173
174
175
176 private void readMagic(Element element, MimeType mimeType)
177 throws MimeTypeException {
178 int priority = 50;
179 String value = element.getAttribute(MAGIC_PRIORITY_ATTR);
180 if (value != null && value.length() > 0) {
181 priority = Integer.parseInt(value);
182 }
183
184 for (Clause clause : readMatches(element)) {
185 Magic magic = new Magic(mimeType);
186 magic.setPriority(priority);
187 magic.setClause(clause);
188 mimeType.addMagic(magic);
189 }
190 }
191
192 private List<Clause> readMatches(Element element) throws MimeTypeException {
193 List<Clause> clauses = new ArrayList<Clause>();
194 NodeList nodes = element.getChildNodes();
195 for (int i = 0; i < nodes.getLength(); i++) {
196 Node node = nodes.item(i);
197 if (node.getNodeType() == Node.ELEMENT_NODE) {
198 Element nodeElement = (Element) node;
199 if (nodeElement.getTagName().equals(MATCH_TAG)) {
200 clauses.add(readMatch(nodeElement));
201 }
202 }
203 }
204 return clauses;
205 }
206
207
208 private Clause readMatch(Element element) throws MimeTypeException {
209 String type = "string";
210 int start = 0;
211 int end = 0;
212 String value = null;
213 String mask = null;
214
215 NamedNodeMap attrs = element.getAttributes();
216 for (int i = 0; i < attrs.getLength(); i++) {
217 Attr attr = (Attr) attrs.item(i);
218 if (attr.getName().equals(MATCH_OFFSET_ATTR)) {
219 String offset = attr.getValue();
220 int colon = offset.indexOf(':');
221 if (colon == -1) {
222 start = Integer.parseInt(offset);
223 end = start;
224 } else {
225 start = Integer.parseInt(offset.substring(0, colon));
226 end = Integer.parseInt(offset.substring(colon + 1));
227 }
228 } else if (attr.getName().equals(MATCH_TYPE_ATTR)) {
229 type = attr.getValue();
230 } else if (attr.getName().equals(MATCH_VALUE_ATTR)) {
231 value = attr.getValue();
232 } else if (attr.getName().equals(MATCH_MASK_ATTR)) {
233 mask = attr.getValue();
234 }
235 }
236
237 if (value == null) {
238 throw new MimeTypeException("Missing magic byte pattern");
239 } else if (start < 0 || end < start) {
240 throw new MimeTypeException(
241 "Invalid offset range: [" + start + "," + end + "]");
242 }
243
244 byte[] patternBytes = decodeValue(type, value);
245 int length = patternBytes.length;
246 byte[] maskBytes = null;
247 if (mask != null) {
248 maskBytes = decodeValue(type, mask);
249 length = Math.max(patternBytes.length, maskBytes.length);
250 }
251
252 MagicDetector detector = new MagicDetector(
253 MediaType.TEXT_PLAIN, patternBytes, maskBytes, start, end);
254 Clause clause = new MagicMatch(detector, length);
255
256 List<Clause> subClauses = readMatches(element);
257 if (subClauses.size() == 0) {
258 return clause;
259 } else if (subClauses.size() == 1) {
260 return new AndClause(clause, subClauses.get(0));
261 } else {
262 return new AndClause(clause, new OrClause(subClauses));
263 }
264 }
265
266 private byte[] decodeValue(String type, String value)
267 throws MimeTypeException {
268
269 if ((value == null) || (type == null)) {
270 return null;
271 }
272
273 byte[] decoded = null;
274 String tmpVal = null;
275 int radix = 8;
276
277
278 if (value.startsWith("0x")) {
279 tmpVal = value.substring(2);
280 radix = 16;
281 } else {
282 tmpVal = value;
283 radix = 8;
284 }
285
286 if (type.equals("string")) {
287 decoded = decodeString(value);
288
289 } else if (type.equals("byte")) {
290 decoded = tmpVal.getBytes();
291
292 } else if (type.equals("host16") || type.equals("little16")) {
293 int i = Integer.parseInt(tmpVal, radix);
294 decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
295
296 } else if (type.equals("big16")) {
297 int i = Integer.parseInt(tmpVal, radix);
298 decoded = new byte[] { (byte) (i >> 8), (byte) (i & 0x00FF) };
299
300 } else if (type.equals("host32") || type.equals("little32")) {
301 long i = Long.parseLong(tmpVal, radix);
302 decoded = new byte[] { (byte) ((i & 0x000000FF)),
303 (byte) ((i & 0x0000FF00) >> 8),
304 (byte) ((i & 0x00FF0000) >> 16),
305 (byte) ((i & 0xFF000000) >> 24) };
306
307 } else if (type.equals("big32")) {
308 long i = Long.parseLong(tmpVal, radix);
309 decoded = new byte[] { (byte) ((i & 0xFF000000) >> 24),
310 (byte) ((i & 0x00FF0000) >> 16),
311 (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF)) };
312 }
313 return decoded;
314 }
315
316 private byte[] decodeString(String value) throws MimeTypeException {
317 if (value.startsWith("0x")) {
318 byte[] bytes = new byte[(value.length() - 2) / 2];
319 for (int i = 0; i < bytes.length; i++) {
320 bytes[i] = (byte)
321 Integer.parseInt(value.substring(2 + i * 2, 4 + i * 2), 16);
322 }
323 return bytes;
324 }
325
326 try {
327 ByteArrayOutputStream decoded = new ByteArrayOutputStream();
328
329 for (int i = 0; i < value.length(); i++) {
330 if (value.charAt(i) == '\\') {
331 if (value.charAt(i + 1) == '\\') {
332 decoded.write('\\');
333 i++;
334 } else if (value.charAt(i + 1) == 'x') {
335 decoded.write(Integer.parseInt(
336 value.substring(i + 2, i + 4), 16));
337 i += 3;
338 } else {
339 int j = i + 1;
340 while ((j < i + 4) && (j < value.length())
341 && (Character.isDigit(value.charAt(j)))) {
342 j++;
343 }
344 decoded.write(Short.decode(
345 "0" + value.substring(i + 1, j)).byteValue());
346 i = j - 1;
347 }
348 } else {
349 decoded.write(value.charAt(i));
350 }
351 }
352 return decoded.toByteArray();
353 } catch (NumberFormatException e) {
354 throw new MimeTypeException("Invalid string value: " + value, e);
355 }
356 }
357
358
359 private void readRootXML(Element element, MimeType mimeType) {
360 mimeType.addRootXML(element.getAttribute(NS_URI_ATTR), element
361 .getAttribute(LOCAL_NAME_ATTR));
362 }
363
364 }