1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.tika.detect;
18
19 import java.io.ByteArrayInputStream;
20
21 import javax.xml.XMLConstants;
22 import javax.xml.namespace.QName;
23 import javax.xml.parsers.ParserConfigurationException;
24 import javax.xml.parsers.SAXParserFactory;
25
26 import org.apache.tika.sax.OfflineContentHandler;
27 import org.xml.sax.Attributes;
28 import org.xml.sax.SAXException;
29 import org.xml.sax.SAXNotRecognizedException;
30 import org.xml.sax.helpers.DefaultHandler;
31
32 /**
33 * Utility class that uses a {@link javax.xml.parsers.SAXParser} to determine
34 * the namespace URI and local name of the root element of an XML file.
35 *
36 * @since Apache Tika 0.4
37 */
38 public class XmlRootExtractor {
39
40 private final SAXParserFactory factory;
41
42 public XmlRootExtractor() throws SAXException, ParserConfigurationException {
43 factory = SAXParserFactory.newInstance();
44 factory.setNamespaceAware(true);
45 factory.setValidating(false);
46 try {
47 factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
48 } catch (SAXNotRecognizedException e) {
49 // TIKA-271: Some XML parsers do not support the secure-processing
50 // feature, even though it's required by JAXP in Java 5. Ignoring
51 // the exception is fine here, deployments without this feature
52 // are inherently vulnerable to XML denial-of-service attacks.
53 }
54
55 }
56
57 public QName extractRootElement(byte[] data) {
58 ExtractorHandler handler = new ExtractorHandler();
59 try {
60 factory.newSAXParser().parse(
61 new ByteArrayInputStream(data),
62 new OfflineContentHandler(handler));
63 } catch (Exception ignore) {
64 }
65 return handler.rootElement;
66 }
67
68 private static class ExtractorHandler extends DefaultHandler {
69
70 private QName rootElement = null;
71
72 @Override
73 public void startElement(
74 String uri, String local, String name, Attributes attributes)
75 throws SAXException {
76 this.rootElement = new QName(uri, local);
77 throw new SAXException("Aborting: root element received");
78 }
79
80 }
81
82 }