View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.detect;
18  
19  import java.io.ByteArrayInputStream;
20  
21  import javax.xml.XMLConstants;
22  import javax.xml.namespace.QName;
23  import javax.xml.parsers.ParserConfigurationException;
24  import javax.xml.parsers.SAXParserFactory;
25  
26  import org.apache.tika.sax.OfflineContentHandler;
27  import org.xml.sax.Attributes;
28  import org.xml.sax.SAXException;
29  import org.xml.sax.SAXNotRecognizedException;
30  import org.xml.sax.helpers.DefaultHandler;
31  
32  /**
33   * Utility class that uses a {@link javax.xml.parsers.SAXParser} to determine
34   * the namespace URI and local name of the root element of an XML file.
35   *
36   * @since Apache Tika 0.4
37   */
38  public class XmlRootExtractor {
39  
40      private final SAXParserFactory factory;
41  
42      public XmlRootExtractor() throws SAXException, ParserConfigurationException {
43          factory = SAXParserFactory.newInstance();
44          factory.setNamespaceAware(true);
45          factory.setValidating(false);
46          try {
47              factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
48          } catch (SAXNotRecognizedException e) {
49              // TIKA-271: Some XML parsers do not support the secure-processing
50              // feature, even though it's required by JAXP in Java 5. Ignoring
51              // the exception is fine here, deployments without this feature
52              // are inherently vulnerable to XML denial-of-service attacks.
53          }
54  
55      }
56  
57      public QName extractRootElement(byte[] data) {
58          ExtractorHandler handler = new ExtractorHandler();
59          try {
60              factory.newSAXParser().parse(
61                      new ByteArrayInputStream(data),
62                      new OfflineContentHandler(handler));
63          } catch (Exception ignore) {
64          }
65          return handler.rootElement;
66      }
67  
68      private static class ExtractorHandler extends DefaultHandler {
69  
70          private QName rootElement = null;
71  
72          @Override
73          public void startElement(
74                  String uri, String local, String name, Attributes attributes)
75                  throws SAXException {
76              this.rootElement = new QName(uri, local);
77              throw new SAXException("Aborting: root element received");
78          }
79  
80      }
81  
82  }