View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.html;
18  
import java.util.Locale;

import javax.xml.XMLConstants;

import org.apache.tika.sax.ContentHandlerDecorator;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
26  
27  /**
28   * Content handler decorator that downgrades XHTML elements to
29   * old-style HTML elements before passing them on to the decorated
30   * content handler. This downgrading consists of dropping all namespaces
31   * (and namespaced attributes) and uppercasing all element names.
32   * Used by the {@link HtmlParser} to make all incoming HTML look the same.
33   */
34  class XHTMLDowngradeHandler extends ContentHandlerDecorator {
35  
36      public XHTMLDowngradeHandler(ContentHandler handler) {
37          super(handler);
38      }
39  
40      @Override
41      public void startElement(
42              String uri, String localName, String name, Attributes atts)
43              throws SAXException {
44          String upper = localName.toUpperCase();
45  
46          AttributesImpl attributes = new AttributesImpl();
47          for (int i = 0; i < atts.getLength(); i++) {
48              String local = atts.getLocalName(i);
49              String qname = atts.getQName(i);
50              if (!XMLConstants.NULL_NS_URI.equals(atts.getURI(i).length())
51                      && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
52                      && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
53                  attributes.addAttribute(
54                          atts.getURI(i), local, qname,
55                          atts.getType(i), atts.getValue(i));
56              }
57          }
58  
59          super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
60      }
61  
62      @Override
63      public void endElement(String uri, String localName, String name)
64              throws SAXException {
65          String upper = localName.toUpperCase();
66          super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
67      }
68  
69      @Override
70      public void startPrefixMapping(String prefix, String uri) {
71      }
72  
73      @Override
74      public void endPrefixMapping(String prefix) {
75      }
76  
77  }