View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.sax;
18  
19  import java.util.Arrays;
20  import java.util.Collections;
21  import java.util.HashSet;
22  import java.util.Set;
23  
24  import org.apache.tika.metadata.Metadata;
25  import org.xml.sax.Attributes;
26  import org.xml.sax.ContentHandler;
27  import org.xml.sax.SAXException;
28  import org.xml.sax.helpers.AttributesImpl;
29  
30  /**
31   * Content handler decorator that simplifies the task of producing XHTML
32   * events for Tika content parsers.
33   */
34  public class XHTMLContentHandler extends SafeContentHandler {
35  
36      /**
37       * The XHTML namespace URI
38       */
39      public static final String XHTML = "http://www.w3.org/1999/xhtml";
40  
41      /**
42       * The newline character that gets inserted after block elements.
43       */
44      private static final char[] NL = new char[] { '\n' };
45  
46      /**
47       * The tab character gets inserted before table cells and list items.
48       */
49      private static final char[] TAB = new char[] { '\t' };
50  
51      /**
52       * The elements that get prepended with the {@link #TAB} character.
53       */
54      private static final Set<String> INDENT =
55          unmodifiableSet("li", "dd", "dt", "td", "th");
56  
57      /**
58       * The elements that get appended with the {@link #NL} character.
59       */
60      public static final Set<String> ENDLINE = unmodifiableSet(
61              "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
62              "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
63              "noscript", "li", "dt", "dd", "noframes", "br", "tr");
64  
65      private static Set<String> unmodifiableSet(String... elements) {
66          return Collections.unmodifiableSet(
67                  new HashSet<String>(Arrays.asList(elements)));
68      }
69  
70      /**
71       * Metadata associated with the document. Used to fill in the
72       * &lt;head/&gt; section.
73       */
74      private final Metadata metadata;
75  
76      /**
77       * Flag to indicate whether the document element has been started.
78       */
79      private boolean started = false;
80  
81      public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
82          super(handler);
83          this.metadata = metadata;
84      }
85  
86      /**
87       * Starts an XHTML document by setting up the namespace mappings.
88       * The standard XHTML prefix is generated lazily when the first
89       * element is started.
90       */
91      @Override
92      public void startDocument() throws SAXException {
93          super.startDocument();
94          startPrefixMapping("", XHTML);
95      }
96  
97      /**
98       * Generates the following XHTML prefix when called for the first time:
99       * <pre>
100      * &lt;html&gt;
101      *   &lt;head&gt;
102      *     &lt;title&gt;...&lt;/title&gt;
103      *   &lt;/head&gt;
104      *   &lt;body&gt;
105      * </pre>
106      */
107     private void lazyStartDocument() throws SAXException {
108         if (!started) {
109             started = true;
110             startElement("html");
111             startElement("head");
112             startElement("title");
113             String title = metadata.get(Metadata.TITLE);
114             if (title != null && title.length() > 0) {
115                 characters(title);
116             }
117             endElement("title");
118             endElement("head");
119             startElement("body");
120         }
121     }
122 
123     /**
124      * Ends the XHTML document by writing the following footer and
125      * clearing the namespace mappings:
126      * <pre>
127      *   &lt;/body&gt;
128      * &lt;/html&gt;
129      * </pre>
130      */
131     @Override
132     public void endDocument() throws SAXException {
133         lazyStartDocument();
134         endElement("body");
135         endElement("html");
136         endPrefixMapping("");
137         super.endDocument();
138     }
139 
140     /**
141      * Starts the given element. Table cells and list items are automatically
142      * indented by emitting a tab character as ignorable whitespace.
143      */
144     @Override
145     public void startElement(
146             String uri, String local, String name, Attributes attributes)
147             throws SAXException {
148         lazyStartDocument();
149         if (XHTML.equals(uri) && INDENT.contains(local)) {
150             ignorableWhitespace(TAB, 0, TAB.length);
151         }
152         super.startElement(uri, local, name, attributes);
153     }
154 
155     /**
156      * Ends the given element. Block elements are automatically followed
157      * by a newline character.
158      */
159     @Override
160     public void endElement(String uri, String local, String name)
161             throws SAXException {
162         super.endElement(uri, local, name);
163         if (XHTML.equals(uri) && ENDLINE.contains(local)) {
164             newline();
165         }
166     }
167 
168     /**
169      * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
170      */
171     @Override
172     public void characters(char[] ch, int start, int length)
173             throws SAXException {
174         lazyStartDocument();
175         super.characters(ch, start, length);
176     }
177 
178     //------------------------------------------< public convenience methods >
179 
180     public void startElement(String name) throws SAXException {
181         startElement(XHTML, name, name, new AttributesImpl());
182     }
183 
184     public void startElement(String name, String attribute, String value)
185             throws SAXException {
186         AttributesImpl attributes = new AttributesImpl();
187         attributes.addAttribute("", attribute, attribute, "CDATA", value);
188         startElement(XHTML, name, name, attributes);
189     }
190 
191     public void endElement(String name) throws SAXException {
192         endElement(XHTML, name, name);
193     }
194 
195     public void characters(String characters) throws SAXException {
196         characters(characters.toCharArray(), 0, characters.length());
197     }
198 
199     public void newline() throws SAXException {
200         ignorableWhitespace(NL, 0, NL.length);
201     }
202 
203     public void element(String name, String value) throws SAXException {
204         startElement(name);
205         characters(value);
206         endElement(name);
207     }
208 
209 }