1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.sax;
18
19 import java.util.Arrays;
20 import java.util.Collections;
21 import java.util.HashSet;
22 import java.util.Set;
23
24 import org.apache.tika.metadata.Metadata;
25 import org.xml.sax.Attributes;
26 import org.xml.sax.ContentHandler;
27 import org.xml.sax.SAXException;
28 import org.xml.sax.helpers.AttributesImpl;
29
30
31
32
33
34 public class XHTMLContentHandler extends SafeContentHandler {
35
36
37
38
39 public static final String XHTML = "http://www.w3.org/1999/xhtml";
40
41
42
43
44 private static final char[] NL = new char[] { '\n' };
45
46
47
48
49 private static final char[] TAB = new char[] { '\t' };
50
51
52
53
54 private static final Set<String> INDENT =
55 unmodifiableSet("li", "dd", "dt", "td", "th");
56
57
58
59
60 public static final Set<String> ENDLINE = unmodifiableSet(
61 "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl",
62 "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
63 "noscript", "li", "dt", "dd", "noframes", "br", "tr");
64
65 private static Set<String> unmodifiableSet(String... elements) {
66 return Collections.unmodifiableSet(
67 new HashSet<String>(Arrays.asList(elements)));
68 }
69
70
71
72
73
74 private final Metadata metadata;
75
76
77
78
79 private boolean started = false;
80
81 public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
82 super(handler);
83 this.metadata = metadata;
84 }
85
86
87
88
89
90
91 @Override
92 public void startDocument() throws SAXException {
93 super.startDocument();
94 startPrefixMapping("", XHTML);
95 }
96
97
98
99
100
101
102
103
104
105
106
107 private void lazyStartDocument() throws SAXException {
108 if (!started) {
109 started = true;
110 startElement("html");
111 startElement("head");
112 startElement("title");
113 String title = metadata.get(Metadata.TITLE);
114 if (title != null && title.length() > 0) {
115 characters(title);
116 }
117 endElement("title");
118 endElement("head");
119 startElement("body");
120 }
121 }
122
123
124
125
126
127
128
129
130
131 @Override
132 public void endDocument() throws SAXException {
133 lazyStartDocument();
134 endElement("body");
135 endElement("html");
136 endPrefixMapping("");
137 super.endDocument();
138 }
139
140
141
142
143
144 @Override
145 public void startElement(
146 String uri, String local, String name, Attributes attributes)
147 throws SAXException {
148 lazyStartDocument();
149 if (XHTML.equals(uri) && INDENT.contains(local)) {
150 ignorableWhitespace(TAB, 0, TAB.length);
151 }
152 super.startElement(uri, local, name, attributes);
153 }
154
155
156
157
158
159 @Override
160 public void endElement(String uri, String local, String name)
161 throws SAXException {
162 super.endElement(uri, local, name);
163 if (XHTML.equals(uri) && ENDLINE.contains(local)) {
164 newline();
165 }
166 }
167
168
169
170
171 @Override
172 public void characters(char[] ch, int start, int length)
173 throws SAXException {
174 lazyStartDocument();
175 super.characters(ch, start, length);
176 }
177
178
179
180 public void startElement(String name) throws SAXException {
181 startElement(XHTML, name, name, new AttributesImpl());
182 }
183
184 public void startElement(String name, String attribute, String value)
185 throws SAXException {
186 AttributesImpl attributes = new AttributesImpl();
187 attributes.addAttribute("", attribute, attribute, "CDATA", value);
188 startElement(XHTML, name, name, attributes);
189 }
190
191 public void endElement(String name) throws SAXException {
192 endElement(XHTML, name, name);
193 }
194
195 public void characters(String characters) throws SAXException {
196 characters(characters.toCharArray(), 0, characters.length());
197 }
198
199 public void newline() throws SAXException {
200 ignorableWhitespace(NL, 0, NL.length);
201 }
202
203 public void element(String name, String value) throws SAXException {
204 startElement(name);
205 characters(value);
206 endElement(name);
207 }
208
209 }