1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.odf;
18
19 import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
20
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.BitSet;
24 import java.util.Collections;
25 import java.util.HashMap;
26 import java.util.Set;
27 import java.util.Stack;
28
29 import javax.xml.XMLConstants;
30 import javax.xml.namespace.QName;
31 import javax.xml.parsers.ParserConfigurationException;
32 import javax.xml.parsers.SAXParser;
33 import javax.xml.parsers.SAXParserFactory;
34
35 import org.apache.tika.exception.TikaException;
36 import org.apache.tika.io.CloseShieldInputStream;
37 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.mime.MediaType;
39 import org.apache.tika.parser.ParseContext;
40 import org.apache.tika.parser.Parser;
41 import org.apache.tika.sax.ElementMappingContentHandler;
42 import org.apache.tika.sax.OfflineContentHandler;
43 import org.apache.tika.sax.XHTMLContentHandler;
44 import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
45 import org.xml.sax.Attributes;
46 import org.xml.sax.ContentHandler;
47 import org.xml.sax.SAXException;
48 import org.xml.sax.SAXNotRecognizedException;
49 import org.xml.sax.helpers.DefaultHandler;
50
51
52
53
54 public class OpenDocumentContentParser implements Parser {
55
56 public static final String TEXT_NS =
57 "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
58
59 public static final String TABLE_NS =
60 "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
61
62 public static final String OFFICE_NS =
63 "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
64
65 public static final String SVG_NS =
66 "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
67
68 public static final String PRESENTATION_NS =
69 "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
70
71 public static final String DRAW_NS =
72 "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
73
74 public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
75
76 protected static final char[] TAB = new char[] { '\t' };
77
78
79
80
81
82
83 private static final HashMap<QName, TargetElement> MAPPINGS =
84 new HashMap<QName, TargetElement>();
85
86 static {
87
88 MAPPINGS.put(
89 new QName(TEXT_NS, "p"),
90 new TargetElement(XHTML, "p"));
91
92 MAPPINGS.put(
93 new QName(TEXT_NS, "line-break"),
94 new TargetElement(XHTML, "br"));
95 MAPPINGS.put(
96 new QName(TEXT_NS, "list"),
97 new TargetElement(XHTML, "ul"));
98 MAPPINGS.put(
99 new QName(TEXT_NS, "list-item"),
100 new TargetElement(XHTML, "li"));
101 MAPPINGS.put(
102 new QName(TEXT_NS, "note"),
103 new TargetElement(XHTML, "div"));
104 MAPPINGS.put(
105 new QName(OFFICE_NS, "annotation"),
106 new TargetElement(XHTML, "div"));
107 MAPPINGS.put(
108 new QName(PRESENTATION_NS, "notes"),
109 new TargetElement(XHTML, "div"));
110 MAPPINGS.put(
111 new QName(DRAW_NS, "object"),
112 new TargetElement(XHTML, "object"));
113 MAPPINGS.put(
114 new QName(DRAW_NS, "text-box"),
115 new TargetElement(XHTML, "div"));
116 MAPPINGS.put(
117 new QName(SVG_NS, "title"),
118 new TargetElement(XHTML, "span"));
119 MAPPINGS.put(
120 new QName(SVG_NS, "desc"),
121 new TargetElement(XHTML, "span"));
122 MAPPINGS.put(
123 new QName(TEXT_NS, "span"),
124 new TargetElement(XHTML, "span"));
125
126 final HashMap<QName,QName> aAttsMapping =
127 new HashMap<QName,QName>();
128 aAttsMapping.put(
129 new QName(XLINK_NS, "href"),
130 new QName("href"));
131 aAttsMapping.put(
132 new QName(XLINK_NS, "title"),
133 new QName("title"));
134 MAPPINGS.put(
135 new QName(TEXT_NS, "a"),
136 new TargetElement(XHTML, "a", aAttsMapping));
137
138
139 MAPPINGS.put(
140 new QName(TABLE_NS, "table"),
141 new TargetElement(XHTML, "table"));
142
143 MAPPINGS.put(
144 new QName(TABLE_NS, "table-row"),
145 new TargetElement(XHTML, "tr"));
146
147 final HashMap<QName,QName> tableCellAttsMapping =
148 new HashMap<QName,QName>();
149 tableCellAttsMapping.put(
150 new QName(TABLE_NS, "number-columns-spanned"),
151 new QName("colspan"));
152 tableCellAttsMapping.put(
153 new QName(TABLE_NS, "number-rows-spanned"),
154 new QName("rowspan"));
155
156
157
158
159
160
161 tableCellAttsMapping.put(
162 new QName(TABLE_NS, "number-columns-repeated"),
163 new QName("colspan"));
164 MAPPINGS.put(
165 new QName(TABLE_NS, "table-cell"),
166 new TargetElement(XHTML, "td", tableCellAttsMapping));
167 }
168
169 public Set<MediaType> getSupportedTypes(ParseContext context) {
170 return Collections.emptySet();
171 }
172
173 public void parse(
174 InputStream stream, ContentHandler handler,
175 Metadata metadata, ParseContext context)
176 throws IOException, SAXException, TikaException {
177 final XHTMLContentHandler xhtml =
178 new XHTMLContentHandler(handler,metadata);
179 DefaultHandler dh = new ElementMappingContentHandler(xhtml, MAPPINGS) {
180
181 private final BitSet textNodeStack = new BitSet();
182
183 private int nodeDepth = 0;
184
185 private int completelyFiltered = 0;
186
187 private Stack<String> headingStack = new Stack<String>();
188
189 @Override
190 public void characters(char[] ch, int start, int length)
191 throws SAXException {
192
193 if (completelyFiltered == 0 && nodeDepth > 0
194 && textNodeStack.get(nodeDepth - 1)) {
195 super.characters(ch,start,length);
196 }
197 }
198
199
200
201 private boolean needsCompleteFiltering(
202 String namespaceURI, String localName) {
203 if (TEXT_NS.equals(namespaceURI)) {
204 return localName.endsWith("-template")
205 || localName.endsWith("-style");
206 } else if (TABLE_NS.equals(namespaceURI)) {
207 return "covered-table-cell".equals(localName);
208 } else {
209 return false;
210 }
211 }
212
213
214 private String getXHTMLHeaderTagName(Attributes atts) {
215 String depthStr = atts.getValue(TEXT_NS, "outline-level");
216 if (depthStr == null) {
217 return "h1";
218 }
219
220 int depth = Integer.parseInt(depthStr);
221 if (depth >= 6) {
222 return "h6";
223 } else if (depth <= 1) {
224 return "h1";
225 } else {
226 return "h" + depth;
227 }
228 }
229
230
231
232
233 private boolean isTextNode(String namespaceURI, String localName) {
234 if (TEXT_NS.equals(namespaceURI)) {
235 return true;
236 }
237 if (SVG_NS.equals(namespaceURI)) {
238 return "title".equals(localName) ||
239 "desc".equals(localName);
240 }
241 return false;
242 }
243
244 @Override
245 public void startElement(
246 String namespaceURI, String localName, String qName,
247 Attributes atts) throws SAXException {
248
249
250
251
252
253 assert nodeDepth >= 0;
254
255 textNodeStack.set(nodeDepth++,
256 isTextNode(namespaceURI, localName));
257
258 assert completelyFiltered >= 0;
259
260 if (needsCompleteFiltering(namespaceURI, localName)) {
261 completelyFiltered++;
262 }
263
264 if (completelyFiltered == 0) {
265
266
267 if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
268 xhtml.startElement(headingStack.push(
269 getXHTMLHeaderTagName(atts)));
270 } else {
271 super.startElement(
272 namespaceURI, localName, qName, atts);
273 }
274 }
275 }
276
277 @Override
278 public void endElement(
279 String namespaceURI, String localName, String qName)
280 throws SAXException {
281
282 if (completelyFiltered == 0) {
283
284
285 if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
286 xhtml.endElement(headingStack.pop());
287 } else {
288 super.endElement(namespaceURI,localName,qName);
289 }
290
291
292 if (TEXT_NS.equals(namespaceURI)
293 && ("tab-stop".equals(localName)
294 || "tab".equals(localName))) {
295 this.characters(TAB, 0, TAB.length);
296 }
297 }
298
299
300 if (needsCompleteFiltering(namespaceURI,localName)) {
301 completelyFiltered--;
302 }
303 assert completelyFiltered >= 0;
304
305
306 nodeDepth--;
307 assert nodeDepth >= 0;
308 }
309
310 @Override
311 public void startPrefixMapping(String prefix, String uri) {
312
313 }
314
315 @Override
316 public void endPrefixMapping(String prefix) {
317
318 }
319
320 };
321
322 try {
323 SAXParserFactory factory = SAXParserFactory.newInstance();
324 factory.setValidating(false);
325 factory.setNamespaceAware(true);
326 try {
327 factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
328 } catch (SAXNotRecognizedException e){
329
330
331
332
333 }
334 SAXParser parser = factory.newSAXParser();
335 parser.parse(
336 new CloseShieldInputStream(stream),
337 new OfflineContentHandler(
338 new NSNormalizerContentHandler(dh)));
339 } catch (ParserConfigurationException e) {
340 throw new TikaException("XML parser configuration error", e);
341 }
342 }
343
344
345
346
347 public void parse(
348 InputStream stream, ContentHandler handler, Metadata metadata)
349 throws IOException, SAXException, TikaException {
350 parse(stream, handler, metadata, new ParseContext());
351 }
352
353 }