1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.xml;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Set;
25
26 import javax.xml.XMLConstants;
27 import javax.xml.parsers.ParserConfigurationException;
28 import javax.xml.parsers.SAXParser;
29 import javax.xml.parsers.SAXParserFactory;
30
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.io.CloseShieldInputStream;
33 import org.apache.tika.metadata.Metadata;
34 import org.apache.tika.mime.MediaType;
35 import org.apache.tika.parser.ParseContext;
36 import org.apache.tika.parser.Parser;
37 import org.apache.tika.sax.OfflineContentHandler;
38 import org.apache.tika.sax.TextContentHandler;
39 import org.apache.tika.sax.XHTMLContentHandler;
40 import org.xml.sax.ContentHandler;
41 import org.xml.sax.SAXException;
42 import org.xml.sax.SAXNotRecognizedException;
43 import org.xml.sax.SAXNotSupportedException;
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62 public class XMLParser implements Parser {
63
64 private static final Set<MediaType> SUPPORTED_TYPES =
65 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
66 MediaType.application("xml"),
67 MediaType.image("svg+xml"))));
68
69 public Set<MediaType> getSupportedTypes(ParseContext context) {
70 return SUPPORTED_TYPES;
71 }
72
73 public void parse(
74 InputStream stream, ContentHandler handler,
75 Metadata metadata, ParseContext context)
76 throws IOException, SAXException, TikaException {
77 if (metadata.get(Metadata.CONTENT_TYPE) == null) {
78 metadata.set(Metadata.CONTENT_TYPE, "application/xml");
79 }
80
81 final XHTMLContentHandler xhtml =
82 new XHTMLContentHandler(handler, metadata);
83 xhtml.startDocument();
84 xhtml.startElement("p");
85
86 getSAXParser(context).parse(
87 new CloseShieldInputStream(stream),
88 new OfflineContentHandler(
89 getContentHandler(handler, metadata)));
90
91 xhtml.endElement("p");
92 xhtml.endDocument();
93 }
94
95
96
97
98 public void parse(
99 InputStream stream, ContentHandler handler, Metadata metadata)
100 throws IOException, SAXException, TikaException {
101 parse(stream, handler, metadata, new ParseContext());
102 }
103
104 protected ContentHandler getContentHandler(
105 ContentHandler handler, Metadata metadata) {
106 return new TextContentHandler(handler);
107 }
108
109
110
111
112
113
114
115
116
117
118
119 private SAXParser getSAXParser(ParseContext context)
120 throws TikaException {
121 SAXParser parser = context.get(SAXParser.class);
122 if (parser instanceof SAXParser) {
123 return parser;
124 } else {
125 try {
126 return getSAXParserFactory(context).newSAXParser();
127 } catch (ParserConfigurationException e) {
128 throw new TikaException("Unable to configure a SAX parser", e);
129 } catch (SAXException e) {
130 throw new TikaException("Unable to create a SAX parser", e);
131 }
132 }
133 }
134
135
136
137
138
139
140
141
142
143
144 private SAXParserFactory getSAXParserFactory(ParseContext context) {
145 SAXParserFactory factory = context.get(SAXParserFactory.class);
146 if (factory != null) {
147 return factory;
148 } else {
149 return getDefaultSAXParserFactory();
150 }
151 }
152
153
154
155
156
157
158
159
160 private SAXParserFactory getDefaultSAXParserFactory() {
161 SAXParserFactory factory = SAXParserFactory.newInstance();
162 factory.setNamespaceAware(true);
163 try {
164 factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
165 } catch (ParserConfigurationException e) {
166 } catch (SAXNotSupportedException e) {
167 } catch (SAXNotRecognizedException e) {
168
169
170
171
172 }
173 return factory;
174 }
175
176 }