1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.html;
18
19 import java.io.BufferedInputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.InputStreamReader;
23 import java.nio.charset.Charset;
24 import java.util.Arrays;
25 import java.util.Collections;
26 import java.util.HashSet;
27 import java.util.Set;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30
31 import org.apache.tika.exception.TikaException;
32 import org.apache.tika.io.CloseShieldInputStream;
33 import org.apache.tika.metadata.Metadata;
34 import org.apache.tika.mime.MediaType;
35 import org.apache.tika.parser.ParseContext;
36 import org.apache.tika.parser.Parser;
37 import org.apache.tika.parser.txt.CharsetDetector;
38 import org.apache.tika.parser.txt.CharsetMatch;
39 import org.xml.sax.ContentHandler;
40 import org.xml.sax.InputSource;
41 import org.xml.sax.SAXException;
42
43
44
45
46
47
48 public class HtmlParser implements Parser {
49
50 private static final Set<MediaType> SUPPORTED_TYPES =
51 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
52 MediaType.text("html"),
53 MediaType.application("xhtml+xml"),
54 MediaType.application("vnd.wap.xhtml+xml"),
55 MediaType.application("x-asp"))));
56
57
58
59
60 private static final HtmlMapper mapper = new DefaultHtmlMapper();
61
62
63 private static final String DEFAULT_CHARSET = "windows-1252";
64
65 private static final int META_TAG_BUFFER_SIZE = 8192;
66 private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile(
67 "(?is)<meta\\s+http-equiv\\s*=\\s*['\\\"]\\s*" +
68 "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
69 "([^'\\\"]+)['\\\"]");
70
71 public Set<MediaType> getSupportedTypes(ParseContext context) {
72 return SUPPORTED_TYPES;
73 }
74
75
76
77
78
79
80
81 private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
82 stream.mark(META_TAG_BUFFER_SIZE);
83 char[] buffer = new char[META_TAG_BUFFER_SIZE];
84 InputStreamReader isr = new InputStreamReader(stream, "us-ascii");
85 int bufferSize = isr.read(buffer);
86 stream.reset();
87
88 if (bufferSize != -1) {
89 String metaString = new String(buffer, 0, bufferSize);
90 Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString);
91 if (m.find()) {
92
93
94 String[] attrs = m.group(1).split(";");
95 for (String attr : attrs) {
96 String[] keyValue = attr.trim().split("=");
97 if ((keyValue.length == 2) && keyValue[0].equalsIgnoreCase("charset")) {
98 String charset = keyValue[1];
99 if (Charset.isSupported(charset)) {
100 metadata.set(Metadata.CONTENT_ENCODING, charset);
101 return charset;
102 }
103 }
104 }
105 }
106 }
107
108
109
110 CharsetDetector detector = new CharsetDetector();
111 String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
112 String incomingType = metadata.get(Metadata.CONTENT_TYPE);
113 if (incomingCharset == null && incomingType != null) {
114
115 MediaType mt = MediaType.parse(incomingType);
116 if (mt != null) {
117 String charset = mt.getParameters().get("charset");
118 if ((charset != null) && Charset.isSupported(charset)) {
119 incomingCharset = charset;
120 }
121 }
122 }
123
124 if (incomingCharset != null) {
125 detector.setDeclaredEncoding(incomingCharset);
126 }
127
128
129
130 detector.enableInputFilter(true);
131 detector.setText(stream);
132 for (CharsetMatch match : detector.detectAll()) {
133 if (Charset.isSupported(match.getName())) {
134 metadata.set(Metadata.CONTENT_ENCODING, match.getName());
135
136
137
138
139
140
141
142
143
144
145
146
147
148 break;
149 }
150 }
151
152 String encoding = metadata.get(Metadata.CONTENT_ENCODING);
153 if (encoding == null) {
154 if (Charset.isSupported(DEFAULT_CHARSET)) {
155 encoding = DEFAULT_CHARSET;
156 } else {
157 encoding = Charset.defaultCharset().name();
158 }
159
160 metadata.set(Metadata.CONTENT_ENCODING, encoding);
161 }
162
163 return encoding;
164 }
165
166 public void parse(
167 InputStream stream, ContentHandler handler,
168 Metadata metadata, ParseContext context)
169 throws IOException, SAXException, TikaException {
170
171 if (!stream.markSupported()) {
172 stream = new BufferedInputStream(stream);
173 }
174
175
176
177 stream = new CloseShieldInputStream(stream);
178
179
180 InputSource source = new InputSource(stream);
181 source.setEncoding(getEncoding(stream, metadata));
182
183
184 HtmlMapper mapper =
185 context.get(HtmlMapper.class, new HtmlParserMapper());
186
187
188 org.ccil.cowan.tagsoup.Parser parser =
189 new org.ccil.cowan.tagsoup.Parser();
190 parser.setContentHandler(new XHTMLDowngradeHandler(
191 new HtmlHandler(mapper, handler, metadata)));
192 parser.parse(source);
193 }
194
195
196
197
198 public void parse(
199 InputStream stream, ContentHandler handler, Metadata metadata)
200 throws IOException, SAXException, TikaException {
201 parse(stream, handler, metadata, new ParseContext());
202 }
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221 protected String mapSafeElement(String name) {
222 return mapper.mapSafeElement(name);
223 }
224
225
226
227
228
229
230
231
232
233
234
235
236
237 protected boolean isDiscardElement(String name) {
238 return "STYLE".equals(name) || "SCRIPT".equals(name);
239 }
240
241
242
243
244
245
246
247
248
249
250 private class HtmlParserMapper implements HtmlMapper {
251 public String mapSafeElement(String name) {
252 return HtmlParser.this.mapSafeElement(name);
253 }
254 public boolean isDiscardElement(String name) {
255 return HtmlParser.this.isDiscardElement(name);
256 }
257 }
258
259 }