1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.html;
18
19 import java.io.ByteArrayInputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.StringWriter;
23 import java.util.ArrayList;
24 import java.util.List;
25
26 import junit.framework.TestCase;
27
28 import org.apache.tika.Tika;
29 import org.apache.tika.exception.TikaException;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.parser.ParseContext;
32 import org.apache.tika.sax.BodyContentHandler;
33 import org.apache.tika.sax.TeeContentHandler;
34 import org.xml.sax.Attributes;
35 import org.xml.sax.ContentHandler;
36 import org.xml.sax.SAXException;
37 import org.xml.sax.helpers.DefaultHandler;
38
39 public class HtmlParserTest extends TestCase {
40
41 public void testParseAscii() throws Exception {
42 String path = "/test-documents/testHTML.html";
43 final StringWriter href = new StringWriter();
44 final StringWriter name = new StringWriter();
45 ContentHandler body = new BodyContentHandler();
46 Metadata metadata = new Metadata();
47 InputStream stream = HtmlParserTest.class.getResourceAsStream(path);
48 try {
49 ContentHandler link = new DefaultHandler() {
50 @Override
51 public void startElement(
52 String u, String l, String n, Attributes a)
53 throws SAXException {
54 if ("a".equals(l)) {
55 if (a.getValue("href") != null) {
56 href.append(a.getValue("href"));
57 } else if (a.getValue("name") != null) {
58 name.append(a.getValue("name"));
59 }
60 }
61 }
62 };
63 new HtmlParser().parse(
64 stream, new TeeContentHandler(body, link),
65 metadata, new ParseContext());
66 } finally {
67 stream.close();
68 }
69
70 assertEquals(
71 "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
72 assertEquals("Tika Developers", metadata.get("Author"));
73 assertEquals("5", metadata.get("refresh"));
74
75 assertEquals("http://www.apache.org/", href.toString());
76 assertEquals("test-anchor", name.toString());
77
78 String content = body.toString();
79 assertTrue(
80 "Did not contain expected text:" + "Test Indexation Html",
81 content.contains("Test Indexation Html"));
82 assertTrue(
83 "Did not contain expected text:" + "Indexation du fichier",
84 content.contains("Indexation du fichier"));
85 }
86
87 public void XtestParseUTF8() throws IOException, SAXException, TikaException {
88 String path = "/test-documents/testXHTML_utf8.html";
89 Metadata metadata = new Metadata();
90 String content = new Tika().parseToString(
91 HtmlParserTest.class.getResourceAsStream(path), metadata);
92
93 assertTrue("Did not contain expected text:"
94 + "Title : Tilte with UTF-8 chars öäå", content
95 .contains("Title : Tilte with UTF-8 chars öäå"));
96
97 assertTrue("Did not contain expected text:"
98 + "Content with UTF-8 chars", content
99 .contains("Content with UTF-8 chars"));
100
101 assertTrue("Did not contain expected text:" + "åäö", content
102 .contains("åäö"));
103 }
104
105 public void testXhtmlParsing() throws Exception {
106 String path = "/test-documents/testXHTML.html";
107 Metadata metadata = new Metadata();
108 String content = new Tika().parseToString(
109 HtmlParserTest.class.getResourceAsStream(path), metadata);
110
111 assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
112 assertEquals("XHTML test document", metadata.get(Metadata.TITLE));
113
114 assertEquals("Tika Developers", metadata.get("Author"));
115 assertEquals("5", metadata.get("refresh"));
116 assertTrue(content.contains("ability of Apache Tika"));
117 assertTrue(content.contains("extract content"));
118 assertTrue(content.contains("an XHTML document"));
119 }
120
121 public void testParseEmpty() throws Exception {
122 ContentHandler handler = new BodyContentHandler();
123 new HtmlParser().parse(
124 new ByteArrayInputStream(new byte[0]),
125 handler, new Metadata(), new ParseContext());
126 assertEquals("", handler.toString());
127 }
128
129
130
131
132
133 public void testCharactersDirectlyUnderBodyElement() throws Exception {
134 String test = "<html><body>test</body></html>";
135 String content = new Tika().parseToString(
136 new ByteArrayInputStream(test.getBytes("UTF-8")));
137 assertEquals("test", content);
138 }
139
140
141
142
143
144 public void testBaseHref() throws Exception {
145 assertRelativeLink(
146 "http://lucene.apache.org/tika/",
147 "http://lucene.apache.org/", "tika/");
148
149 assertRelativeLink(
150 "http://domain.com/?pid=1",
151 "http://domain.com", "?pid=1");
152 assertRelativeLink(
153 "http://domain.com/?pid=2",
154 "http://domain.com?pid=1", "?pid=2");
155
156 assertRelativeLink(
157 "http://domain.com/file.html",
158 "http://domain.com/path/", "/file.html");
159 assertRelativeLink(
160 "http://domain.com/path/file.html",
161 "http://domain.com/path/", "./file.html");
162 assertRelativeLink(
163 "http://domain.com/path/file.html",
164 "http://domain.com/path/", "file.html");
165
166 assertRelativeLink(
167 "http://domain2.com/newpath",
168 "http://domain.com/path/to/file", "http://domain2.com/newpath");
169
170
171
172
173
174 assertRelativeLink(
175 "http://domain.com/path/?pid=1",
176 "http://domain.com/path/", "?pid=1");
177 assertRelativeLink(
178 "http://domain.com/file?pid=1",
179 "http://domain.com/file", "?pid=1");
180 assertRelativeLink(
181 "http://domain.com/path/d;p?pid=1",
182 "http://domain.com/path/d;p?q#f", "?pid=1");
183 }
184
185 private void assertRelativeLink(String url, String base, String relative)
186 throws Exception {
187 String test =
188 "<html><head><base href=\"" + base + "\"></head>"
189 + "<body><a href=\"" + relative + "\">test</a></body></html>";
190 final List<String> links = new ArrayList<String>();
191 new HtmlParser().parse(
192 new ByteArrayInputStream(test.getBytes("UTF-8")),
193 new DefaultHandler() {
194 @Override
195 public void startElement(
196 String u, String l, String name, Attributes atts) {
197 if (atts.getValue("", "href") != null) {
198 links.add(atts.getValue("", "href"));
199 }
200 }
201 },
202 new Metadata(),
203 new ParseContext());
204 assertEquals(1, links.size());
205 assertEquals(url, links.get(0));
206 }
207
208
209
210
211
212 public void testWhitespaceBetweenTableCells() throws Exception {
213 String test =
214 "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
215 String content = new Tika().parseToString(
216 new ByteArrayInputStream(test.getBytes("UTF-8")));
217 assertTrue(content.contains("a"));
218 assertTrue(content.contains("b"));
219 assertFalse(content.contains("ab"));
220 }
221
222
223
224
225
226 public void testHttpEquivCharset() throws Exception {
227 String test =
228 "<html><head><meta http-equiv=\"content-type\""
229 + " content=\"text/html; charset=ISO-8859-1\" />"
230 + "<title>the name is \u00e1ndre</title>"
231 + "</head><body></body></html>";
232 Metadata metadata = new Metadata();
233 new HtmlParser().parse (
234 new ByteArrayInputStream(test.getBytes("UTF-8")),
235 new BodyContentHandler(), metadata, new ParseContext());
236 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
237 }
238
239
240
241
242
243 public void testDetectOfCharset() throws Exception {
244 String test =
245 "<html><head><title>\u017d</title></head><body></body></html>";
246 Metadata metadata = new Metadata();
247 new HtmlParser().parse (
248 new ByteArrayInputStream(test.getBytes("UTF-8")),
249 new BodyContentHandler(), metadata, new ParseContext());
250 assertEquals("\u017d", metadata.get(Metadata.TITLE));
251 }
252
253
254
255
256
257 public void testUsingCharsetInContentTypeHeader() throws Exception {
258 final String test =
259 "<html><head><title>the name is \u00e1ndre</title></head>"
260 + "<body></body></html>";
261
262 Metadata metadata = new Metadata();
263 new HtmlParser().parse (
264 new ByteArrayInputStream(test.getBytes("UTF-8")),
265 new BodyContentHandler(), metadata, new ParseContext());
266 assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
267
268 metadata = new Metadata();
269 metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
270 new HtmlParser().parse (
271 new ByteArrayInputStream(test.getBytes("UTF-8")),
272 new BodyContentHandler(), metadata, new ParseContext());
273 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
274 }
275
276
277
278
279
280
281
282
283
284 public void testLineBreak() throws Exception {
285 String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
286 String text = new Tika().parseToString(
287 new ByteArrayInputStream(test.getBytes("US-ASCII")));
288 String[] parts = text.trim().split("\\s+");
289 assertEquals(3, parts.length);
290 assertEquals("foo", parts[0]);
291 assertEquals("bar", parts[1]);
292 assertEquals("baz", parts[2]);
293 }
294
295
296
297
298
299 public void testIgnoreCharsetDetectorLanguage() throws Exception {
300 String test = "<html><title>Simple Content</title><body></body></html>";
301 Metadata metadata = new Metadata();
302 metadata.add(Metadata.CONTENT_LANGUAGE, "en");
303 new HtmlParser().parse (
304 new ByteArrayInputStream(test.getBytes("UTF-8")),
305 new BodyContentHandler(), metadata, new ParseContext());
306
307 assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
308 }
309
310
311
312
313
314 public void testHttpEquivCharsetFunkyAttributes() throws Exception {
315 String test1 =
316 "<html><head><meta http-equiv=\"content-type\""
317 + " content=\"text/html; charset=ISO-8859-1; charset=iso-8859-1\" />"
318 + "<title>the name is \u00e1ndre</title>"
319 + "</head><body></body></html>";
320 Metadata metadata = new Metadata();
321 new HtmlParser().parse (
322 new ByteArrayInputStream(test1.getBytes("UTF-8")),
323 new BodyContentHandler(), metadata, new ParseContext());
324 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
325
326
327 String test2 =
328 "<html><head><meta http-equiv=\"content-type\""
329 + " content=\"text/html;;charset=ISO-8859-1\" />"
330 + "<title>the name is \u00e1ndre</title>"
331 + "</head><body></body></html>";
332 metadata = new Metadata();
333 new HtmlParser().parse (
334 new ByteArrayInputStream(test2.getBytes("UTF-8")),
335 new BodyContentHandler(), metadata, new ParseContext());
336 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
337 }
338
339
340
341
342
343 public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
344 final String test =
345 "<html><head><title>the name is \u00e1ndre</title></head>"
346 + "<body></body></html>";
347
348 Metadata metadata = new Metadata();
349 new HtmlParser().parse (
350 new ByteArrayInputStream(test.getBytes("UTF-8")),
351 new BodyContentHandler(), metadata, new ParseContext());
352 assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
353
354 metadata = new Metadata();
355 metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
356 new HtmlParser().parse (
357 new ByteArrayInputStream(test.getBytes("UTF-8")),
358 new BodyContentHandler(), metadata, new ParseContext());
359 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
360 }
361
362
363
364
365
366
367 public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
368 String path = "/test-documents/big-preamble.html";
369 Metadata metadata = new Metadata();
370 new HtmlParser().parse(
371 HtmlParserTest.class.getResourceAsStream(path),
372 new BodyContentHandler(), metadata, new ParseContext());
373
374 assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
375 }
376
377 }