1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.html;
18  
19  import java.io.ByteArrayInputStream;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.StringWriter;
23  import java.util.ArrayList;
24  import java.util.List;
25  
26  import junit.framework.TestCase;
27  
28  import org.apache.tika.Tika;
29  import org.apache.tika.exception.TikaException;
30  import org.apache.tika.metadata.Metadata;
31  import org.apache.tika.parser.ParseContext;
32  import org.apache.tika.sax.BodyContentHandler;
33  import org.apache.tika.sax.TeeContentHandler;
34  import org.xml.sax.Attributes;
35  import org.xml.sax.ContentHandler;
36  import org.xml.sax.SAXException;
37  import org.xml.sax.helpers.DefaultHandler;
38  
39  public class HtmlParserTest extends TestCase {
40  
41      public void testParseAscii() throws Exception {
42          String path = "/test-documents/testHTML.html";
43          final StringWriter href = new StringWriter();
44          final StringWriter name = new StringWriter();
45          ContentHandler body = new BodyContentHandler();
46          Metadata metadata = new Metadata();
47          InputStream stream = HtmlParserTest.class.getResourceAsStream(path);
48          try {
49              ContentHandler link = new DefaultHandler() {
50                  @Override
51                  public void startElement(
52                          String u, String l, String n, Attributes a)
53                          throws SAXException {
54                      if ("a".equals(l)) {
55                          if (a.getValue("href") != null) {
56                              href.append(a.getValue("href"));
57                          } else if (a.getValue("name") != null) {
58                              name.append(a.getValue("name"));
59                          }
60                      }
61                  }
62              };
63              new HtmlParser().parse(
64                      stream, new TeeContentHandler(body, link),
65                      metadata, new ParseContext());
66          } finally {
67              stream.close();
68          }
69  
70          assertEquals(
71                  "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
72          assertEquals("Tika Developers", metadata.get("Author"));
73          assertEquals("5", metadata.get("refresh"));
74  
75          assertEquals("http://www.apache.org/", href.toString());
76          assertEquals("test-anchor", name.toString());
77  
78          String content = body.toString();
79          assertTrue(
80                  "Did not contain expected text:" + "Test Indexation Html",
81                  content.contains("Test Indexation Html"));
82          assertTrue(
83                  "Did not contain expected text:" + "Indexation du fichier",
84                  content.contains("Indexation du fichier"));
85      }
86  
87      public void XtestParseUTF8() throws IOException, SAXException, TikaException {
88          String path = "/test-documents/testXHTML_utf8.html";
89          Metadata metadata = new Metadata();
90          String content = new Tika().parseToString(
91                  HtmlParserTest.class.getResourceAsStream(path), metadata);
92  
93          assertTrue("Did not contain expected text:"
94                  + "Title : Tilte with UTF-8 chars öäå", content
95                  .contains("Title : Tilte with UTF-8 chars öäå"));
96  
97          assertTrue("Did not contain expected text:"
98                  + "Content with UTF-8 chars", content
99                  .contains("Content with UTF-8 chars"));
100 
101         assertTrue("Did not contain expected text:" + "åäö", content
102                 .contains("åäö"));
103     }
104 
105     public void testXhtmlParsing() throws Exception {
106         String path = "/test-documents/testXHTML.html";
107         Metadata metadata = new Metadata();
108         String content = new Tika().parseToString(
109                 HtmlParserTest.class.getResourceAsStream(path), metadata);
110 
111         assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
112         assertEquals("XHTML test document", metadata.get(Metadata.TITLE));
113 
114         assertEquals("Tika Developers", metadata.get("Author"));
115         assertEquals("5", metadata.get("refresh"));
116         assertTrue(content.contains("ability of Apache Tika"));
117         assertTrue(content.contains("extract content"));
118         assertTrue(content.contains("an XHTML document"));
119     }
120 
121     public void testParseEmpty() throws Exception {
122         ContentHandler handler = new BodyContentHandler();
123         new HtmlParser().parse(
124                 new ByteArrayInputStream(new byte[0]),
125                 handler,  new Metadata(), new ParseContext());
126         assertEquals("", handler.toString());
127     }
128 
129     /**
130      * Test case for TIKA-210
131      * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
132      */
133     public void testCharactersDirectlyUnderBodyElement() throws Exception {
134         String test = "<html><body>test</body></html>";
135         String content = new Tika().parseToString(
136                 new ByteArrayInputStream(test.getBytes("UTF-8")));
137         assertEquals("test", content);
138     }
139 
140     /**
141      * Test case for TIKA-287
142      * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a>
143      */
144     public void testBaseHref() throws Exception {
145         assertRelativeLink(
146                 "http://lucene.apache.org/tika/",
147                 "http://lucene.apache.org/", "tika/");
148 
149         assertRelativeLink(
150                 "http://domain.com/?pid=1",
151                 "http://domain.com", "?pid=1");
152         assertRelativeLink(
153                 "http://domain.com/?pid=2",
154                 "http://domain.com?pid=1", "?pid=2");
155 
156         assertRelativeLink(
157                 "http://domain.com/file.html",
158                 "http://domain.com/path/", "/file.html");
159         assertRelativeLink(
160                 "http://domain.com/path/file.html",
161                 "http://domain.com/path/", "./file.html");
162         assertRelativeLink(
163                 "http://domain.com/path/file.html",
164                 "http://domain.com/path/", "file.html");
165 
166         assertRelativeLink(
167                 "http://domain2.com/newpath",
168                 "http://domain.com/path/to/file", "http://domain2.com/newpath");
169 
170         // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx
171         // Also http://www.ietf.org/rfc/rfc3986.txt
172         // Also http://issues.apache.org/jira/browse/NUTCH-566
173         // Also http://issues.apache.org/jira/browse/NUTCH-436
174         assertRelativeLink(
175                 "http://domain.com/path/?pid=1",
176                 "http://domain.com/path/", "?pid=1");
177         assertRelativeLink(
178                 "http://domain.com/file?pid=1",
179                 "http://domain.com/file", "?pid=1");
180         assertRelativeLink(
181                 "http://domain.com/path/d;p?pid=1",
182                 "http://domain.com/path/d;p?q#f", "?pid=1");
183     }
184 
185     private void assertRelativeLink(String url, String base, String relative)
186             throws Exception {
187         String test =
188             "<html><head><base href=\"" + base + "\"></head>"
189             + "<body><a href=\"" + relative + "\">test</a></body></html>";
190         final List<String> links = new ArrayList<String>();
191         new HtmlParser().parse(
192                 new ByteArrayInputStream(test.getBytes("UTF-8")),
193                 new DefaultHandler() {
194                     @Override
195                     public void startElement(
196                             String u, String l, String name, Attributes atts) {
197                         if (atts.getValue("", "href") != null) {
198                             links.add(atts.getValue("", "href"));
199                         }
200                     }
201                 },
202                 new Metadata(),
203                 new ParseContext());
204         assertEquals(1, links.size());
205         assertEquals(url, links.get(0));
206     }
207 
208     /**
209      * Test case for TIKA-268
210      * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
211      */
212     public void testWhitespaceBetweenTableCells() throws Exception {
213         String test =
214             "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
215         String content = new Tika().parseToString(
216                 new ByteArrayInputStream(test.getBytes("UTF-8")));
217         assertTrue(content.contains("a"));
218         assertTrue(content.contains("b"));
219         assertFalse(content.contains("ab"));
220     }
221 
222     /**
223      * Test case for TIKA-332
224      * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a>
225      */
226     public void testHttpEquivCharset() throws Exception {
227         String test =
228             "<html><head><meta http-equiv=\"content-type\""
229             + " content=\"text/html; charset=ISO-8859-1\" />"
230             + "<title>the name is \u00e1ndre</title>"
231             + "</head><body></body></html>";
232         Metadata metadata = new Metadata();
233         new HtmlParser().parse (
234                 new ByteArrayInputStream(test.getBytes("UTF-8")),
235                 new BodyContentHandler(),  metadata, new ParseContext());
236         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
237     }
238 
239     /**
240      * Test case for TIKA-334
241      * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
242      */
243     public void testDetectOfCharset() throws Exception {
244         String test =
245             "<html><head><title>\u017d</title></head><body></body></html>";
246         Metadata metadata = new Metadata();
247         new HtmlParser().parse (
248                 new ByteArrayInputStream(test.getBytes("UTF-8")),
249                 new BodyContentHandler(),  metadata, new ParseContext());
250         assertEquals("\u017d", metadata.get(Metadata.TITLE));
251     }
252 
253     /**
254      * Test case for TIKA-341
255      * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
256      */
257     public void testUsingCharsetInContentTypeHeader() throws Exception {
258         final String test =
259             "<html><head><title>the name is \u00e1ndre</title></head>"
260             + "<body></body></html>";
261 
262         Metadata metadata = new Metadata();
263         new HtmlParser().parse (
264                 new ByteArrayInputStream(test.getBytes("UTF-8")),
265                 new BodyContentHandler(),  metadata, new ParseContext());
266         assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
267 
268         metadata = new Metadata();
269         metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
270         new HtmlParser().parse (
271                 new ByteArrayInputStream(test.getBytes("UTF-8")),
272                 new BodyContentHandler(),  metadata, new ParseContext());
273         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
274     }
275 
276     /**
277      * Test case for HTML content like
278      * "&gt;div&lt;foo&gt;br&lt;bar&gt;/div&gt;" that should result
279      * in three whitespace-separated tokens "foo", "bar" and "baz" instead
280      * of a single token "foobarbaz".
281      *
282      * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
283      */
284     public void testLineBreak() throws Exception {
285         String test = "<html><body><div>foo<br>bar</div>baz</body></html>";
286         String text = new Tika().parseToString(
287                 new ByteArrayInputStream(test.getBytes("US-ASCII")));
288         String[] parts = text.trim().split("\\s+");
289         assertEquals(3, parts.length);
290         assertEquals("foo", parts[0]);
291         assertEquals("bar", parts[1]);
292         assertEquals("baz", parts[2]);
293     }
294 
295     /**
296      * Test case for TIKA-339: Don't use language returned by CharsetDetector
297      * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
298      */
299     public void testIgnoreCharsetDetectorLanguage() throws Exception {
300         String test = "<html><title>Simple Content</title><body></body></html>";
301         Metadata metadata = new Metadata();
302         metadata.add(Metadata.CONTENT_LANGUAGE, "en");
303         new HtmlParser().parse (
304                 new ByteArrayInputStream(test.getBytes("UTF-8")),
305                 new BodyContentHandler(),  metadata, new ParseContext());
306 
307         assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
308     }
309 
310     /**
311      * Test case for TIKA-349
312      * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
313      */
314     public void testHttpEquivCharsetFunkyAttributes() throws Exception {
315         String test1 =
316             "<html><head><meta http-equiv=\"content-type\""
317             + " content=\"text/html; charset=ISO-8859-1; charset=iso-8859-1\" />"
318             + "<title>the name is \u00e1ndre</title>"
319             + "</head><body></body></html>";
320         Metadata metadata = new Metadata();
321         new HtmlParser().parse (
322                 new ByteArrayInputStream(test1.getBytes("UTF-8")),
323                 new BodyContentHandler(),  metadata, new ParseContext());
324         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
325 
326         // Some HTML pages have errors like ';;' versus '; ' as separator
327         String test2 =
328             "<html><head><meta http-equiv=\"content-type\""
329             + " content=\"text/html;;charset=ISO-8859-1\" />"
330             + "<title>the name is \u00e1ndre</title>"
331             + "</head><body></body></html>";
332         metadata = new Metadata();
333         new HtmlParser().parse (
334                 new ByteArrayInputStream(test2.getBytes("UTF-8")),
335                 new BodyContentHandler(),  metadata, new ParseContext());
336         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
337     }
338 
339     /**
340      * Test case for TIKA-350
341      * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
342      */
343     public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
344         final String test =
345             "<html><head><title>the name is \u00e1ndre</title></head>"
346             + "<body></body></html>";
347 
348         Metadata metadata = new Metadata();
349         new HtmlParser().parse (
350                 new ByteArrayInputStream(test.getBytes("UTF-8")),
351                 new BodyContentHandler(),  metadata, new ParseContext());
352         assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
353 
354         metadata = new Metadata();
355         metadata.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
356         new HtmlParser().parse (
357                 new ByteArrayInputStream(test.getBytes("UTF-8")),
358                 new BodyContentHandler(),  metadata, new ParseContext());
359         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
360     }
361 
362 
363     /**
364      * Test case for TIKA-357
365      * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a>
366      */
367     public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
368         String path = "/test-documents/big-preamble.html";
369         Metadata metadata = new Metadata();
370         new HtmlParser().parse(
371                 HtmlParserTest.class.getResourceAsStream(path),
372                 new BodyContentHandler(),  metadata, new ParseContext());
373 
374         assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
375     }
376 
377 }