1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.txt;
18  
19  import java.io.ByteArrayInputStream;
20  import java.io.StringWriter;
21  
22  import org.apache.tika.metadata.Metadata;
23  import org.apache.tika.parser.ParseContext;
24  import org.apache.tika.parser.Parser;
25  import org.apache.tika.sax.BodyContentHandler;
26  import org.apache.tika.sax.WriteOutContentHandler;
27  import org.xml.sax.ContentHandler;
28  
29  import junit.framework.TestCase;
30  
31  public class TXTParserTest extends TestCase {
32  
33      private Parser parser = new TXTParser();
34  
35      public void testEnglishText() throws Exception {
36          String text =
37              "Hello, World! This is simple UTF-8 text content written"
38              + " in English to test autodetection of both the character"
39              + " encoding and the language of the input stream.";
40  
41          Metadata metadata = new Metadata();
42          StringWriter writer = new StringWriter();
43          parser.parse(
44                  new ByteArrayInputStream(text.getBytes("UTF-8")),
45                  new WriteOutContentHandler(writer),
46                  metadata,
47                  new ParseContext());
48          String content = writer.toString();
49  
50          assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
51          assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
52          assertEquals("en", metadata.get(Metadata.LANGUAGE));
53          // TODO: ICU reports the content encoding as ISO-8859-1, even though
54          // it could just as well be ASCII or UTF-8, so  for now we won't
55          // test for the Metadata.CONTENT_ENCODING field
56  
57          assertTrue(content.contains("Hello"));
58          assertTrue(content.contains("World"));
59          assertTrue(content.contains("autodetection"));
60          assertTrue(content.contains("stream"));
61      }
62  
63      public void testUTF8Text() throws Exception {
64          String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
65  
66          ContentHandler handler = new BodyContentHandler();
67          Metadata metadata = new Metadata();
68          parser.parse(
69                  new ByteArrayInputStream(text.getBytes("UTF-8")),
70                  handler, metadata, new ParseContext());
71          assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
72          assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
73  
74          assertTrue(handler.toString().contains(text));
75      }
76  
77      public void testEmptyText() throws Exception {
78          ContentHandler handler = new BodyContentHandler();
79          Metadata metadata = new Metadata();
80          parser.parse(
81                  new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
82          assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
83          assertEquals("\n", handler.toString());
84      }
85  
86      /**
87       * Test case for TIKA-240: Drop the BOM when extracting plain text
88       *
89       * @see <a href="https://issues.apache.org/jira/browse/TIKA-240">TIKA-240</a> 
90       */
91      public void testDropByteOrderMark() throws Exception {
92          assertExtractText("UTF-8 BOM", "test", new byte[] {
93                  (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't' });
94          assertExtractText("UTF-16 BE BOM", "test", new byte[] {
95                  (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
96          assertExtractText("UTF-16 LE BOM", "test", new byte[] {
97                  (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
98      }
99  
100     /**
101      * Test case for TIKA-335: using incoming charset
102      *
103      * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a> 
104      */
105     public void testUseIncomingCharsetAsHint() throws Exception {
106         // Could be UTF-8 or ISO 8859-1 or ...
107         // u00e1 is latin small letter a with acute
108         final String test2 = "the name is \u00e1ndre";
109 
110         Metadata metadata = new Metadata();
111         parser.parse(
112                 new ByteArrayInputStream(test2.getBytes("UTF-8")),
113                 new BodyContentHandler(),  metadata, new ParseContext());
114         
115         assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
116 
117         metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
118         parser.parse(
119                 new ByteArrayInputStream(test2.getBytes("UTF-8")),
120                 new BodyContentHandler(),  metadata, new ParseContext());
121         
122         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
123     }
124 
125     /**
126      * Test case for TIKA-341: using charset in content-type
127      *
128      * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a> 
129      */
130     public void testUsingCharsetInContentTypeHeader() throws Exception {
131         // Could be UTF-8 or ISO 8859-1 or ...
132         // u00e1 is latin small letter a with acute
133         final String test2 = "the name is \u00e1ndre";
134 
135         Metadata metadata = new Metadata();
136         parser.parse(
137                 new ByteArrayInputStream(test2.getBytes("UTF-8")),
138                 new BodyContentHandler(),  metadata, new ParseContext());
139 
140         assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
141 
142         metadata = new Metadata();
143         metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
144         parser.parse(
145                 new ByteArrayInputStream(test2.getBytes("UTF-8")),
146                 new BodyContentHandler(),  metadata, new ParseContext());
147 
148         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
149     }
150 
151     private void assertExtractText(String msg, String expected, byte[] input)
152             throws Exception {
153         ContentHandler handler = new BodyContentHandler() {
154             public void ignorableWhitespace(char[] ch, int off, int len) {
155                 // Ignore the whitespace added by XHTMLContentHandler
156             }
157         };
158         Metadata metadata = new Metadata();
159         parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
160         assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
161         assertEquals(msg, expected, handler.toString());
162     }
163 
164     /**
165      * Test case for TIKA-339: don't override incoming language
166      *
167      * @see <a href="https://issues.apache.org/jira/browse/TIKA-335">TIKA-335</a> 
168      */
169     public void testRetainIncomingLanguage() throws Exception {
170         final String test = "Simple Content";
171 
172         Metadata metadata = new Metadata();
173         metadata.set(Metadata.LANGUAGE, "en");
174 
175         parser.parse(
176                 new ByteArrayInputStream(test.getBytes("UTF-8")),
177                 new BodyContentHandler(),  metadata, new ParseContext());
178 
179         assertEquals("en", metadata.get(Metadata.LANGUAGE));
180     }
181 
182 }