1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.txt;
18
19 import java.io.ByteArrayInputStream;
20 import java.io.StringWriter;
21
22 import org.apache.tika.metadata.Metadata;
23 import org.apache.tika.parser.ParseContext;
24 import org.apache.tika.parser.Parser;
25 import org.apache.tika.sax.BodyContentHandler;
26 import org.apache.tika.sax.WriteOutContentHandler;
27 import org.xml.sax.ContentHandler;
28
29 import junit.framework.TestCase;
30
31 public class TXTParserTest extends TestCase {
32
33 private Parser parser = new TXTParser();
34
35 public void testEnglishText() throws Exception {
36 String text =
37 "Hello, World! This is simple UTF-8 text content written"
38 + " in English to test autodetection of both the character"
39 + " encoding and the language of the input stream.";
40
41 Metadata metadata = new Metadata();
42 StringWriter writer = new StringWriter();
43 parser.parse(
44 new ByteArrayInputStream(text.getBytes("UTF-8")),
45 new WriteOutContentHandler(writer),
46 metadata,
47 new ParseContext());
48 String content = writer.toString();
49
50 assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
51 assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
52 assertEquals("en", metadata.get(Metadata.LANGUAGE));
53
54
55
56
57 assertTrue(content.contains("Hello"));
58 assertTrue(content.contains("World"));
59 assertTrue(content.contains("autodetection"));
60 assertTrue(content.contains("stream"));
61 }
62
63 public void testUTF8Text() throws Exception {
64 String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";
65
66 ContentHandler handler = new BodyContentHandler();
67 Metadata metadata = new Metadata();
68 parser.parse(
69 new ByteArrayInputStream(text.getBytes("UTF-8")),
70 handler, metadata, new ParseContext());
71 assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
72 assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
73
74 assertTrue(handler.toString().contains(text));
75 }
76
77 public void testEmptyText() throws Exception {
78 ContentHandler handler = new BodyContentHandler();
79 Metadata metadata = new Metadata();
80 parser.parse(
81 new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
82 assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
83 assertEquals("\n", handler.toString());
84 }
85
86
87
88
89
90
91 public void testDropByteOrderMark() throws Exception {
92 assertExtractText("UTF-8 BOM", "test", new byte[] {
93 (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't' });
94 assertExtractText("UTF-16 BE BOM", "test", new byte[] {
95 (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
96 assertExtractText("UTF-16 LE BOM", "test", new byte[] {
97 (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
98 }
99
100
101
102
103
104
105 public void testUseIncomingCharsetAsHint() throws Exception {
106
107
108 final String test2 = "the name is \u00e1ndre";
109
110 Metadata metadata = new Metadata();
111 parser.parse(
112 new ByteArrayInputStream(test2.getBytes("UTF-8")),
113 new BodyContentHandler(), metadata, new ParseContext());
114
115 assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
116
117 metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
118 parser.parse(
119 new ByteArrayInputStream(test2.getBytes("UTF-8")),
120 new BodyContentHandler(), metadata, new ParseContext());
121
122 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
123 }
124
125
126
127
128
129
130 public void testUsingCharsetInContentTypeHeader() throws Exception {
131
132
133 final String test2 = "the name is \u00e1ndre";
134
135 Metadata metadata = new Metadata();
136 parser.parse(
137 new ByteArrayInputStream(test2.getBytes("UTF-8")),
138 new BodyContentHandler(), metadata, new ParseContext());
139
140 assertEquals("UTF-8", metadata.get(Metadata.CONTENT_ENCODING));
141
142 metadata = new Metadata();
143 metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
144 parser.parse(
145 new ByteArrayInputStream(test2.getBytes("UTF-8")),
146 new BodyContentHandler(), metadata, new ParseContext());
147
148 assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
149 }
150
151 private void assertExtractText(String msg, String expected, byte[] input)
152 throws Exception {
153 ContentHandler handler = new BodyContentHandler() {
154 public void ignorableWhitespace(char[] ch, int off, int len) {
155
156 }
157 };
158 Metadata metadata = new Metadata();
159 parser.parse(new ByteArrayInputStream(input), handler, metadata, new ParseContext());
160 assertEquals("text/plain", metadata.get(Metadata.CONTENT_TYPE));
161 assertEquals(msg, expected, handler.toString());
162 }
163
164
165
166
167
168
169 public void testRetainIncomingLanguage() throws Exception {
170 final String test = "Simple Content";
171
172 Metadata metadata = new Metadata();
173 metadata.set(Metadata.LANGUAGE, "en");
174
175 parser.parse(
176 new ByteArrayInputStream(test.getBytes("UTF-8")),
177 new BodyContentHandler(), metadata, new ParseContext());
178
179 assertEquals("en", metadata.get(Metadata.LANGUAGE));
180 }
181
182 }