1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.sax.BodyContentHandler;
25 import org.xml.sax.ContentHandler;
26
27 import junit.framework.TestCase;
28
29 public class AutoDetectParserTest extends TestCase {
30
31
32 private static final String RAW = "application/octet-stream";
33 private static final String EXCEL = "application/vnd.ms-excel";
34 private static final String HTML = "text/html";
35 private static final String PDF = "application/pdf";
36 private static final String POWERPOINT = "application/vnd.ms-powerpoint";
37 private static final String RTF = "application/rtf";
38 private static final String PLAINTEXT = "text/plain";
39 private static final String WORD = "application/msword";
40 private static final String XML = "application/xml";
41 private static final String OPENOFFICE
42 = "application/vnd.oasis.opendocument.text";
43
44
45
46
47
48
49
50 private void assertAutoDetect(TestParams tp) throws Exception {
51
52 InputStream input =
53 AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName);
54
55 if (input == null) {
56 fail("Could not open stream from specified resource: "
57 + tp.resourceRealName);
58 }
59
60 try {
61 Metadata metadata = new Metadata();
62 metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
63 metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
64 ContentHandler handler = new BodyContentHandler();
65 new AutoDetectParser().parse(input, handler, metadata);
66
67 assertEquals("Bad content type: " + tp,
68 tp.realType, metadata.get(Metadata.CONTENT_TYPE));
69
70 assertTrue("Expected content not found: " + tp,
71 handler.toString().contains(tp.expectedContentFragment));
72 } finally {
73 input.close();
74 }
75 }
76
77
78
79
80
81
82
83
84
85
86
87
88
89 private void assertAutoDetect(String resourceRealName,
90 String resourceStatedName,
91 String realType,
92 String statedType,
93 String expectedContentFragment)
94 throws Exception {
95
96 assertAutoDetect(new TestParams(resourceRealName, resourceStatedName,
97 realType, statedType, expectedContentFragment));
98 }
99
100 private void assertAutoDetect(
101 String resource, String type, String content) throws Exception {
102
103 resource = "/test-documents/" + resource;
104
105
106
107
108
109
110
111 final String wrongMimeType = RAW;
112 assertAutoDetect(resource, resource, type, type, content);
113 assertAutoDetect(resource, resource, type, null, content);
114 assertAutoDetect(resource, resource, type, wrongMimeType, content);
115
116 assertAutoDetect(resource, null, type, type, content);
117 assertAutoDetect(resource, null, type, null, content);
118 assertAutoDetect(resource, null, type, wrongMimeType, content);
119
120 final String badResource = "a.xyz";
121 assertAutoDetect(resource, badResource, type, type, content);
122 assertAutoDetect(resource, badResource, type, null, content);
123 assertAutoDetect(resource, badResource, type, wrongMimeType, content);
124 }
125
126 public void testEpub() throws Exception {
127 assertAutoDetect(
128 "testEPUB.epub", "application/epub+zip",
129 "The previous headings were subchapters");
130 }
131
132 public void testExcel() throws Exception {
133 assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet");
134 }
135
136 public void testHTML() throws Exception {
137 assertAutoDetect("testHTML.html", HTML, "Test Indexation Html");
138 }
139
140 public void testOpenOffice() throws Exception {
141 assertAutoDetect("testOpenOffice2.odt", OPENOFFICE,
142 "This is a sample Open Office document");
143 }
144
145 public void testPDF() throws Exception {
146 assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit");
147
148 }
149
150 public void testPowerpoint() throws Exception {
151 assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide");
152 }
153
154 public void testRdfXml() throws Exception {
155 assertAutoDetect("testRDF.rdf", "application/rdf+xml", "");
156 }
157
158 public void testRTF() throws Exception {
159 assertAutoDetect("testRTF.rtf", RTF, "indexation Word");
160 }
161
162 public void testText() throws Exception {
163 assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
164 }
165
166 public void testWord() throws Exception {
167 assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");
168 }
169
170 public void testXML() throws Exception {
171 assertAutoDetect("testXML.xml", XML, "Lius");
172 }
173
174
175
176
177
178
179 public void testZipBombPrevention() throws Exception {
180 InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
181 "/test-documents/TIKA-216.tgz");
182 try {
183 Metadata metadata = new Metadata();
184 ContentHandler handler = new BodyContentHandler(-1);
185 new AutoDetectParser().parse(tgz, handler, metadata);
186 fail("Zip bomb was not detected");
187 } catch (TikaException e) {
188
189 } finally {
190 tgz.close();
191 }
192
193 }
194
195
196
197
198
199
200
201 private static class TestParams {
202
203 public String resourceRealName;
204 public String resourceStatedName;
205 public String realType;
206 public String statedType;
207 public String expectedContentFragment;
208
209
210 private TestParams(String resourceRealName,
211 String resourceStatedName,
212 String realType,
213 String statedType,
214 String expectedContentFragment) {
215 this.resourceRealName = resourceRealName;
216 this.resourceStatedName = resourceStatedName;
217 this.realType = realType;
218 this.statedType = statedType;
219 this.expectedContentFragment = expectedContentFragment;
220 }
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235 public String toString() {
236 return "Test parameters:\n"
237 + " resourceRealName = " + resourceRealName + "\n"
238 + " resourceStatedName = " + resourceStatedName + "\n"
239 + " realType = " + realType + "\n"
240 + " statedType = " + statedType + "\n"
241 + " expectedContentFragment = " + expectedContentFragment + "\n";
242 }
243 }
244 }