1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika;
18
19 import java.io.File;
20 import java.io.FileInputStream;
21 import java.io.InputStream;
22 import java.net.URL;
23
24 import junit.framework.TestCase;
25
26 import org.apache.tika.config.TikaConfig;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.parser.Parser;
29 import org.apache.tika.utils.ParseUtils;
30 import org.xml.sax.helpers.DefaultHandler;
31
32
33
34
35 public class TestParsers extends TestCase {
36
37 private TikaConfig tc;
38
39 public void setUp() throws Exception {
40 tc = TikaConfig.getDefaultConfig();
41 }
42
43 public void testPDFExtraction() throws Exception {
44 File file = getResourceAsFile("/test-documents/testPDF.pdf");
45 String s1 = ParseUtils.getStringContent(file, tc);
46 String s2 = ParseUtils.getStringContent(file, tc, "application/pdf");
47 String s3 = ParseUtils.getStringContent(file, TikaConfig
48 .getDefaultConfig());
49 assertEquals(s1, s2);
50 assertEquals(s1, s3);
51 }
52
53 public void testTXTExtraction() throws Exception {
54 File file = getResourceAsFile("/test-documents/testTXT.txt");
55 String s1 = ParseUtils.getStringContent(file, tc);
56 String s2 = ParseUtils.getStringContent(file, tc, "text/plain");
57 assertEquals(s1, s2);
58 }
59
60 public void testRTFExtraction() throws Exception {
61 File file = getResourceAsFile("/test-documents/testRTF.rtf");
62 String s1 = ParseUtils.getStringContent(file, tc);
63 String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
64 assertEquals(s1, s2);
65 }
66
67 public void testXMLExtraction() throws Exception {
68 File file = getResourceAsFile("/test-documents/testXML.xml");
69 String s1 = ParseUtils.getStringContent(file, tc);
70 String s2 = ParseUtils.getStringContent(file, tc, "application/xml");
71 assertEquals(s1, s2);
72 }
73
74 public void testPPTExtraction() throws Exception {
75 File file = getResourceAsFile("/test-documents/testPPT.ppt");
76 String s1 = ParseUtils.getStringContent(file, tc);
77 String s2 = ParseUtils.getStringContent(file, tc,
78 "application/vnd.ms-powerpoint");
79 assertEquals(s1, s2);
80 Parser parser = tc.getParser("application/vnd.ms-powerpoint");
81 Metadata metadata = new Metadata();
82 InputStream stream = new FileInputStream(file);
83 try {
84 parser.parse(stream, new DefaultHandler(), metadata);
85 } finally {
86 stream.close();
87 }
88 assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
89 }
90
91 public void testWORDxtraction() throws Exception {
92 File file = getResourceAsFile("/test-documents/testWORD.doc");
93 String s1 = ParseUtils.getStringContent(file, tc);
94 String s2 = ParseUtils.getStringContent(file, tc, "application/msword");
95 assertEquals(s1, s2);
96 Parser parser = tc.getParser("application/msword");
97 Metadata metadata = new Metadata();
98 InputStream stream = new FileInputStream(file);
99 try {
100 parser.parse(stream, new DefaultHandler(), metadata);
101 } finally {
102 stream.close();
103 }
104 assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
105 }
106
107 public void testEXCELExtraction() throws Exception {
108 final String expected = "Numbers and their Squares";
109 File file = getResourceAsFile("/test-documents/testEXCEL.xls");
110 String s1 = ParseUtils.getStringContent(file, tc);
111 String s2 = ParseUtils.getStringContent(file, tc,
112 "application/vnd.ms-excel");
113 assertEquals(s1, s2);
114 assertTrue("Text does not contain '" + expected + "'", s1
115 .contains(expected));
116 Parser parser = tc.getParser("application/vnd.ms-excel");
117 Metadata metadata = new Metadata();
118 InputStream stream = new FileInputStream(file);
119 try {
120 parser.parse(stream, new DefaultHandler(), metadata);
121 } finally {
122 stream.close();
123 }
124 assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
125 }
126
127 public void testOOExtraction() throws Exception {
128 File file = getResourceAsFile("/test-documents/testOpenOffice2.odt");
129 String s1 = ParseUtils.getStringContent(file, tc);
130 String s2 = ParseUtils.getStringContent(file, tc,
131 "application/vnd.oasis.opendocument.text");
132 assertEquals(s1, s2);
133 }
134
135 public void testOutlookExtraction() throws Exception {
136 File file = getResourceAsFile("/test-documents/test-outlook.msg");
137 String s1 = ParseUtils.getStringContent(file, tc);
138 String s2 = ParseUtils.getStringContent(file, tc,
139 "application/vnd.ms-outlook");
140 assertEquals(s1, s2);
141 }
142
143 public void testHTMLExtraction() throws Exception {
144 File file = getResourceAsFile("/test-documents/testHTML.html");
145 String s1 = ParseUtils.getStringContent(file, tc);
146 String s2 = ParseUtils.getStringContent(file, tc, "text/html");
147 assertEquals(s1, s2);
148
149 Parser parser = tc.getParser("text/html");
150 assertNotNull(parser);
151 }
152
153 public void testZipFileExtraction() throws Exception {
154 File file = getResourceAsFile("/test-documents/test-documents.zip");
155 String s1 = ParseUtils.getStringContent(file, tc);
156 String s2 = ParseUtils.getStringContent(file, tc, "application/zip");
157 assertEquals(s1, s2);
158
159 Parser parser = tc.getParser("application/zip");
160 assertNotNull(parser);
161 }
162
163 public void testMP3Extraction() throws Exception {
164 File file = getResourceAsFile("/test-documents/testMP3id3v1.mp3");
165 String s1 = ParseUtils.getStringContent(file, tc);
166 String s2 = ParseUtils.getStringContent(file, tc, "audio/mpeg");
167 assertEquals(s1, s2);
168
169 Parser parser = tc.getParser("audio/mpeg");
170 assertNotNull(parser);
171 }
172
173
174
175
176
177
178
179
180
181
182
183 public String getFileResource(String name) {
184 URL url = this.getClass().getResource(name);
185 if (url != null) {
186 return url.getFile();
187 } else {
188
189
190 url = this.getClass().getResource(".");
191 return url.getFile() + name;
192 }
193 }
194
195 public File getResourceAsFile(String filename) {
196 return new File(getFileResource(filename));
197 }
198
199 public InputStream getResourceAsStream(String name) {
200 return this.getClass().getResourceAsStream(name);
201 }
202
203 }