1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.microsoft.ooxml;
18
19 import java.io.InputStream;
20 import java.util.Locale;
21
22 import junit.framework.TestCase;
23
24 import org.apache.tika.metadata.Metadata;
25 import org.apache.tika.metadata.TikaMetadataKeys;
26 import org.apache.tika.parser.Parser;
27 import org.apache.tika.sax.BodyContentHandler;
28 import org.xml.sax.ContentHandler;
29
30 import org.apache.tika.parser.AutoDetectParser;
31
32 public class OOXMLParserTest extends TestCase {
33
34
35 private Locale defaultLocale;
36
37 protected void setUp() {
38 defaultLocale = Locale.getDefault();
39 Locale.setDefault(Locale.US);
40 }
41
42 protected void tearDown() {
43 Locale.setDefault(defaultLocale);
44 }
45
46 public void testExcel() throws Exception {
47 InputStream input = OOXMLParserTest.class
48 .getResourceAsStream("/test-documents/testEXCEL.xlsx");
49
50 Parser parser = new AutoDetectParser();
51 Metadata metadata = new Metadata();
52
53 metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
54 ContentHandler handler = new BodyContentHandler();
55
56 try {
57 parser.parse(input, handler, metadata);
58
59 assertEquals(
60 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
61 metadata.get(Metadata.CONTENT_TYPE));
62 assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
63 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
64 String content = handler.toString();
65 assertTrue(content.contains("Sample Excel Worksheet"));
66 assertTrue(content.contains("Numbers and their Squares"));
67 assertTrue(content.contains("9"));
68 assertFalse(content.contains("9.0"));
69 assertTrue(content.contains("196"));
70 assertFalse(content.contains("196.0"));
71 assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
72 } finally {
73 input.close();
74 }
75 }
76
77 public void testExcelFormats() throws Exception {
78 InputStream input = OOXMLParserTest.class
79 .getResourceAsStream("/test-documents/testEXCEL-formats.xlsx");
80
81 Parser parser = new AutoDetectParser();
82 Metadata metadata = new Metadata();
83
84 metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL-formats.xlsx");
85 ContentHandler handler = new BodyContentHandler();
86
87 try {
88 parser.parse(input, handler, metadata);
89
90 assertEquals(
91 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
92 metadata.get(Metadata.CONTENT_TYPE));
93
94 String content = handler.toString();
95
96
97 assertTrue(content.contains("1,599.99"));
98 assertTrue(content.contains("-1,599.99"));
99
100
101 assertTrue(content.contains("$1,599.99"));
102 assertTrue(content.contains("($1,599.99)"));
103
104
105 assertTrue(content.contains("1.98E08"));
106 assertTrue(content.contains("-1.98E08"));
107
108
109 assertTrue(content.contains("2%"));
110 assertTrue(content.contains("2.50%"));
111
112
113 assertTrue(content.contains("6:15"));
114 assertTrue(content.contains("18:15"));
115
116
117 assertTrue(content.contains("17-May-07"));
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139 } finally {
140 input.close();
141 }
142 }
143
144 public void testPowerPoint() throws Exception {
145 InputStream input = OOXMLParserTest.class
146 .getResourceAsStream("/test-documents/testPPT.pptx");
147
148 Parser parser = new AutoDetectParser();
149 Metadata metadata = new Metadata();
150
151 metadata.set(Metadata.RESOURCE_NAME_KEY, "testPPT.pptx");
152 ContentHandler handler = new BodyContentHandler();
153
154 try {
155 parser.parse(input, handler, metadata);
156
157 assertEquals(
158 "application/vnd.openxmlformats-officedocument.presentationml.presentation",
159 metadata.get(Metadata.CONTENT_TYPE));
160 assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
161 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
162 String content = handler.toString();
163 assertTrue(content.contains("Sample Powerpoint Slide"));
164 assertTrue(content.contains("Powerpoint X for Mac"));
165 } finally {
166 input.close();
167 }
168
169 }
170
171 public void testWord() throws Exception {
172 InputStream input = OOXMLParserTest.class
173 .getResourceAsStream("/test-documents/testWORD.docx");
174
175 Parser parser = new AutoDetectParser();
176 Metadata metadata = new Metadata();
177
178 metadata.set(Metadata.RESOURCE_NAME_KEY, "testWORD.docx");
179 ContentHandler handler = new BodyContentHandler();
180
181 try {
182 parser.parse(input, handler, metadata);
183
184 assertEquals(
185 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
186 metadata.get(Metadata.CONTENT_TYPE));
187 assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
188 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
189 assertTrue(handler.toString().contains("Sample Word Document"));
190 } finally {
191 input.close();
192 }
193 }
194
195 public void testProtectedExcel() throws Exception {
196 InputStream input = OOXMLParserTest.class
197 .getResourceAsStream("/test-documents/protected.xlsx");
198
199 Parser parser = new AutoDetectParser();
200 Metadata metadata = new Metadata();
201 ContentHandler handler = new BodyContentHandler();
202
203 try {
204 parser.parse(input, handler, metadata);
205
206 assertEquals(
207 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
208 metadata.get(Metadata.CONTENT_TYPE));
209
210 assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
211 } finally {
212 input.close();
213 }
214 }
215
216 }