1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft.ooxml;
18  
19  import java.io.InputStream;
20  import java.util.Locale;
21  
22  import junit.framework.TestCase;
23  
24  import org.apache.tika.metadata.Metadata;
25  import org.apache.tika.metadata.TikaMetadataKeys;
26  import org.apache.tika.parser.Parser;
27  import org.apache.tika.sax.BodyContentHandler;
28  import org.xml.sax.ContentHandler;
29  
30  import org.apache.tika.parser.AutoDetectParser;
31  
32  public class OOXMLParserTest extends TestCase {
33  
34      // TODO: This is a workaround until TIKA-371 is fixed
35      private Locale defaultLocale;
36  
37      protected void setUp() {
38          defaultLocale = Locale.getDefault();
39          Locale.setDefault(Locale.US);
40      }
41  
42      protected void tearDown() {
43          Locale.setDefault(defaultLocale);
44      }
45  
46      public void testExcel() throws Exception {
47          InputStream input = OOXMLParserTest.class
48                  .getResourceAsStream("/test-documents/testEXCEL.xlsx");
49  
50          Parser parser = new AutoDetectParser();
51          Metadata metadata = new Metadata();
52          // TODO: should auto-detect without the resource name
53          metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
54          ContentHandler handler = new BodyContentHandler();
55  
56          try {
57              parser.parse(input, handler, metadata);
58  
59              assertEquals(
60                      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
61                      metadata.get(Metadata.CONTENT_TYPE));
62              assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
63              assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
64              String content = handler.toString();
65              assertTrue(content.contains("Sample Excel Worksheet"));
66              assertTrue(content.contains("Numbers and their Squares"));
67              assertTrue(content.contains("9"));
68              assertFalse(content.contains("9.0"));
69              assertTrue(content.contains("196"));
70              assertFalse(content.contains("196.0"));
71              assertEquals("false", metadata.get(TikaMetadataKeys.PROTECTED));
72          } finally {
73              input.close();
74          }
75      }
76  
77      public void testExcelFormats() throws Exception {
78          InputStream input = OOXMLParserTest.class
79                  .getResourceAsStream("/test-documents/testEXCEL-formats.xlsx");
80  
81          Parser parser = new AutoDetectParser();
82          Metadata metadata = new Metadata();
83          // TODO: should auto-detect without the resource name
84          metadata.set(Metadata.RESOURCE_NAME_KEY, "testEXCEL-formats.xlsx");
85          ContentHandler handler = new BodyContentHandler();
86  
87          try {
88              parser.parse(input, handler, metadata);
89  
90              assertEquals(
91                      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
92                      metadata.get(Metadata.CONTENT_TYPE));
93  
94              String content = handler.toString();
95  
96              // Number #,##0.00
97              assertTrue(content.contains("1,599.99"));
98              assertTrue(content.contains("-1,599.99"));
99  
100             // Currency $#,##0.00;[Red]($#,##0.00)
101             assertTrue(content.contains("$1,599.99"));
102             assertTrue(content.contains("($1,599.99)"));
103 
104             // Scientific 0.00E+00
105             assertTrue(content.contains("1.98E08"));
106             assertTrue(content.contains("-1.98E08"));
107 
108             // Percentage
109             assertTrue(content.contains("2%"));
110             assertTrue(content.contains("2.50%"));
111 
112             // Time Format: h:mm
113             assertTrue(content.contains("6:15"));
114             assertTrue(content.contains("18:15"));
115 
116             // Date Format: d-mmm-yy
117             assertTrue(content.contains("17-May-07"));
118 
119             // Below assertions represent outstanding formatting issues to be addressed
120             // they are included to allow the issues to be progressed with the Apache POI
121             // team - See TIKA-103.
122 
123             /*************************************************************************
124             // Date Format: m/d/yy
125             assertTrue(content.contains("03/10/2009"));
126 
127             // Date/Time Format
128             assertTrue(content.contains("19/01/2008 04:35"));
129 
130             // Custom Number (0 "dollars and" .00 "cents")
131             assertTrue(content.contains("19 dollars and .99 cents"));
132 
133             // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
134             assertTrue(content.contains("At 4:20 AM on Thursday May 17, 2007"));
135 
136             // Fraction (2.5): # ?/?
137             assertTrue(content.contains("2 1 / 2"));
138             **************************************************************************/
139         } finally {
140             input.close();
141         }
142     }
143 
144     public void testPowerPoint() throws Exception {
145         InputStream input = OOXMLParserTest.class
146                 .getResourceAsStream("/test-documents/testPPT.pptx");
147 
148         Parser parser = new AutoDetectParser();
149         Metadata metadata = new Metadata();
150         // TODO: should auto-detect without the resource name
151         metadata.set(Metadata.RESOURCE_NAME_KEY, "testPPT.pptx");
152         ContentHandler handler = new BodyContentHandler();
153 
154         try {
155             parser.parse(input, handler, metadata);
156 
157             assertEquals(
158                     "application/vnd.openxmlformats-officedocument.presentationml.presentation",
159                     metadata.get(Metadata.CONTENT_TYPE));
160             assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
161             assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
162             String content = handler.toString();
163             assertTrue(content.contains("Sample Powerpoint Slide"));
164             assertTrue(content.contains("Powerpoint X for Mac"));
165         } finally {
166             input.close();
167         }
168 
169     }
170 
171     public void testWord() throws Exception {
172         InputStream input = OOXMLParserTest.class
173                 .getResourceAsStream("/test-documents/testWORD.docx");
174 
175         Parser parser = new AutoDetectParser();
176         Metadata metadata = new Metadata();
177         // TODO: should auto-detect without the resource name
178         metadata.set(Metadata.RESOURCE_NAME_KEY, "testWORD.docx");
179         ContentHandler handler = new BodyContentHandler();
180 
181         try {
182             parser.parse(input, handler, metadata);
183 
184             assertEquals(
185                     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
186                     metadata.get(Metadata.CONTENT_TYPE));
187             assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
188             assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
189             assertTrue(handler.toString().contains("Sample Word Document"));
190         } finally {
191             input.close();
192         }
193     }
194 
195     public void testProtectedExcel() throws Exception {
196         InputStream input = OOXMLParserTest.class
197                 .getResourceAsStream("/test-documents/protected.xlsx");
198 
199         Parser parser = new AutoDetectParser();
200         Metadata metadata = new Metadata();
201         ContentHandler handler = new BodyContentHandler();
202 
203         try {
204             parser.parse(input, handler, metadata);
205 
206             assertEquals(
207                     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
208                     metadata.get(Metadata.CONTENT_TYPE));
209 
210             assertEquals("true", metadata.get(TikaMetadataKeys.PROTECTED));
211         } finally {
212             input.close();
213         }
214     }
215 
216 }