1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  
22  import org.apache.tika.exception.TikaException;
23  import org.apache.tika.metadata.Metadata;
24  import org.apache.tika.sax.BodyContentHandler;
25  import org.xml.sax.ContentHandler;
26  
27  import junit.framework.TestCase;
28  
29  public class AutoDetectParserTest extends TestCase {
30  
31      // Easy to read constants for the MIME types:
32      private static final String RAW        = "application/octet-stream";
33      private static final String EXCEL      = "application/vnd.ms-excel";
34      private static final String HTML       = "text/html";
35      private static final String PDF        = "application/pdf";
36      private static final String POWERPOINT = "application/vnd.ms-powerpoint";
37      private static final String RTF        = "application/rtf";
38      private static final String PLAINTEXT  = "text/plain";
39      private static final String WORD       = "application/msword";
40      private static final String XML        = "application/xml";
41      private static final String OPENOFFICE
42              = "application/vnd.oasis.opendocument.text";
43  
44  
45      /**
46       * This is where a single test is done.
47       * @param tp the parameters encapsulated in a TestParams instance
48       * @throws IOException
49       */
50      private void assertAutoDetect(TestParams tp) throws Exception {
51  
52          InputStream input =
53              AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName);
54  
55          if (input == null) {
56              fail("Could not open stream from specified resource: "
57                      + tp.resourceRealName);
58          }
59  
60          try {
61              Metadata metadata = new Metadata();
62              metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
63              metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
64              ContentHandler handler = new BodyContentHandler();
65              new AutoDetectParser().parse(input, handler, metadata);
66  
67              assertEquals("Bad content type: " + tp,
68                      tp.realType, metadata.get(Metadata.CONTENT_TYPE));
69  
70              assertTrue("Expected content not found: " + tp,
71                      handler.toString().contains(tp.expectedContentFragment));
72          } finally {
73              input.close();
74          }
75      }
76  
77      /**
78       * Convenience method -- its sole purpose of existence is to make the
79       * call to it more readable than it would be if a TestParams instance
80       * would need to be instantiated there.
81       *
82       * @param resourceRealName real name of resource
83       * @param resourceStatedName stated name -- will a bad name fool us?
84       * @param realType - the real MIME type
85       * @param statedType - stated MIME type - will a wrong one fool us?
86       * @param expectedContentFragment - something expected in the text
87       * @throws Exception
88       */
89      private void assertAutoDetect(String resourceRealName,
90                                    String resourceStatedName,
91                                    String realType,
92                                    String statedType,
93                                    String expectedContentFragment)
94              throws Exception {
95  
96          assertAutoDetect(new TestParams(resourceRealName, resourceStatedName,
97                  realType, statedType, expectedContentFragment));
98      }
99  
100     private void assertAutoDetect(
101             String resource, String type, String content) throws Exception {
102 
103         resource = "/test-documents/" + resource;
104 
105         // TODO !!!!  The disabled tests below should work!
106         // The correct MIME type should be determined regardless of the
107         // stated type (ContentType hint) and the stated URL name.
108 
109 
110         // Try different combinations of correct and incorrect arguments:
111         final String wrongMimeType = RAW;
112         assertAutoDetect(resource, resource, type, type,          content);
113         assertAutoDetect(resource, resource, type, null,          content);
114         assertAutoDetect(resource, resource, type, wrongMimeType, content);
115 
116         assertAutoDetect(resource, null, type, type,          content);
117         assertAutoDetect(resource, null, type, null,          content);
118         assertAutoDetect(resource, null, type, wrongMimeType, content);
119 
120         final String badResource = "a.xyz";
121         assertAutoDetect(resource, badResource, type, type,          content);
122         assertAutoDetect(resource, badResource, type, null,          content);
123         assertAutoDetect(resource, badResource, type, wrongMimeType, content);
124     }
125 
126     public void testEpub() throws Exception {
127         assertAutoDetect(
128                 "testEPUB.epub", "application/epub+zip",
129                 "The previous headings were subchapters");
130     }
131 
132     public void testExcel() throws Exception {
133         assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet");
134     }
135 
136     public void testHTML() throws Exception {
137         assertAutoDetect("testHTML.html", HTML, "Test Indexation Html");
138     }
139 
140     public void testOpenOffice() throws Exception {
141         assertAutoDetect("testOpenOffice2.odt", OPENOFFICE,
142                 "This is a sample Open Office document");
143     }
144 
145     public void testPDF() throws Exception {
146         assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit");
147 
148     }
149 
150     public void testPowerpoint() throws Exception {
151         assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide");
152     }
153 
154     public void testRdfXml() throws Exception {
155         assertAutoDetect("testRDF.rdf", "application/rdf+xml", "");
156     }
157 
158     public void testRTF() throws Exception {
159         assertAutoDetect("testRTF.rtf", RTF, "indexation Word");
160     }
161 
162     public void testText() throws Exception {
163         assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
164     }
165 
166     public void testWord() throws Exception {
167         assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");
168     }
169 
170     public void testXML() throws Exception {
171         assertAutoDetect("testXML.xml", XML, "Lius");
172     }
173 
174     /**
175      * Make sure that zip bomb attacks are prevented.
176      *
177      * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
178      */
179     public void testZipBombPrevention() throws Exception {
180         InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
181                 "/test-documents/TIKA-216.tgz");
182         try {
183             Metadata metadata = new Metadata();
184             ContentHandler handler = new BodyContentHandler(-1);
185             new AutoDetectParser().parse(tgz, handler, metadata);
186             fail("Zip bomb was not detected");
187         } catch (TikaException e) {
188             // expected
189         } finally {
190             tgz.close();
191         }
192     
193     }
194 
195     /**
196      * Minimal class to encapsulate all parameters -- the main reason for
197      * its existence is to aid in debugging via its toString() method.
198      *
199      * Getters and setters intentionally not provided.
200      */
201     private static class TestParams {
202 
203         public String resourceRealName;
204         public String resourceStatedName;
205         public String realType;
206         public String statedType;
207         public String expectedContentFragment;
208 
209 
210         private TestParams(String resourceRealName,
211                            String resourceStatedName,
212                            String realType,
213                            String statedType,
214                            String expectedContentFragment) {
215             this.resourceRealName = resourceRealName;
216             this.resourceStatedName = resourceStatedName;
217             this.realType = realType;
218             this.statedType = statedType;
219             this.expectedContentFragment = expectedContentFragment;
220         }
221 
222 
223         /**
224          * Produces a string like the following:
225          *
226          * <pre>
227          * Test parameters:
228          *   resourceRealName        = /test-documents/testEXCEL.xls
229          *   resourceStatedName      = null
230          *   realType                = application/vnd.ms-excel
231          *   statedType              = null
232          *   expectedContentFragment = Sample Excel Worksheet
233          * </pre>
234          */
235         public String toString() {
236             return "Test parameters:\n"
237                 + "  resourceRealName        = " + resourceRealName + "\n"
238                 + "  resourceStatedName      = " + resourceStatedName + "\n"
239                 + "  realType                = " + realType + "\n"
240                 + "  statedType              = " + statedType + "\n"
241                 + "  expectedContentFragment = " + expectedContentFragment + "\n";
242         }
243     }
244 }