1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.tika.mime;
19  
// JDK imports
21  import java.io.File;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.net.MalformedURLException;
25  import java.net.URL;
26  
27  import junit.framework.TestCase;
28  
29  import org.apache.tika.config.TikaConfig;
30  import org.apache.tika.metadata.Metadata;
31  
32  /**
33   * 
34   * Test Suite for the {@link MimeTypes} repository.
35   * 
36   */
37  public class TestMimeTypes extends TestCase {
38  
39      private MimeTypes repo;
40  
41      private static URL u;
42  
43      static {
44          try {
45              u = new URL("http://mydomain.com/x.pdf?x=y");
46          } catch (MalformedURLException e) {
47              fail(e.getMessage());
48          }
49      }
50  
51      private static final File f = new File("/a/b/c/x.pdf");
52  
53      public TestMimeTypes() {
54          try {
55              repo = TikaConfig.getDefaultConfig().getMimeRepository();
56          } catch (Exception e) {
57              fail(e.getMessage());
58          }
59  
60      }
61  
62      public void testCaseSensitivity() {
63          MimeType type = repo.getMimeType("test.PDF");
64          assertNotNull(type);
65          assertEquals(repo.getMimeType("test.pdf"), type);
66          assertEquals(repo.getMimeType("test.PdF"), type);
67          assertEquals(repo.getMimeType("test.pdF"), type);
68      }
69  
70      public void testLoadMimeTypes() throws MimeTypeException {
71          assertNotNull(repo.forName("application/octet-stream"));
72          assertNotNull(repo.forName("text/x-tex"));
73      }
74  
75      /**
76       * Tests MIME type determination based solely on the URL's extension.
77       */
78      public void testGuessMimeTypes() throws Exception {
79          assertTypeByName("application/pdf", "x.pdf");
80          assertEquals("application/pdf", repo.getMimeType(u).getName());
81          assertEquals("application/pdf", repo.getMimeType(f).getName());
82          assertTypeByName("text/plain", "x.txt");
83          assertTypeByName("text/html", "x.htm");
84          assertTypeByName("text/html", "x.html");
85          assertTypeByName("application/xhtml+xml", "x.xhtml");
86          assertTypeByName("application/xml", "x.xml");
87          assertTypeByName("application/zip", "x.zip");
88          assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt");
89          assertTypeByName("application/octet-stream", "x.unknown");
90  
91          // Test for the MS Office media types and file extensions listed in
92          // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx
93          assertTypeByName("application/msword", "x.doc");
94          assertTypeByName("application/msword", "x.dot");
95          assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx");
96          assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx");
97          assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm");
98          assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm");
99          assertTypeByName("application/vnd.ms-excel", "x.xls");
100         assertTypeByName("application/vnd.ms-excel", "x.xlt");
101         assertTypeByName("application/vnd.ms-excel", "x.xla");
102         assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx");
103         assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx");
104         assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm");
105         assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm");
106         assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam");
107         assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb");
108         assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
109         assertTypeByName("application/vnd.ms-powerpoint", "x.pot");
110         assertTypeByName("application/vnd.ms-powerpoint", "x.pps");
111         assertTypeByName("application/vnd.ms-powerpoint", "x.ppa");
112         assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx");
113         assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx");
114         assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx");
115         assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam");
116         assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm");
117         assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm");
118         assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm");
119     }
120 
121     public void testOoxmlDetection() throws Exception {
122         assertTypeByData("application/x-tika-ooxml", "testWORD.docx");
123         assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
124         assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
125     }
126 
127     public void testJpegDetection() throws Exception {
128         assertType("image/jpeg", "testJPEG.jpg");
129         assertTypeByData("image/jpeg", "testJPEG.jpg");
130         assertTypeByName("image/jpeg", "x.jpg");
131         assertTypeByName("image/jpeg", "x.JPG");
132         assertTypeByName("image/jpeg", "x.jpeg");
133         assertTypeByName("image/jpeg", "x.JPEG");
134         assertTypeByName("image/jpeg", "x.jpe");
135         assertTypeByName("image/jpeg", "x.jif");
136         assertTypeByName("image/jpeg", "x.jfif");
137         assertTypeByName("image/jpeg", "x.jfi");
138     }
139 
140     public void testTiffDetection() throws Exception {
141         assertType("image/tiff", "testTIFF.tif");
142         assertTypeByData("image/tiff", "testTIFF.tif");
143         assertTypeByName("image/tiff", "x.tiff");
144         assertTypeByName("image/tiff", "x.tif");
145         assertTypeByName("image/tiff", "x.TIF");
146     }
147 
148     public void testGifDetection() throws Exception {
149         assertType("image/gif", "testGIF.gif");
150         assertTypeByData("image/gif", "testGIF.gif");
151         assertTypeByName("image/gif", "x.gif");
152         assertTypeByName("image/gif", "x.GIF");
153     }
154 
155     public void testPngDetection() throws Exception {
156         assertType("image/png", "testPNG.png");
157         assertTypeByData("image/png", "testPNG.png");
158         assertTypeByName("image/png", "x.png");
159         assertTypeByName("image/png", "x.PNG");
160     }
161 
162     public void testBmpDetection() throws Exception {
163         assertType("image/bmp", "testBMP.bmp");
164         assertTypeByData("image/bmp", "testBMP.bmp");
165         assertTypeByName("image/bmp", "x.bmp");
166         assertTypeByName("image/bmp", "x.BMP");
167         assertTypeByName("image/bmp", "x.dib");
168         assertTypeByName("image/bmp", "x.DIB");
169     }
170 
171     public void testPnmDetection() throws Exception {
172         assertType("image/x-portable-bitmap", "testPBM.pbm");
173         assertType("image/x-portable-graymap", "testPGM.pgm");
174         assertType("image/x-portable-pixmap", "testPPM.ppm");
175         assertTypeByData("image/x-portable-bitmap", "testPBM.pbm");
176         assertTypeByData("image/x-portable-graymap", "testPGM.pgm");
177         assertTypeByData("image/x-portable-pixmap", "testPPM.ppm");
178         assertTypeByName("image/x-portable-anymap", "x.pnm");
179         assertTypeByName("image/x-portable-anymap", "x.PNM");
180         assertTypeByName("image/x-portable-bitmap", "x.pbm");
181         assertTypeByName("image/x-portable-bitmap", "x.PBM");
182         assertTypeByName("image/x-portable-graymap", "x.pgm");
183         assertTypeByName("image/x-portable-graymap", "x.PGM");
184         assertTypeByName("image/x-portable-pixmap", "x.ppm");
185         assertTypeByName("image/x-portable-pixmap", "x.PPM");
186     }
187 
188     public void testCgmDetection() throws Exception {
189         // TODO: Need a test image file
190         assertTypeByName("image/cgm", "x.cgm");
191         assertTypeByName("image/cgm", "x.CGM");
192     }
193 
194     public void testRdfXmlDetection() throws Exception {
195         assertTypeByName("application/rdf+xml", "x.rdf");
196         assertTypeByName("application/rdf+xml", "x.owl");
197     }
198 
199     public void testSvgDetection() throws Exception {
200         assertType("image/svg+xml", "testSVG.svg");
201         assertTypeByData("image/svg+xml", "testSVG.svg");
202         assertTypeByName("image/svg+xml", "x.svg");
203         assertTypeByName("image/svg+xml", "x.SVG");
204 
205         // Should *.svgz be svg or gzip
206         assertType("application/x-gzip", "testSVG.svgz");
207         assertTypeByData("application/x-gzip", "testSVG.svgz");
208         assertTypeByName("image/svg+xml", "x.svgz");
209         assertTypeByName("image/svg+xml", "x.SVGZ");
210     }
211 
212     public void testPdfDetection() throws Exception {
213         assertType("application/pdf", "testPDF.pdf");
214         assertTypeByData("application/pdf", "testPDF.pdf");
215         assertTypeByName("application/pdf", "x.pdf");
216         assertTypeByName("application/pdf", "x.PDF");
217     }
218 
219     public void testSwfDetection() throws Exception {
220         // TODO: Need a test flash file
221         assertTypeByName("application/x-shockwave-flash", "x.swf");
222         assertTypeByName("application/x-shockwave-flash", "x.SWF");
223     }
224 
225     public void testWmfDetection() throws Exception {
226         // TODO: Need a test wmf file
227         assertTypeByName("application/x-msmetafile", "x.wmf");
228         assertTypeByName("application/x-msmetafile", "x.WMF");
229         // TODO: Need a test emf file
230         assertTypeByName("application/x-msmetafile", "x.emf");
231         assertTypeByName("application/x-msmetafile", "x.EMF");
232         // TODO: Need a test wmz file
233         assertTypeByName("application/x-ms-wmz", "x.wmz");
234         assertTypeByName("application/x-ms-wmz", "x.WMZ");
235         // TODO: Need a test emf file
236         assertTypeByName("application/x-gzip", "x.emz");
237         assertTypeByName("application/x-gzip", "x.EMZ");
238     }
239 
240     public void testPsDetection() throws Exception {
241         // TODO: Need a test postscript file
242         assertTypeByName("application/postscript", "x.ps");
243         assertTypeByName("application/postscript", "x.PS");
244         assertTypeByName("application/postscript", "x.eps");
245         assertTypeByName("application/postscript", "x.epsf");
246         assertTypeByName("application/postscript", "x.epsi");
247     }
248 
249     /**
250      * @since TIKA-194
251      */
252     public void testJavaRegex() throws Exception{
253         MimeType testType = new MimeType(this.repo, "foo/bar");
254         this.repo.add(testType);
255         assertNotNull(repo.forName("foo/bar"));
256         String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
257         this.repo.addPattern(testType, pattern, true);
258         String testFileName = "rtg_sst_grb_0.5.12345678";
259         assertNotNull(this.repo.getMimeType(testFileName));
260         assertEquals(this.repo.getMimeType(testFileName).getName(), "foo/bar");    
261         
262         MimeType testType2 = new MimeType(this.repo, "foo/bar2");
263         this.repo.add(testType2);
264         assertNotNull(repo.forName("foo/bar2"));
265         this.repo.addPattern(testType2, pattern, false);
266         assertNotNull(this.repo.getMimeType(testFileName));
267         assertNotSame("foo/bar2", this.repo.getMimeType(testFileName).getName());
268     }
269     
270     public void testRawDetection() throws Exception {
271         assertTypeByName("image/x-raw-adobe", "x.dng");
272         assertTypeByName("image/x-raw-adobe", "x.DNG");
273         assertTypeByName("image/x-raw-hasselblad", "x.3fr");
274         assertTypeByName("image/x-raw-fuji", "x.raf");
275         assertTypeByName("image/x-raw-canon", "x.crw");
276         assertTypeByName("image/x-raw-canon", "x.cr2");
277         assertTypeByName("image/x-raw-kodak", "x.k25");
278         assertTypeByName("image/x-raw-kodak", "x.kdc");
279         assertTypeByName("image/x-raw-kodak", "x.dcs");
280         assertTypeByName("image/x-raw-kodak", "x.drf");
281         assertTypeByName("image/x-raw-minolta", "x.mrw");
282         assertTypeByName("image/x-raw-nikon", "x.nef");
283         assertTypeByName("image/x-raw-nikon", "x.nrw");
284         assertTypeByName("image/x-raw-olympus", "x.orf");
285         assertTypeByName("image/x-raw-pentax", "x.ptx");
286         assertTypeByName("image/x-raw-pentax", "x.pef");
287         assertTypeByName("image/x-raw-sony", "x.arw");
288         assertTypeByName("image/x-raw-sony", "x.srf");
289         assertTypeByName("image/x-raw-sony", "x.sr2");
290         assertTypeByName("image/x-raw-sigma", "x.x3f");
291         assertTypeByName("image/x-raw-epson", "x.erf");
292         assertTypeByName("image/x-raw-mamiya", "x.mef");
293         assertTypeByName("image/x-raw-leaf", "x.mos");
294         assertTypeByName("image/x-raw-panasonic", "x.raw");
295         assertTypeByName("image/x-raw-panasonic", "x.rw2");
296         assertTypeByName("image/x-raw-phaseone", "x.cap");
297         assertTypeByName("image/x-raw-phaseone", "x.iiq");
298         assertTypeByName("image/x-raw-phaseone", "x.cap");
299         assertTypeByName("image/x-raw-red", "x.r3d");
300         assertTypeByName("image/x-raw-imacon", "x.fff");
301         assertTypeByName("image/x-raw-logitech", "x.pxn");
302         assertTypeByName("image/x-raw-casio", "x.bay");
303         assertTypeByName("image/x-raw-rawzor", "x.rwz");
304     }
305 
306     /**
307      * Tests MimeTypes.getMimeType(URL), which examines both the byte header
308      * and, if necessary, the URL's extension.
309      */
310     public void testMimeDeterminationForTestDocuments() throws Exception {
311         assertType("text/html", "testHTML.html");
312         assertType("application/zip", "test-documents.zip");
313         // TODO: Currently returns generic MS Office type based on
314         // the magic header. The getMimeType method should understand
315         // MS Office types better.
316         // assertEquals("application/vnd.ms-excel",
317         // getMimeType("testEXCEL.xls"));
318         // assertEquals("application/vnd.ms-powerpoint",
319         // getMimeType("testPPT.ppt"));
320         // assertEquals("application/msword", getMimeType("testWORD.doc"));
321         assertType("text/html", "testHTML_utf8.html");
322         assertType(
323                 "application/vnd.oasis.opendocument.text",
324                 "testOpenOffice2.odt");
325         assertType("application/pdf", "testPDF.pdf");
326         assertType("application/rtf", "testRTF.rtf");
327         assertType("text/plain", "testTXT.txt");
328         assertType("application/xml", "testXML.xml");
329         assertType("audio/basic", "testAU.au");
330         assertType("audio/x-aiff", "testAIFF.aif");
331         assertType("audio/x-wav", "testWAV.wav");
332         assertType("audio/midi", "testMID.mid");
333     }
334 
335     private void assertType(String expected, String filename) throws Exception {
336         InputStream stream = TestMimeTypes.class.getResourceAsStream(
337                 "/test-documents/" + filename);
338         try {
339             Metadata metadata = new Metadata();
340             metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
341             assertEquals(expected, repo.detect(stream, metadata).toString());
342         } finally {
343             stream.close();
344         }
345     }
346 
347     private void assertTypeByName(String expected, String filename)
348             throws IOException {
349         Metadata metadata = new Metadata();
350         metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
351         assertEquals(expected, repo.detect(null, metadata).toString());
352     }
353 
354     private void assertTypeByData(String expected, String filename)
355             throws IOException {
356         InputStream stream = TestMimeTypes.class.getResourceAsStream(
357                 "/test-documents/" + filename);
358         try {
359             Metadata metadata = new Metadata();
360             assertEquals(expected, repo.detect(stream, metadata).toString());
361         } finally {
362             stream.close();
363         }
364     }
365 
366 }