1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika;
18  
19  import java.io.File;
20  import java.io.FileInputStream;
21  import java.io.InputStream;
22  import java.net.URL;
23  
24  import junit.framework.TestCase;
25  
26  import org.apache.tika.config.TikaConfig;
27  import org.apache.tika.metadata.Metadata;
28  import org.apache.tika.parser.Parser;
29  import org.apache.tika.utils.ParseUtils;
30  import org.xml.sax.helpers.DefaultHandler;
31  
32  /**
33   * Junit test class for Tika {@link Parser}s.
34   */
35  public class TestParsers extends TestCase {
36  
37      private TikaConfig tc;
38  
39      public void setUp() throws Exception {
40          tc = TikaConfig.getDefaultConfig();
41      }
42  
43      public void testPDFExtraction() throws Exception {
44          File file = getResourceAsFile("/test-documents/testPDF.pdf");
45          String s1 = ParseUtils.getStringContent(file, tc);
46          String s2 = ParseUtils.getStringContent(file, tc, "application/pdf");
47          String s3 = ParseUtils.getStringContent(file, TikaConfig
48                  .getDefaultConfig());
49          assertEquals(s1, s2);
50          assertEquals(s1, s3);
51      }
52  
53      public void testTXTExtraction() throws Exception {
54          File file = getResourceAsFile("/test-documents/testTXT.txt");
55          String s1 = ParseUtils.getStringContent(file, tc);
56          String s2 = ParseUtils.getStringContent(file, tc, "text/plain");
57          assertEquals(s1, s2);
58      }
59  
60      public void testRTFExtraction() throws Exception {
61          File file = getResourceAsFile("/test-documents/testRTF.rtf");
62          String s1 = ParseUtils.getStringContent(file, tc);
63          String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
64          assertEquals(s1, s2);
65      }
66  
67      public void testXMLExtraction() throws Exception {
68          File file = getResourceAsFile("/test-documents/testXML.xml");
69          String s1 = ParseUtils.getStringContent(file, tc);
70          String s2 = ParseUtils.getStringContent(file, tc, "application/xml");
71          assertEquals(s1, s2);
72      }
73  
74      public void testPPTExtraction() throws Exception {
75          File file = getResourceAsFile("/test-documents/testPPT.ppt");
76          String s1 = ParseUtils.getStringContent(file, tc);
77          String s2 = ParseUtils.getStringContent(file, tc,
78                  "application/vnd.ms-powerpoint");
79          assertEquals(s1, s2);
80          Parser parser = tc.getParser("application/vnd.ms-powerpoint");
81          Metadata metadata = new Metadata();
82          InputStream stream = new FileInputStream(file);
83          try {
84              parser.parse(stream, new DefaultHandler(), metadata);
85          } finally {
86              stream.close();
87          }
88          assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
89      }
90  
91      public void testWORDxtraction() throws Exception {
92          File file = getResourceAsFile("/test-documents/testWORD.doc");
93          String s1 = ParseUtils.getStringContent(file, tc);
94          String s2 = ParseUtils.getStringContent(file, tc, "application/msword");
95          assertEquals(s1, s2);
96          Parser parser = tc.getParser("application/msword");
97          Metadata metadata = new Metadata();
98          InputStream stream = new FileInputStream(file);
99          try {
100             parser.parse(stream, new DefaultHandler(), metadata);
101         } finally {
102             stream.close();
103         }
104         assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
105     }
106 
107     public void testEXCELExtraction() throws Exception {
108         final String expected = "Numbers and their Squares";
109         File file = getResourceAsFile("/test-documents/testEXCEL.xls");
110         String s1 = ParseUtils.getStringContent(file, tc);
111         String s2 = ParseUtils.getStringContent(file, tc,
112                 "application/vnd.ms-excel");
113         assertEquals(s1, s2);
114         assertTrue("Text does not contain '" + expected + "'", s1
115                 .contains(expected));
116         Parser parser = tc.getParser("application/vnd.ms-excel");
117         Metadata metadata = new Metadata();
118         InputStream stream = new FileInputStream(file);
119         try {
120             parser.parse(stream, new DefaultHandler(), metadata);
121         } finally {
122             stream.close();
123         }
124         assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
125     }
126 
127     public void testOOExtraction() throws Exception {
128         File file = getResourceAsFile("/test-documents/testOpenOffice2.odt");
129         String s1 = ParseUtils.getStringContent(file, tc);
130         String s2 = ParseUtils.getStringContent(file, tc,
131                 "application/vnd.oasis.opendocument.text");
132         assertEquals(s1, s2);
133     }
134 
135     public void testOutlookExtraction() throws Exception {
136         File file = getResourceAsFile("/test-documents/test-outlook.msg");
137         String s1 = ParseUtils.getStringContent(file, tc);
138         String s2 = ParseUtils.getStringContent(file, tc,
139                 "application/vnd.ms-outlook");
140         assertEquals(s1, s2);
141     }
142 
143     public void testHTMLExtraction() throws Exception {
144         File file = getResourceAsFile("/test-documents/testHTML.html");
145         String s1 = ParseUtils.getStringContent(file, tc);
146         String s2 = ParseUtils.getStringContent(file, tc, "text/html");
147         assertEquals(s1, s2);
148 
149         Parser parser = tc.getParser("text/html");
150         assertNotNull(parser);
151     }
152 
153     public void testZipFileExtraction() throws Exception {
154         File file = getResourceAsFile("/test-documents/test-documents.zip");
155         String s1 = ParseUtils.getStringContent(file, tc);
156         String s2 = ParseUtils.getStringContent(file, tc, "application/zip");
157         assertEquals(s1, s2);
158 
159         Parser parser = tc.getParser("application/zip");
160         assertNotNull(parser);
161     }
162 
163     public void testMP3Extraction() throws Exception {
164         File file = getResourceAsFile("/test-documents/testMP3id3v1.mp3");
165         String s1 = ParseUtils.getStringContent(file, tc);
166         String s2 = ParseUtils.getStringContent(file, tc, "audio/mpeg");
167         assertEquals(s1, s2);
168 
169         Parser parser = tc.getParser("audio/mpeg");
170         assertNotNull(parser);
171     }
172 
173     /**
174      * This method will give you back the filename incl. the absolute path name
175      * to the resource. If the resource does not exist it will give you back the
176      * resource name incl. the path.
177      * 
178      * @param name
179      *            The named resource to search for.
180      * @return an absolute path incl. the name which is in the same directory as
181      *         the the class you've called it from.
182      */
183     public String getFileResource(String name) {
184         URL url = this.getClass().getResource(name);
185         if (url != null) {
186             return url.getFile();
187         } else {
188             // We have a file which does not exists
189             // We got the path
190             url = this.getClass().getResource(".");
191             return url.getFile() + name;
192         }
193     }
194 
195     public File getResourceAsFile(String filename) {
196         return new File(getFileResource(filename));
197     }
198 
199     public InputStream getResourceAsStream(String name) {
200         return this.getClass().getResourceAsStream(name);
201     }
202 
203 }