1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.pkg;
18  
19  import java.io.InputStream;
20  
21  import junit.framework.TestCase;
22  
23  import org.apache.tika.metadata.Metadata;
24  import org.apache.tika.parser.AutoDetectParser;
25  import org.apache.tika.parser.Parser;
26  import org.apache.tika.sax.BodyContentHandler;
27  import org.xml.sax.ContentHandler;
28  
29  /**
30   * Test case for parsing gzip files.
31   */
32  public class GzipParserTest extends TestCase {
33  
34      public void testGzipParsing() throws Exception {
35          Parser parser = new AutoDetectParser(); // Should auto-detect!
36          ContentHandler handler = new BodyContentHandler();
37          Metadata metadata = new Metadata();
38  
39          InputStream stream = GzipParserTest.class.getResourceAsStream(
40                  "/test-documents/test-documents.tgz");
41          try {
42              parser.parse(stream, handler, metadata);
43          } finally {
44              stream.close();
45          }
46  
47          assertEquals("application/x-gzip", metadata.get(Metadata.CONTENT_TYPE));
48          String content = handler.toString();
49          assertTrue(content.contains("test-documents/testEXCEL.xls"));
50          assertTrue(content.contains("Sample Excel Worksheet"));
51          assertTrue(content.contains("test-documents/testHTML.html"));
52          assertTrue(content.contains("Test Indexation Html"));
53          assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
54          assertTrue(content.contains("This is a sample Open Office document"));
55          assertTrue(content.contains("test-documents/testPDF.pdf"));
56          assertTrue(content.contains("Apache Tika"));
57          assertTrue(content.contains("test-documents/testPPT.ppt"));
58          assertTrue(content.contains("Sample Powerpoint Slide"));
59          assertTrue(content.contains("test-documents/testRTF.rtf"));
60          assertTrue(content.contains("indexation Word"));
61          assertTrue(content.contains("test-documents/testTXT.txt"));
62          assertTrue(content.contains("Test d'indexation de Txt"));
63          assertTrue(content.contains("test-documents/testWORD.doc"));
64          assertTrue(content.contains("This is a sample Microsoft Word Document"));
65          assertTrue(content.contains("test-documents/testXML.xml"));
66          assertTrue(content.contains("Rida Benjelloun"));
67      }
68  
69      public void testSvgzParsing() throws Exception {
70          Parser parser = new AutoDetectParser(); // Should auto-detect!
71          ContentHandler handler = new BodyContentHandler();
72          Metadata metadata = new Metadata();
73  
74          InputStream stream = GzipParserTest.class.getResourceAsStream(
75                  "/test-documents/testSVG.svgz");
76          try {
77              parser.parse(stream, handler, metadata);
78          } finally {
79              stream.close();
80          }
81  
82          assertEquals("application/x-gzip", metadata.get(Metadata.CONTENT_TYPE));
83          String content = handler.toString();
84          assertTrue(content.contains("Test SVG image"));
85      }
86  
87  }