1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.pkg;
18  
19  import java.io.InputStream;
20  
21  import junit.framework.TestCase;
22  
23  import org.apache.tika.metadata.Metadata;
24  import org.apache.tika.parser.AutoDetectParser;
25  import org.apache.tika.parser.Parser;
26  import org.apache.tika.sax.BodyContentHandler;
27  import org.xml.sax.ContentHandler;
28  
29  /**
30   * Test case for parsing tar files.
31   */
32  public class TarParserTest extends TestCase {
33  
34      public void testTarParsing() throws Exception {
35          Parser parser = new AutoDetectParser(); // Should auto-detect!
36          ContentHandler handler = new BodyContentHandler();
37          Metadata metadata = new Metadata();
38  
39          InputStream stream = TarParserTest.class.getResourceAsStream(
40                  "/test-documents/test-documents.tar");
41          try {
42              parser.parse(stream, handler, metadata);
43          } finally {
44              stream.close();
45          }
46  
47          assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
48          String content = handler.toString();
49          assertTrue(content.contains("test-documents/testEXCEL.xls"));
50          assertTrue(content.contains("Sample Excel Worksheet"));
51          assertTrue(content.contains("test-documents/testHTML.html"));
52          assertTrue(content.contains("Test Indexation Html"));
53          assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
54          assertTrue(content.contains("This is a sample Open Office document"));
55          assertTrue(content.contains("test-documents/testPDF.pdf"));
56          assertTrue(content.contains("Apache Tika"));
57          assertTrue(content.contains("test-documents/testPPT.ppt"));
58          assertTrue(content.contains("Sample Powerpoint Slide"));
59          assertTrue(content.contains("test-documents/testRTF.rtf"));
60          assertTrue(content.contains("indexation Word"));
61          assertTrue(content.contains("test-documents/testTXT.txt"));
62          assertTrue(content.contains("Test d'indexation de Txt"));
63          assertTrue(content.contains("test-documents/testWORD.doc"));
64          assertTrue(content.contains("This is a sample Microsoft Word Document"));
65          assertTrue(content.contains("test-documents/testXML.xml"));
66          assertTrue(content.contains("Rida Benjelloun"));
67      }
68  
69  }