View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.pkg;
18  
19  import java.io.BufferedInputStream;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.util.zip.GZIPInputStream;
23  
24  import org.apache.commons.compress.archivers.ArchiveEntry;
25  import org.apache.commons.compress.archivers.ArchiveInputStream;
26  import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
27  import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
28  import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
29  import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
30  import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
31  import org.apache.commons.compress.compressors.gzip.GzipUtils;
32  import org.apache.tika.exception.TikaException;
33  import org.apache.tika.io.CloseShieldInputStream;
34  import org.apache.tika.metadata.Metadata;
35  import org.apache.tika.parser.EmptyParser;
36  import org.apache.tika.parser.ParseContext;
37  import org.apache.tika.parser.Parser;
38  import org.apache.tika.sax.BodyContentHandler;
39  import org.apache.tika.sax.EmbeddedContentHandler;
40  import org.apache.tika.sax.XHTMLContentHandler;
41  import org.xml.sax.ContentHandler;
42  import org.xml.sax.SAXException;
43  
44  /**
45   * Extractor for packaging and compression formats.
46   */
47  class PackageExtractor {
48  
49      private final ContentHandler handler;
50  
51      private final Metadata metadata;
52  
53      private final ParseContext context;
54  
55      private final Parser parser;
56  
57      public PackageExtractor(
58              ContentHandler handler, Metadata metadata, ParseContext context) {
59          this.handler = handler;
60          this.metadata = metadata;
61          this.context = context;
62          this.parser = context.get(Parser.class, EmptyParser.INSTANCE);
63      }
64  
65      public void parse(InputStream stream)
66              throws IOException, SAXException, TikaException {
67          XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
68          xhtml.startDocument();
69  
70          // At the end we want to close the package/compression stream to
71          // release any associated resources, but the underlying document
72          // stream should not be closed
73          stream = new CloseShieldInputStream(stream);
74  
75          // Capture two bytes to determine the packaging/compression format
76          if (!stream.markSupported()) {
77              stream = new BufferedInputStream(stream);
78          }
79          stream.mark(2);
80          int a = stream.read();
81          int b = stream.read();
82          stream.reset();
83  
84          // Select decompression or unpacking mechanism based on the two bytes
85          if (a == 'B' && b == 'Z') {
86              metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");
87              decompress(new BZip2CompressorInputStream(stream), xhtml);
88          } else if (a == 0x1f && b == 0x8b) {
89              metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip");
90              decompress(new GZIPInputStream(stream), xhtml);
91          } else if (a == 'P' && b == 'K') {
92              metadata.set(Metadata.CONTENT_TYPE, "application/zip");
93              unpack(new ZipArchiveInputStream(stream), xhtml);
94          } else if ((a == '0' && b == '7')
95                  || (a == 0x71 && b == 0xc7)
96                  || (a == 0xc7 && b == 0x71)) {
97              metadata.set(Metadata.CONTENT_TYPE, "application/x-cpio");
98              unpack(new CpioArchiveInputStream(stream), xhtml);
99          } else if (a == '=' && (b == '<' || b == '!')) {
100             metadata.set(Metadata.CONTENT_TYPE, "application/x-archive");
101             unpack(new ArArchiveInputStream(stream), xhtml);
102         } else {
103             metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
104             unpack(new TarArchiveInputStream(stream), xhtml);
105         }
106 
107         xhtml.endDocument();
108     }
109 
110     private void decompress(InputStream stream, XHTMLContentHandler xhtml)
111             throws IOException, SAXException, TikaException {
112         try {
113             Metadata entrydata = new Metadata();
114             String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
115             if (name != null) {
116                 if (name.endsWith(".tbz")) {
117                     name = name.substring(0, name.length() - 4) + ".tar";
118                 } else if (name.endsWith(".tbz2")) {
119                     name = name.substring(0, name.length() - 5) + ".tar";
120                 } else if (name.endsWith(".bz")) {
121                     name = name.substring(0, name.length() - 3);
122                 } else if (name.endsWith(".bz2")) {
123                     name = name.substring(0, name.length() - 4);
124                 } else if (name.length() > 0) {
125                     name = GzipUtils.getUncompressedFilename(name);
126                 }
127                 entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
128             }
129             // Use the delegate parser to parse the compressed document
130             parser.parse(
131                     new CloseShieldInputStream(stream),
132                     new EmbeddedContentHandler(
133                             new BodyContentHandler(xhtml)),
134                     entrydata, context);
135         } finally {
136             stream.close();
137         }
138     }
139 
140     /**
141      * Parses the given stream as a package of multiple underlying files.
142      * The package entries are parsed using the delegate parser instance.
143      * It is not an error if the entry can not be parsed, in that case
144      * just the entry name (if given) is emitted.
145      *
146      * @param stream package stream
147      * @param handler content handler
148      * @param metadata package metadata
149      * @throws IOException if an IO error occurs
150      * @throws SAXException if a SAX error occurs
151      */
152     public void unpack(ArchiveInputStream archive, XHTMLContentHandler xhtml)
153             throws IOException, SAXException {
154         try {
155             ArchiveEntry entry = archive.getNextEntry();
156             while (entry != null) {
157                 if (!entry.isDirectory()) {
158                     xhtml.startElement("div", "class", "package-entry");
159                     Metadata entrydata = new Metadata();
160                     String name = entry.getName();
161                     if (name != null && name.length() > 0) {
162                         entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
163                         xhtml.element("h1", name);
164                     }
165                     try {
166                         // Use the delegate parser to parse this entry
167                         parser.parse(
168                                 new CloseShieldInputStream(archive),
169                                 new EmbeddedContentHandler(
170                                         new BodyContentHandler(xhtml)),
171                                         entrydata, context);
172                     } catch (TikaException e) {
173                         // Could not parse the entry, just skip the content
174                     }
175                     xhtml.endElement("div");
176                 }
177                 entry = archive.getNextEntry();
178             }
179         } finally {
180             archive.close();
181         }
182     }
183 
184 }