1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.pkg;
18
19 import java.io.BufferedInputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.util.zip.GZIPInputStream;
23
24 import org.apache.commons.compress.archivers.ArchiveEntry;
25 import org.apache.commons.compress.archivers.ArchiveInputStream;
26 import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
27 import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
28 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
29 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
30 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
31 import org.apache.commons.compress.compressors.gzip.GzipUtils;
32 import org.apache.tika.exception.TikaException;
33 import org.apache.tika.io.CloseShieldInputStream;
34 import org.apache.tika.metadata.Metadata;
35 import org.apache.tika.parser.EmptyParser;
36 import org.apache.tika.parser.ParseContext;
37 import org.apache.tika.parser.Parser;
38 import org.apache.tika.sax.BodyContentHandler;
39 import org.apache.tika.sax.EmbeddedContentHandler;
40 import org.apache.tika.sax.XHTMLContentHandler;
41 import org.xml.sax.ContentHandler;
42 import org.xml.sax.SAXException;
43
44
45
46
47 class PackageExtractor {
48
49 private final ContentHandler handler;
50
51 private final Metadata metadata;
52
53 private final ParseContext context;
54
55 private final Parser parser;
56
57 public PackageExtractor(
58 ContentHandler handler, Metadata metadata, ParseContext context) {
59 this.handler = handler;
60 this.metadata = metadata;
61 this.context = context;
62 this.parser = context.get(Parser.class, EmptyParser.INSTANCE);
63 }
64
65 public void parse(InputStream stream)
66 throws IOException, SAXException, TikaException {
67 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
68 xhtml.startDocument();
69
70
71
72
73 stream = new CloseShieldInputStream(stream);
74
75
76 if (!stream.markSupported()) {
77 stream = new BufferedInputStream(stream);
78 }
79 stream.mark(2);
80 int a = stream.read();
81 int b = stream.read();
82 stream.reset();
83
84
85 if (a == 'B' && b == 'Z') {
86 metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");
87 decompress(new BZip2CompressorInputStream(stream), xhtml);
88 } else if (a == 0x1f && b == 0x8b) {
89 metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip");
90 decompress(new GZIPInputStream(stream), xhtml);
91 } else if (a == 'P' && b == 'K') {
92 metadata.set(Metadata.CONTENT_TYPE, "application/zip");
93 unpack(new ZipArchiveInputStream(stream), xhtml);
94 } else if ((a == '0' && b == '7')
95 || (a == 0x71 && b == 0xc7)
96 || (a == 0xc7 && b == 0x71)) {
97 metadata.set(Metadata.CONTENT_TYPE, "application/x-cpio");
98 unpack(new CpioArchiveInputStream(stream), xhtml);
99 } else if (a == '=' && (b == '<' || b == '!')) {
100 metadata.set(Metadata.CONTENT_TYPE, "application/x-archive");
101 unpack(new ArArchiveInputStream(stream), xhtml);
102 } else {
103 metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
104 unpack(new TarArchiveInputStream(stream), xhtml);
105 }
106
107 xhtml.endDocument();
108 }
109
110 private void decompress(InputStream stream, XHTMLContentHandler xhtml)
111 throws IOException, SAXException, TikaException {
112 try {
113 Metadata entrydata = new Metadata();
114 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
115 if (name != null) {
116 if (name.endsWith(".tbz")) {
117 name = name.substring(0, name.length() - 4) + ".tar";
118 } else if (name.endsWith(".tbz2")) {
119 name = name.substring(0, name.length() - 5) + ".tar";
120 } else if (name.endsWith(".bz")) {
121 name = name.substring(0, name.length() - 3);
122 } else if (name.endsWith(".bz2")) {
123 name = name.substring(0, name.length() - 4);
124 } else if (name.length() > 0) {
125 name = GzipUtils.getUncompressedFilename(name);
126 }
127 entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
128 }
129
130 parser.parse(
131 new CloseShieldInputStream(stream),
132 new EmbeddedContentHandler(
133 new BodyContentHandler(xhtml)),
134 entrydata, context);
135 } finally {
136 stream.close();
137 }
138 }
139
140
141
142
143
144
145
146
147
148
149
150
151
152 public void unpack(ArchiveInputStream archive, XHTMLContentHandler xhtml)
153 throws IOException, SAXException {
154 try {
155 ArchiveEntry entry = archive.getNextEntry();
156 while (entry != null) {
157 if (!entry.isDirectory()) {
158 xhtml.startElement("div", "class", "package-entry");
159 Metadata entrydata = new Metadata();
160 String name = entry.getName();
161 if (name != null && name.length() > 0) {
162 entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
163 xhtml.element("h1", name);
164 }
165 try {
166
167 parser.parse(
168 new CloseShieldInputStream(archive),
169 new EmbeddedContentHandler(
170 new BodyContentHandler(xhtml)),
171 entrydata, context);
172 } catch (TikaException e) {
173
174 }
175 xhtml.endElement("div");
176 }
177 entry = archive.getNextEntry();
178 }
179 } finally {
180 archive.close();
181 }
182 }
183
184 }