1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.epub;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Collections;
22 import java.util.Set;
23 import java.util.zip.ZipEntry;
24 import java.util.zip.ZipInputStream;
25
26 import org.apache.tika.exception.TikaException;
27 import org.apache.tika.io.IOUtils;
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.mime.MediaType;
30 import org.apache.tika.parser.ParseContext;
31 import org.apache.tika.parser.Parser;
32 import org.apache.tika.parser.xml.DcXMLParser;
33 import org.xml.sax.ContentHandler;
34 import org.xml.sax.SAXException;
35 import org.xml.sax.helpers.DefaultHandler;
36
37
38
39
40 public class EpubParser implements Parser {
41
42 private static final Set<MediaType> SUPPORTED_TYPES =
43 Collections.singleton(MediaType.application("epub+zip"));
44
45 private Parser meta = new DcXMLParser();
46
47 private Parser content = new EpubContentParser();
48
49 public Parser getMetaParser() {
50 return meta;
51 }
52
53 public void setMetaParser(Parser meta) {
54 this.meta = meta;
55 }
56
57 public Parser getContentParser() {
58 return content;
59 }
60
61 public void setContentParser(Parser content) {
62 this.content = content;
63 }
64
65 public Set<MediaType> getSupportedTypes(ParseContext context) {
66 return SUPPORTED_TYPES;
67 }
68
69 public void parse(
70 InputStream stream, ContentHandler handler,
71 Metadata metadata, ParseContext context)
72 throws IOException, SAXException, TikaException {
73 ZipInputStream zip = new ZipInputStream(stream);
74 ZipEntry entry = zip.getNextEntry();
75 while (entry != null) {
76 if (entry.getName().equals("mimetype")) {
77 String type = IOUtils.toString(zip, "UTF-8");
78 metadata.set(Metadata.CONTENT_TYPE, type);
79 } else if (entry.getName().equals("metadata.xml")) {
80 meta.parse(zip, new DefaultHandler(), metadata, context);
81 } else if (entry.getName().endsWith(".opf")) {
82 meta.parse(zip, new DefaultHandler(), metadata, context);
83 } else if (entry.getName().endsWith(".html")) {
84 content.parse(zip, handler, metadata, context);
85 }
86 entry = zip.getNextEntry();
87 }
88 }
89
90
91
92
93 public void parse(
94 InputStream stream, ContentHandler handler, Metadata metadata)
95 throws IOException, SAXException, TikaException {
96 parse(stream, handler, metadata, new ParseContext());
97 }
98
99 }