1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.microsoft;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Arrays;
22 import java.util.Collections;
23 import java.util.HashSet;
24 import java.util.Iterator;
25 import java.util.Locale;
26 import java.util.Set;
27
28 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
29 import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
30 import org.apache.poi.hslf.extractor.PowerPointExtractor;
31 import org.apache.poi.hwpf.extractor.WordExtractor;
32 import org.apache.poi.poifs.filesystem.DirectoryEntry;
33 import org.apache.poi.poifs.filesystem.DocumentEntry;
34 import org.apache.poi.poifs.filesystem.Entry;
35 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
36 import org.apache.tika.exception.TikaException;
37 import org.apache.tika.metadata.Metadata;
38 import org.apache.tika.mime.MediaType;
39 import org.apache.tika.parser.ParseContext;
40 import org.apache.tika.parser.Parser;
41 import org.apache.tika.sax.XHTMLContentHandler;
42 import org.xml.sax.ContentHandler;
43 import org.xml.sax.SAXException;
44
45
46
47
48 public class OfficeParser implements Parser {
49
50 private static final Set<MediaType> SUPPORTED_TYPES =
51 Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
52 MediaType.application("x-tika-msoffice"),
53 MediaType.application("vnd.visio"),
54 MediaType.application("vnd.ms-powerpoint"),
55 MediaType.application("vnd.ms-excel"),
56 MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"),
57 MediaType.application("msword"),
58 MediaType.application("vnd.ms-outlook"))));
59
60 public Set<MediaType> getSupportedTypes(ParseContext context) {
61 return SUPPORTED_TYPES;
62 }
63
64
65
66
67 public void parse(
68 InputStream stream, ContentHandler handler,
69 Metadata metadata, ParseContext context)
70 throws IOException, SAXException, TikaException {
71 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
72 xhtml.startDocument();
73
74 POIFSFileSystem filesystem = new POIFSFileSystem(stream);
75
76
77 new SummaryExtractor(metadata).parseSummaries(filesystem);
78
79
80 boolean outlookExtracted = false;
81 Iterator<?> entries = filesystem.getRoot().getEntries();
82 while (entries.hasNext()) {
83 Entry entry = (Entry) entries.next();
84 String name = entry.getName();
85 if (entry instanceof DirectoryEntry) {
86 if ("Quill".equals(name)) {
87 setType(metadata, "application/x-mspublisher");
88 PublisherTextExtractor extractor =
89 new PublisherTextExtractor(filesystem);
90 xhtml.element("p", extractor.getText());
91 }
92 } else if (entry instanceof DocumentEntry) {
93 if ("WordDocument".equals(name)) {
94 setType(metadata, "application/msword");
95 WordExtractor extractor = new WordExtractor(filesystem);
96
97 addTextIfAny(xhtml, "header", extractor.getHeaderText());
98
99 for (String paragraph : extractor.getParagraphText()) {
100 xhtml.element("p", paragraph);
101 }
102
103 for (String paragraph : extractor.getFootnoteText()) {
104 xhtml.element("p", paragraph);
105 }
106
107 for (String paragraph : extractor.getCommentsText()) {
108 xhtml.element("p", paragraph);
109 }
110
111 for (String paragraph : extractor.getEndnoteText()) {
112 xhtml.element("p", paragraph);
113 }
114
115 addTextIfAny(xhtml, "footer", extractor.getFooterText());
116 } else if ("PowerPoint Document".equals(name)) {
117 setType(metadata, "application/vnd.ms-powerpoint");
118 PowerPointExtractor extractor =
119 new PowerPointExtractor(filesystem);
120 xhtml.element("p", extractor.getText(true, true));
121 } else if ("Workbook".equals(name)) {
122 setType(metadata, "application/vnd.ms-excel");
123 Locale locale = context.get(Locale.class, Locale.getDefault());
124 new ExcelExtractor().parse(filesystem, xhtml, locale);
125 } else if ("VisioDocument".equals(name)) {
126 setType(metadata, "application/vnd.visio");
127 VisioTextExtractor extractor =
128 new VisioTextExtractor(filesystem);
129 for (String text : extractor.getAllText()) {
130 xhtml.element("p", text);
131 }
132 } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
133
134 outlookExtracted = true;
135 setType(metadata, "application/vnd.ms-outlook");
136 new OutlookExtractor(filesystem).parse(xhtml, metadata);
137 }
138 }
139 }
140
141 xhtml.endDocument();
142 }
143
144
145
146
147 public void parse(
148 InputStream stream, ContentHandler handler, Metadata metadata)
149 throws IOException, SAXException, TikaException {
150 parse(stream, handler, metadata, new ParseContext());
151 }
152
153 private void setType(Metadata metadata, String type) {
154 metadata.set(Metadata.CONTENT_TYPE, type);
155 }
156
157
158
159
160
161
162
163
164
165 private void addTextIfAny(
166 XHTMLContentHandler xhtml, String section, String text)
167 throws SAXException {
168 if (text != null && text.length() > 0) {
169 xhtml.startElement("div", "class", section);
170 xhtml.element("p", text);
171 xhtml.endElement("div");
172 }
173 }
174
175 }