1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.microsoft;
18
19 import java.io.IOException;
20
21 import org.apache.poi.hsmf.datatypes.Chunks;
22 import org.apache.poi.hsmf.datatypes.StringChunk;
23 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
24 import org.apache.poi.hsmf.parsers.POIFSChunkParser;
25 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
26 import org.apache.tika.exception.TikaException;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.sax.XHTMLContentHandler;
29 import org.xml.sax.SAXException;
30
31
32
33
34 class OutlookExtractor {
35
36 private final Chunks chunks;
37
38 private final POIFSChunkParser parser;
39
40 public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
41 try {
42 this.parser = new POIFSChunkParser(filesystem);
43 this.chunks = parser.identifyChunks();
44 } catch (IOException e) {
45 throw new TikaException("Failed to parse Outlook chunks", e);
46 }
47 }
48
49 public void parse(XHTMLContentHandler xhtml, Metadata metadata)
50 throws TikaException, SAXException {
51 String subject = getChunk(chunks.subjectChunk);
52 String from = getChunk(chunks.displayFromChunk);
53
54 metadata.set(Metadata.AUTHOR, from);
55 metadata.set(Metadata.TITLE, subject);
56 metadata.set(Metadata.SUBJECT, getChunk(chunks.conversationTopic));
57
58 xhtml.element("h1", subject);
59
60 xhtml.startElement("dl");
61 header(xhtml, "From", from);
62 header(xhtml, "To", getChunk(chunks.displayToChunk));
63 header(xhtml, "Cc", getChunk(chunks.displayCCChunk));
64 header(xhtml, "Bcc", getChunk(chunks.displayBCCChunk));
65 xhtml.endElement("dl");
66
67 xhtml.element("p", getChunk(chunks.textBodyChunk));
68 }
69
70 private void header(XHTMLContentHandler xhtml, String key, String value)
71 throws SAXException {
72 if (value.length() > 0) {
73 xhtml.element("dt", key);
74 xhtml.element("dd", value);
75 }
76 }
77
78
79
80
81
82
83
84
85
86 private String getChunk(StringChunk chunk) {
87 try {
88 return parser.getDocumentNode(chunk).toString();
89 } catch (ChunkNotFoundException e) {
90 return "";
91 }
92 }
93
94 }