View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft;
18  
19  import java.io.IOException;
20  
21  import org.apache.poi.hsmf.datatypes.Chunks;
22  import org.apache.poi.hsmf.datatypes.StringChunk;
23  import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
24  import org.apache.poi.hsmf.parsers.POIFSChunkParser;
25  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
26  import org.apache.tika.exception.TikaException;
27  import org.apache.tika.metadata.Metadata;
28  import org.apache.tika.sax.XHTMLContentHandler;
29  import org.xml.sax.SAXException;
30  
31  /**
32   * Outlook Message Parser.
33   */
34  class OutlookExtractor {
35  
36      private final Chunks chunks;
37  
38      private final POIFSChunkParser parser;
39  
40      public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
41          try {
42              this.parser = new POIFSChunkParser(filesystem);
43              this.chunks = parser.identifyChunks();
44          } catch (IOException e) {
45              throw new TikaException("Failed to parse Outlook chunks", e);
46          }
47      }
48  
49      public void parse(XHTMLContentHandler xhtml, Metadata metadata)
50              throws TikaException, SAXException {
51          String subject = getChunk(chunks.subjectChunk);
52          String from = getChunk(chunks.displayFromChunk);
53  
54          metadata.set(Metadata.AUTHOR, from);
55          metadata.set(Metadata.TITLE, subject);
56          metadata.set(Metadata.SUBJECT, getChunk(chunks.conversationTopic));
57  
58          xhtml.element("h1", subject);
59  
60          xhtml.startElement("dl");
61          header(xhtml, "From", from);
62          header(xhtml, "To", getChunk(chunks.displayToChunk));
63          header(xhtml, "Cc", getChunk(chunks.displayCCChunk));
64          header(xhtml, "Bcc", getChunk(chunks.displayBCCChunk));
65          xhtml.endElement("dl");
66  
67          xhtml.element("p", getChunk(chunks.textBodyChunk));
68      }
69  
70      private void header(XHTMLContentHandler xhtml, String key, String value)
71              throws SAXException {
72          if (value.length() > 0) {
73              xhtml.element("dt", key);
74              xhtml.element("dd", value);
75          }
76      }
77  
78      /**
79       * Returns the content of the identified string chunk in the
80       * current document. Returns the empty string if the identified
81       * chunk does not exist in the current document.
82       *
83       * @param chunk string chunk identifier
84       * @return content of the identified chunk, or the empty string
85       */
86      private String getChunk(StringChunk chunk) {
87          try {
88              return parser.getDocumentNode(chunk).toString();
89          } catch (ChunkNotFoundException e) {
90              return "";
91          }
92      }
93  
94  }