View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft;
18  
19  import java.io.FileNotFoundException;
20  import java.io.IOException;
21  import java.util.Date;
22  
23  import org.apache.poi.hpsf.CustomProperties;
24  import org.apache.poi.hpsf.DocumentSummaryInformation;
25  import org.apache.poi.hpsf.MarkUnsupportedException;
26  import org.apache.poi.hpsf.NoPropertySetStreamException;
27  import org.apache.poi.hpsf.PropertySet;
28  import org.apache.poi.hpsf.SummaryInformation;
29  import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
30  import org.apache.poi.poifs.filesystem.DocumentEntry;
31  import org.apache.poi.poifs.filesystem.DocumentInputStream;
32  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
33  import org.apache.tika.exception.TikaException;
34  import org.apache.tika.metadata.Metadata;
35  
36  /**
37   * Outlook Message Parser.
38   */
39  class SummaryExtractor {
40  
41      private static final String SUMMARY_INFORMATION =
42          SummaryInformation.DEFAULT_STREAM_NAME;
43  
44      private static final String DOCUMENT_SUMMARY_INFORMATION =
45          DocumentSummaryInformation.DEFAULT_STREAM_NAME;
46  
47      private final Metadata metadata;
48  
49      public SummaryExtractor(Metadata metadata) {
50          this.metadata = metadata;
51      }
52  
53      public void parseSummaries(POIFSFileSystem filesystem)
54              throws IOException, TikaException {
55          parseSummaryEntryIfExists(filesystem, SUMMARY_INFORMATION);
56          parseSummaryEntryIfExists(filesystem, DOCUMENT_SUMMARY_INFORMATION);
57      }
58  
59      private void parseSummaryEntryIfExists(
60              POIFSFileSystem filesystem, String entryName)
61              throws IOException, TikaException {
62          try {
63              DocumentEntry entry =
64                  (DocumentEntry) filesystem.getRoot().getEntry(entryName);
65              PropertySet properties =
66                  new PropertySet(new DocumentInputStream(entry));
67              if (properties.isSummaryInformation()) {
68                  parse(new SummaryInformation(properties));
69              }
70              if (properties.isDocumentSummaryInformation()) {
71                  parse(new DocumentSummaryInformation(properties));
72              }
73          } catch (FileNotFoundException e) {
74              // entry does not exist, just skip it
75          } catch (NoPropertySetStreamException e) {
76              throw new TikaException("Not a HPSF document", e);
77          } catch (UnexpectedPropertySetTypeException e) {
78              throw new TikaException("Unexpected HPSF document", e);
79          } catch (MarkUnsupportedException e) {
80              throw new TikaException("Invalid DocumentInputStream", e);
81          }
82      }
83  
84      private void parse(SummaryInformation summary) {
85          set(Metadata.TITLE, summary.getTitle());
86          set(Metadata.AUTHOR, summary.getAuthor());
87          set(Metadata.KEYWORDS, summary.getKeywords());
88          set(Metadata.SUBJECT, summary.getSubject());
89          set(Metadata.LAST_AUTHOR, summary.getLastAuthor());
90          set(Metadata.COMMENTS, summary.getComments());
91          set(Metadata.TEMPLATE, summary.getTemplate());
92          set(Metadata.APPLICATION_NAME, summary.getApplicationName());
93          set(Metadata.REVISION_NUMBER, summary.getRevNumber());
94          set(Metadata.CREATION_DATE, summary.getCreateDateTime());
95          set(Metadata.CHARACTER_COUNT, summary.getCharCount());
96          set(Metadata.EDIT_TIME, summary.getEditTime());
97          set(Metadata.LAST_SAVED, summary.getLastSaveDateTime());
98          set(Metadata.PAGE_COUNT, summary.getPageCount());
99          set(Metadata.SECURITY, summary.getSecurity());
100         set(Metadata.WORD_COUNT, summary.getWordCount());
101         set(Metadata.LAST_PRINTED, summary.getLastPrinted());
102     }
103 
104     private void parse(DocumentSummaryInformation summary) {
105         set(Metadata.COMPANY, summary.getCompany());
106         set(Metadata.MANAGER, summary.getManager());
107         set(Metadata.LANGUAGE, getLanguage(summary));
108         set(Metadata.CATEGORY, summary.getCategory());
109     }
110 
111     private String getLanguage(DocumentSummaryInformation summary) {
112         CustomProperties customProperties = summary.getCustomProperties();
113         if (customProperties != null) {
114             Object value = customProperties.get("Language");
115             if (value instanceof String) {
116                 return (String) value;
117             }
118         }
119         return null;
120     }
121 
122     private void set(String name, String value) {
123         if (value != null) {
124             metadata.set(name, value);
125         }
126     }
127 
128     private void set(String name, Date value) {
129         if (value != null) {
130             metadata.set(name, value.toString());
131         }
132     }
133 
134     private void set(String name, long value) {
135         if (value > 0) {
136             metadata.set(name, Long.toString(value));
137         }
138     }
139 
140 }