View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.util.Arrays;
22  import java.util.Collections;
23  import java.util.HashSet;
24  import java.util.Iterator;
25  import java.util.Locale;
26  import java.util.Set;
27  
28  import org.apache.poi.hdgf.extractor.VisioTextExtractor;
29  import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
30  import org.apache.poi.hslf.extractor.PowerPointExtractor;
31  import org.apache.poi.hwpf.extractor.WordExtractor;
32  import org.apache.poi.poifs.filesystem.DirectoryEntry;
33  import org.apache.poi.poifs.filesystem.DocumentEntry;
34  import org.apache.poi.poifs.filesystem.Entry;
35  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
36  import org.apache.tika.exception.TikaException;
37  import org.apache.tika.metadata.Metadata;
38  import org.apache.tika.mime.MediaType;
39  import org.apache.tika.parser.ParseContext;
40  import org.apache.tika.parser.Parser;
41  import org.apache.tika.sax.XHTMLContentHandler;
42  import org.xml.sax.ContentHandler;
43  import org.xml.sax.SAXException;
44  
45  /**
46   * Defines a Microsoft document content extractor.
47   */
48  public class OfficeParser implements Parser {
49  
50      private static final Set<MediaType> SUPPORTED_TYPES =
51          Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
52                  MediaType.application("x-tika-msoffice"),
53                  MediaType.application("vnd.visio"),
54                  MediaType.application("vnd.ms-powerpoint"),
55                  MediaType.application("vnd.ms-excel"),
56                  MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"),
57                  MediaType.application("msword"),
58                  MediaType.application("vnd.ms-outlook"))));
59  
60      public Set<MediaType> getSupportedTypes(ParseContext context) {
61          return SUPPORTED_TYPES;
62      }
63  
64      /**
65       * Extracts properties and text from an MS Document input stream
66       */
67      public void parse(
68              InputStream stream, ContentHandler handler,
69              Metadata metadata, ParseContext context)
70              throws IOException, SAXException, TikaException {
71          XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
72          xhtml.startDocument();
73  
74          POIFSFileSystem filesystem = new POIFSFileSystem(stream);
75  
76          // Parse summary entries first, to make metadata available early
77          new SummaryExtractor(metadata).parseSummaries(filesystem);
78  
79          // Parse remaining document entries
80          boolean outlookExtracted = false;
81          Iterator<?> entries = filesystem.getRoot().getEntries();
82          while (entries.hasNext()) {
83              Entry entry = (Entry) entries.next();
84              String name = entry.getName();
85              if (entry instanceof DirectoryEntry) {
86                 if ("Quill".equals(name)) {
87                    setType(metadata, "application/x-mspublisher");
88                    PublisherTextExtractor extractor =
89                        new PublisherTextExtractor(filesystem);
90                    xhtml.element("p", extractor.getText());
91                 }
92              } else if (entry instanceof DocumentEntry) {
93                 if ("WordDocument".equals(name)) {
94                     setType(metadata, "application/msword");
95                     WordExtractor extractor = new WordExtractor(filesystem);
96  
97                     addTextIfAny(xhtml, "header", extractor.getHeaderText());
98  
99                     for (String paragraph : extractor.getParagraphText()) {
100                        xhtml.element("p", paragraph);
101                    }
102 
103                    for (String paragraph : extractor.getFootnoteText()) {
104                        xhtml.element("p", paragraph);
105                    }
106 
107                    for (String paragraph : extractor.getCommentsText()) {
108                        xhtml.element("p", paragraph);
109                    }
110 
111                    for (String paragraph : extractor.getEndnoteText()) {
112                        xhtml.element("p", paragraph);
113                    }
114 
115                    addTextIfAny(xhtml, "footer", extractor.getFooterText());
116                } else if ("PowerPoint Document".equals(name)) {
117                    setType(metadata, "application/vnd.ms-powerpoint");
118                    PowerPointExtractor extractor =
119                        new PowerPointExtractor(filesystem);
120                    xhtml.element("p", extractor.getText(true, true));
121                } else if ("Workbook".equals(name)) {
122                    setType(metadata, "application/vnd.ms-excel");
123                    Locale locale = context.get(Locale.class, Locale.getDefault());
124                    new ExcelExtractor().parse(filesystem, xhtml, locale);
125                } else if ("VisioDocument".equals(name)) {
126                    setType(metadata, "application/vnd.visio");
127                    VisioTextExtractor extractor =
128                        new VisioTextExtractor(filesystem);
129                    for (String text : extractor.getAllText()) {
130                        xhtml.element("p", text);
131                    }
132                } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
133                    // TODO: Cleaner mechanism for detecting Outlook
134                    outlookExtracted = true;
135                    setType(metadata, "application/vnd.ms-outlook");
136                    new OutlookExtractor(filesystem).parse(xhtml, metadata);
137                }
138             }
139         }
140 
141         xhtml.endDocument();
142     }
143 
144     /**
145      * @deprecated This method will be removed in Apache Tika 1.0.
146      */
147     public void parse(
148             InputStream stream, ContentHandler handler, Metadata metadata)
149             throws IOException, SAXException, TikaException {
150         parse(stream, handler, metadata, new ParseContext());
151     }
152 
153     private void setType(Metadata metadata, String type) {
154         metadata.set(Metadata.CONTENT_TYPE, type);
155     }
156 
157     /**
158      * Outputs a section of text if the given text is non-empty.
159      *
160      * @param xhtml XHTML content handler
161      * @param section the class of the &lt;div/&gt; section emitted
162      * @param text text to be emitted, if any
163      * @throws SAXException if an error occurs
164      */
165     private void addTextIfAny(
166             XHTMLContentHandler xhtml, String section, String text)
167             throws SAXException {
168         if (text != null && text.length() > 0) {
169             xhtml.startElement("div", "class", section);
170             xhtml.element("p", text);
171             xhtml.endElement("div");
172         }
173     }
174 
175 }