View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft.ooxml;
18  
19  import java.io.IOException;
20  
21  import org.apache.poi.POIXMLDocument;
22  import org.apache.poi.POIXMLTextExtractor;
23  import org.apache.tika.metadata.Metadata;
24  import org.apache.tika.sax.XHTMLContentHandler;
25  import org.apache.xmlbeans.XmlException;
26  import org.xml.sax.ContentHandler;
27  import org.xml.sax.SAXException;
28  
29  /**
30   * Base class for all Tika OOXML extractors.
31   * 
32   * Tika extractors decorate POI extractors so that the parsed content of
33   * documents is returned as a sequence of XHTML SAX events. Subclasses must
34   * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that
35   * populates the {@link XHTMLContentHandler} object received as parameter.
36   */
37  public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
38      protected POIXMLTextExtractor extractor;
39  
40      private final String type;
41  
42      public AbstractOOXMLExtractor(POIXMLTextExtractor extractor, String type) {
43          this.extractor = extractor;
44          this.type = type;
45      }
46  
47      /**
48       * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
49       */
50      public POIXMLDocument getDocument() {
51          return extractor.getDocument();
52      }
53  
54      /**
55       * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
56       */
57      public MetadataExtractor getMetadataExtractor() {
58          return new MetadataExtractor(extractor, type);
59      }
60  
61      /**
62       * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
63       *      org.apache.tika.metadata.Metadata)
64       */
65      public void getXHTML(ContentHandler handler, Metadata metadata)
66              throws SAXException, XmlException, IOException {
67          XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
68          xhtml.startDocument();
69          buildXHTML(xhtml);
70          xhtml.endDocument();
71      }
72  
73      /**
74       * Populates the {@link XHTMLContentHandler} object received as parameter.
75       */
76      protected abstract void buildXHTML(XHTMLContentHandler xhtml)
77              throws SAXException, XmlException, IOException;
78  }