1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.tika.parser.microsoft.ooxml; 18 19 import java.io.IOException; 20 21 import org.apache.poi.POIXMLDocument; 22 import org.apache.poi.POIXMLTextExtractor; 23 import org.apache.tika.metadata.Metadata; 24 import org.apache.tika.sax.XHTMLContentHandler; 25 import org.apache.xmlbeans.XmlException; 26 import org.xml.sax.ContentHandler; 27 import org.xml.sax.SAXException; 28 29 /** 30 * Base class for all Tika OOXML extractors. 31 * 32 * Tika extractors decorate POI extractors so that the parsed content of 33 * documents is returned as a sequence of XHTML SAX events. Subclasses must 34 * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that 35 * populates the {@link XHTMLContentHandler} object received as parameter. 36 */ 37 public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { 38 protected POIXMLTextExtractor extractor; 39 40 private final String type; 41 42 public AbstractOOXMLExtractor(POIXMLTextExtractor extractor, String type) { 43 this.extractor = extractor; 44 this.type = type; 45 } 46 47 /** 48 * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument() 49 */ 50 public POIXMLDocument getDocument() { 51 return extractor.getDocument(); 52 } 53 54 /** 55 * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor() 56 */ 57 public MetadataExtractor getMetadataExtractor() { 58 return new MetadataExtractor(extractor, type); 59 } 60 61 /** 62 * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler, 63 * org.apache.tika.metadata.Metadata) 64 */ 65 public void getXHTML(ContentHandler handler, Metadata metadata) 66 throws SAXException, XmlException, IOException { 67 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); 68 xhtml.startDocument(); 69 buildXHTML(xhtml); 70 xhtml.endDocument(); 71 } 72 73 /** 74 * Populates the {@link XHTMLContentHandler} object received as parameter. 75 */ 76 protected abstract void buildXHTML(XHTMLContentHandler xhtml) 77 throws SAXException, XmlException, IOException; 78 }