1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.microsoft.ooxml;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.Locale;
22
23 import org.apache.poi.POIXMLDocument;
24 import org.apache.poi.POIXMLTextExtractor;
25 import org.apache.poi.extractor.ExtractorFactory;
26 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
27 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
28 import org.apache.poi.xslf.XSLFSlideShow;
29 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
30 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
31 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
32 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
33 import org.apache.poi.xwpf.usermodel.XWPFDocument;
34 import org.apache.tika.exception.TikaException;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.xmlbeans.XmlException;
37 import org.xml.sax.ContentHandler;
38 import org.xml.sax.SAXException;
39
40
41
42
43
44 public class OOXMLExtractorFactory {
45
46 public static void parse(
47 InputStream stream, ContentHandler handler,
48 Metadata metadata, Locale locale)
49 throws IOException, SAXException, TikaException {
50 try {
51 OOXMLExtractor extractor;
52
53 POIXMLTextExtractor poiExtractor =
54 (POIXMLTextExtractor) ExtractorFactory.createExtractor(stream);
55 POIXMLDocument document = poiExtractor.getDocument();
56 if (document instanceof XSLFSlideShow) {
57 extractor = new XSLFPowerPointExtractorDecorator(
58 (XSLFPowerPointExtractor) poiExtractor);
59 } else if (document instanceof XSSFWorkbook) {
60 extractor = new XSSFExcelExtractorDecorator(
61 (XSSFExcelExtractor) poiExtractor, locale);
62 } else if (document instanceof XWPFDocument) {
63 extractor = new XWPFWordExtractorDecorator(
64 (XWPFWordExtractor) poiExtractor);
65 } else {
66 extractor = new POIXMLTextExtractorDecorator(poiExtractor);
67 }
68
69 extractor.getMetadataExtractor().extract(metadata);
70 extractor.getXHTML(handler, metadata);
71 } catch (InvalidFormatException e) {
72 throw new TikaException("Error creating OOXML extractor", e);
73 } catch (OpenXML4JException e) {
74 throw new TikaException("Error creating OOXML extractor", e);
75 } catch (XmlException e) {
76 throw new TikaException("Error creating OOXML extractor", e);
77
78 }
79 }
80
81 }