1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.microsoft.ooxml;
18
19 import java.io.IOException;
20
21 import org.apache.poi.xslf.XSLFSlideShow;
22 import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
23 import org.apache.poi.xslf.usermodel.XMLSlideShow;
24 import org.apache.poi.xslf.usermodel.XSLFSlide;
25 import org.apache.tika.sax.XHTMLContentHandler;
26 import org.apache.xmlbeans.XmlException;
27 import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
28 import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
29 import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
30 import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
31 import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
32 import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
33 import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
34 import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
35 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
36 import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
37 import org.xml.sax.SAXException;
38
39 public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
40
41 public XSLFPowerPointExtractorDecorator(XSLFPowerPointExtractor extractor) {
42 super(extractor, "application/vnd.openxmlformats-officedocument.presentationml.presentation");
43 }
44
45
46
47
48 @Override
49 protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
50 XmlException, IOException {
51 XSLFSlideShow slideShow = (XSLFSlideShow) extractor.getDocument();
52 XMLSlideShow xmlSlideShow = new XMLSlideShow(slideShow);
53
54 XSLFSlide[] slides = xmlSlideShow.getSlides();
55 for (XSLFSlide slide : slides) {
56 CTSlide rawSlide = slide._getCTSlide();
57 CTSlideIdListEntry slideId = slide._getCTSlideId();
58
59 CTNotesSlide notes = xmlSlideShow._getXSLFSlideShow().getNotes(
60 slideId);
61 CTCommentList comments = xmlSlideShow._getXSLFSlideShow()
62 .getSlideComments(slideId);
63
64 xhtml.startElement("div");
65 extractShapeContent(rawSlide.getCSld().getSpTree(), xhtml);
66
67 if (comments != null) {
68 for (CTComment comment : comments.getCmArray()) {
69 xhtml.element("p", comment.getText());
70 }
71 }
72
73 if (notes != null) {
74 extractShapeContent(notes.getCSld().getSpTree(), xhtml);
75 }
76 xhtml.endElement("div");
77 }
78 }
79
80 private void extractShapeContent(CTGroupShape gs, XHTMLContentHandler xhtml)
81 throws SAXException {
82 CTShape[] shapes = gs.getSpArray();
83 for (CTShape shape : shapes) {
84 CTTextBody textBody = shape.getTxBody();
85 if (textBody != null) {
86 CTTextParagraph[] paras = textBody.getPArray();
87 for (CTTextParagraph textParagraph : paras) {
88 CTRegularTextRun[] textRuns = textParagraph.getRArray();
89 for (CTRegularTextRun textRun : textRuns) {
90 xhtml.element("p", textRun.getT());
91 }
92 }
93 }
94 }
95 }
96 }