View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft.ooxml;
18  
19  import java.io.IOException;
20  
21  import org.apache.poi.xslf.XSLFSlideShow;
22  import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
23  import org.apache.poi.xslf.usermodel.XMLSlideShow;
24  import org.apache.poi.xslf.usermodel.XSLFSlide;
25  import org.apache.tika.sax.XHTMLContentHandler;
26  import org.apache.xmlbeans.XmlException;
27  import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
28  import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
29  import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
30  import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
31  import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
32  import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
33  import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
34  import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
35  import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
36  import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
37  import org.xml.sax.SAXException;
38  
39  public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
40  
41      public XSLFPowerPointExtractorDecorator(XSLFPowerPointExtractor extractor) {
42          super(extractor, "application/vnd.openxmlformats-officedocument.presentationml.presentation");
43      }
44  
45      /**
46       * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
47       */
48      @Override
49      protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
50              XmlException, IOException {
51          XSLFSlideShow slideShow = (XSLFSlideShow) extractor.getDocument();
52          XMLSlideShow xmlSlideShow = new XMLSlideShow(slideShow);
53  
54          XSLFSlide[] slides = xmlSlideShow.getSlides();
55          for (XSLFSlide slide : slides) {
56              CTSlide rawSlide = slide._getCTSlide();
57              CTSlideIdListEntry slideId = slide._getCTSlideId();
58  
59              CTNotesSlide notes = xmlSlideShow._getXSLFSlideShow().getNotes(
60                      slideId);
61              CTCommentList comments = xmlSlideShow._getXSLFSlideShow()
62                      .getSlideComments(slideId);
63  
64              xhtml.startElement("div");
65              extractShapeContent(rawSlide.getCSld().getSpTree(), xhtml);
66  
67              if (comments != null) {
68                  for (CTComment comment : comments.getCmArray()) {
69                      xhtml.element("p", comment.getText());
70                  }
71              }
72  
73              if (notes != null) {
74                  extractShapeContent(notes.getCSld().getSpTree(), xhtml);
75              }
76              xhtml.endElement("div");
77          }
78      }
79  
80      private void extractShapeContent(CTGroupShape gs, XHTMLContentHandler xhtml)
81              throws SAXException {
82          CTShape[] shapes = gs.getSpArray();
83          for (CTShape shape : shapes) {
84              CTTextBody textBody = shape.getTxBody();
85              if (textBody != null) {
86                  CTTextParagraph[] paras = textBody.getPArray();
87                  for (CTTextParagraph textParagraph : paras) {
88                      CTRegularTextRun[] textRuns = textParagraph.getRArray();
89                      for (CTRegularTextRun textRun : textRuns) {
90                          xhtml.element("p", textRun.getT());
91                      }
92                  }
93              }
94          }
95      }
96  }