View Javadoc

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  package org.apache.tika.parser.microsoft.ooxml;
18  
19  import java.io.IOException;
20  import java.util.Iterator;
21  
22  import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
23  import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
24  import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
25  import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
26  import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
27  import org.apache.poi.xwpf.usermodel.XWPFDocument;
28  import org.apache.poi.xwpf.usermodel.XWPFParagraph;
29  import org.apache.tika.sax.XHTMLContentHandler;
30  import org.apache.xmlbeans.XmlException;
31  import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
32  import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
33  import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
34  import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
35  import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
36  import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
37  import org.xml.sax.SAXException;
38  
39  public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
40  
41      public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) {
42          super(extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
43      }
44  
45      /**
46       * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
47       */
48      @Override
49      protected void buildXHTML(XHTMLContentHandler xhtml)
50              throws SAXException, XmlException, IOException {
51          XWPFDocument document = (XWPFDocument) extractor.getDocument();
52          XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
53  
54          // headers
55          extractHeaders(xhtml, hfPolicy);
56  
57          // first all paragraphs
58          Iterator<XWPFParagraph> i = document.getParagraphsIterator();
59          while (i.hasNext()) {
60              XWPFParagraph paragraph = i.next();
61  
62              CTSectPr ctSectPr = null;
63              if (paragraph.getCTP().getPPr() != null) {
64                  ctSectPr = paragraph.getCTP().getPPr().getSectPr();
65              }
66  
67              XWPFHeaderFooterPolicy headerFooterPolicy = null;
68  
69              if (ctSectPr != null) {
70                  headerFooterPolicy =
71                      new XWPFHeaderFooterPolicy(document, ctSectPr);
72                  extractHeaders(xhtml, headerFooterPolicy);
73              }
74  
75              XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
76                      new XWPFHyperlinkDecorator(paragraph, null, true));
77  
78              CTBookmark[] bookmarks = paragraph.getCTP().getBookmarkStartArray();
79              for (CTBookmark bookmark : bookmarks) {
80                  xhtml.element("p", bookmark.getName());
81              }
82  
83              xhtml.element("p", decorator.getText());
84  
85              if (ctSectPr != null) {
86                  extractFooters(xhtml, headerFooterPolicy);
87              }
88          }
89  
90          // then all document tables
91          extractTableContent(document, xhtml);
92          extractFooters(xhtml, hfPolicy);
93      }
94  
95      private void extractFooters(
96              XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
97              throws SAXException {
98          // footers
99          if (hfPolicy.getFirstPageFooter() != null) {
100             xhtml.element("p", hfPolicy.getFirstPageFooter().getText());
101         }
102         if (hfPolicy.getEvenPageFooter() != null) {
103             xhtml.element("p", hfPolicy.getEvenPageFooter().getText());
104         }
105         if (hfPolicy.getDefaultFooter() != null) {
106             xhtml.element("p", hfPolicy.getDefaultFooter().getText());
107         }
108     }
109 
110     private void extractHeaders(
111             XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
112             throws SAXException {
113         if (hfPolicy.getFirstPageHeader() != null) {
114             xhtml.element("p", hfPolicy.getFirstPageHeader().getText());
115         }
116         if (hfPolicy.getEvenPageHeader() != null) {
117             xhtml.element("p", hfPolicy.getEvenPageHeader().getText());
118         }
119         if (hfPolicy.getDefaultHeader() != null) {
120             xhtml.element("p", hfPolicy.getDefaultHeader().getText());
121         }
122     }
123 
124     /**
125      * Low level structured parsing of document tables.
126      */
127     private void extractTableContent(XWPFDocument doc, XHTMLContentHandler xhtml)
128             throws SAXException {
129         for (CTTbl table : doc.getDocument().getBody().getTblArray()) {
130             xhtml.startElement("table");
131             xhtml.startElement("tbody");
132             CTRow[] rows = table.getTrArray();
133             for (CTRow row : rows) {
134                 xhtml.startElement("tr");
135                 CTTc[] cells = row.getTcArray();
136                 for (CTTc tc : cells) {
137                     xhtml.startElement("td");
138                     CTP[] content = tc.getPArray();
139                     for (CTP ctp : content) {
140                         XWPFParagraph p = new MyXWPFParagraph(ctp, doc);
141 
142                         XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
143                                 new XWPFHyperlinkDecorator(p, null, true));
144 
145                         xhtml.element("p", decorator.getText());
146                     }
147 
148                     xhtml.endElement("td");
149                 }
150                 xhtml.endElement("tr");
151             }
152             xhtml.endElement("tbody");
153             xhtml.endElement("table");
154         }
155     }
156 
157     /**
158      * Private wrapper class that makes the protected {@link XWPFParagraph}
159      * constructor available.
160      */
161     private static class MyXWPFParagraph extends XWPFParagraph {
162         private MyXWPFParagraph(CTP ctp, XWPFDocument xwpfDocument) {
163             super(ctp, xwpfDocument);
164         }
165     }
166 }