1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.microsoft.ooxml;
18
19 import java.io.IOException;
20 import java.util.Iterator;
21
22 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
23 import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
24 import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
25 import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
26 import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
27 import org.apache.poi.xwpf.usermodel.XWPFDocument;
28 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
29 import org.apache.tika.sax.XHTMLContentHandler;
30 import org.apache.xmlbeans.XmlException;
31 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
32 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
33 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
34 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
35 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
36 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
37 import org.xml.sax.SAXException;
38
39 public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
40
41 public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) {
42 super(extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
43 }
44
45
46
47
48 @Override
49 protected void buildXHTML(XHTMLContentHandler xhtml)
50 throws SAXException, XmlException, IOException {
51 XWPFDocument document = (XWPFDocument) extractor.getDocument();
52 XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
53
54
55 extractHeaders(xhtml, hfPolicy);
56
57
58 Iterator<XWPFParagraph> i = document.getParagraphsIterator();
59 while (i.hasNext()) {
60 XWPFParagraph paragraph = i.next();
61
62 CTSectPr ctSectPr = null;
63 if (paragraph.getCTP().getPPr() != null) {
64 ctSectPr = paragraph.getCTP().getPPr().getSectPr();
65 }
66
67 XWPFHeaderFooterPolicy headerFooterPolicy = null;
68
69 if (ctSectPr != null) {
70 headerFooterPolicy =
71 new XWPFHeaderFooterPolicy(document, ctSectPr);
72 extractHeaders(xhtml, headerFooterPolicy);
73 }
74
75 XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
76 new XWPFHyperlinkDecorator(paragraph, null, true));
77
78 CTBookmark[] bookmarks = paragraph.getCTP().getBookmarkStartArray();
79 for (CTBookmark bookmark : bookmarks) {
80 xhtml.element("p", bookmark.getName());
81 }
82
83 xhtml.element("p", decorator.getText());
84
85 if (ctSectPr != null) {
86 extractFooters(xhtml, headerFooterPolicy);
87 }
88 }
89
90
91 extractTableContent(document, xhtml);
92 extractFooters(xhtml, hfPolicy);
93 }
94
95 private void extractFooters(
96 XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
97 throws SAXException {
98
99 if (hfPolicy.getFirstPageFooter() != null) {
100 xhtml.element("p", hfPolicy.getFirstPageFooter().getText());
101 }
102 if (hfPolicy.getEvenPageFooter() != null) {
103 xhtml.element("p", hfPolicy.getEvenPageFooter().getText());
104 }
105 if (hfPolicy.getDefaultFooter() != null) {
106 xhtml.element("p", hfPolicy.getDefaultFooter().getText());
107 }
108 }
109
110 private void extractHeaders(
111 XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
112 throws SAXException {
113 if (hfPolicy.getFirstPageHeader() != null) {
114 xhtml.element("p", hfPolicy.getFirstPageHeader().getText());
115 }
116 if (hfPolicy.getEvenPageHeader() != null) {
117 xhtml.element("p", hfPolicy.getEvenPageHeader().getText());
118 }
119 if (hfPolicy.getDefaultHeader() != null) {
120 xhtml.element("p", hfPolicy.getDefaultHeader().getText());
121 }
122 }
123
124
125
126
127 private void extractTableContent(XWPFDocument doc, XHTMLContentHandler xhtml)
128 throws SAXException {
129 for (CTTbl table : doc.getDocument().getBody().getTblArray()) {
130 xhtml.startElement("table");
131 xhtml.startElement("tbody");
132 CTRow[] rows = table.getTrArray();
133 for (CTRow row : rows) {
134 xhtml.startElement("tr");
135 CTTc[] cells = row.getTcArray();
136 for (CTTc tc : cells) {
137 xhtml.startElement("td");
138 CTP[] content = tc.getPArray();
139 for (CTP ctp : content) {
140 XWPFParagraph p = new MyXWPFParagraph(ctp, doc);
141
142 XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
143 new XWPFHyperlinkDecorator(p, null, true));
144
145 xhtml.element("p", decorator.getText());
146 }
147
148 xhtml.endElement("td");
149 }
150 xhtml.endElement("tr");
151 }
152 xhtml.endElement("tbody");
153 xhtml.endElement("table");
154 }
155 }
156
157
158
159
160
161 private static class MyXWPFParagraph extends XWPFParagraph {
162 private MyXWPFParagraph(CTP ctp, XWPFDocument xwpfDocument) {
163 super(ctp, xwpfDocument);
164 }
165 }
166 }