1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.microsoft.ooxml;
18
19 import java.io.IOException;
20 import java.util.Iterator;
21 import java.util.Locale;
22
23 import org.apache.poi.hssf.extractor.ExcelExtractor;
24 import org.apache.poi.ss.usermodel.Cell;
25 import org.apache.poi.ss.usermodel.CellStyle;
26 import org.apache.poi.ss.usermodel.Comment;
27 import org.apache.poi.ss.usermodel.DataFormatter;
28 import org.apache.poi.ss.usermodel.HeaderFooter;
29 import org.apache.poi.ss.usermodel.Row;
30 import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
31 import org.apache.poi.xssf.usermodel.XSSFCell;
32 import org.apache.poi.xssf.usermodel.XSSFSheet;
33 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
34 import org.apache.tika.sax.XHTMLContentHandler;
35 import org.apache.tika.metadata.Metadata;
36 import org.apache.tika.metadata.TikaMetadataKeys;
37 import org.apache.tika.exception.TikaException;
38 import org.apache.xmlbeans.XmlException;
39 import org.xml.sax.SAXException;
40
41 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
42
43
44
45
46 private final DataFormatter formatter = new DataFormatter();
47
48 private final XSSFExcelExtractor extractor;
49 private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
50
51 public XSSFExcelExtractorDecorator(
52 XSSFExcelExtractor extractor, Locale locale) {
53 super(extractor, TYPE);
54
55 this.extractor = extractor;
56 }
57
58
59
60
61 @Override
62 protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
63 XmlException, IOException {
64 XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
65
66 for (int i = 0; i < document.getNumberOfSheets(); i++) {
67 xhtml.startElement("div");
68 XSSFSheet sheet = (XSSFSheet) document.getSheetAt(i);
69 xhtml.element("h1", document.getSheetName(i));
70
71
72 extractHeaderFooter(sheet.getFirstHeader(), xhtml);
73 extractHeaderFooter(sheet.getOddHeader(), xhtml);
74 extractHeaderFooter(sheet.getEvenHeader(), xhtml);
75
76 xhtml.startElement("table");
77 xhtml.startElement("tbody");
78
79
80 for (Object rawR : sheet) {
81 xhtml.startElement("tr");
82 Row row = (Row) rawR;
83 for (Iterator<Cell> ri = row.cellIterator(); ri.hasNext();) {
84 xhtml.startElement("td");
85 Cell cell = ri.next();
86
87 int type = cell.getCellType();
88 if (type == Cell.CELL_TYPE_FORMULA) {
89 type = cell.getCachedFormulaResultType();
90 }
91 if (type == Cell.CELL_TYPE_STRING) {
92 xhtml.characters(cell.getRichStringCellValue()
93 .getString());
94 } else if (type == Cell.CELL_TYPE_NUMERIC) {
95 CellStyle style = cell.getCellStyle();
96 xhtml.characters(
97 formatter.formatRawCellContents(cell.getNumericCellValue(),
98 style.getDataFormat(),
99 style.getDataFormatString()));
100 } else {
101 XSSFCell xc = (XSSFCell) cell;
102 String rawValue = xc.getRawValue();
103 if (rawValue != null) {
104 xhtml.characters(rawValue);
105 }
106
107 }
108
109
110 Comment comment = cell.getCellComment();
111 if (comment != null) {
112 xhtml.characters(comment.getString().getString());
113 }
114
115 xhtml.endElement("td");
116 }
117 xhtml.endElement("tr");
118 }
119
120 xhtml.endElement("tbody");
121 xhtml.endElement("table");
122
123
124 extractHeaderFooter(sheet.getFirstFooter(), xhtml);
125 extractHeaderFooter(sheet.getOddFooter(), xhtml);
126 extractHeaderFooter(sheet.getEvenFooter(), xhtml);
127
128 xhtml.endElement("div");
129 }
130 }
131
132 private void extractHeaderFooter(HeaderFooter hf, XHTMLContentHandler xhtml)
133 throws SAXException {
134 String content = ExcelExtractor._extractHeaderFooter(hf);
135 if (content.length() > 0) {
136 xhtml.element("p", content);
137 }
138 }
139
140 @Override
141 public MetadataExtractor getMetadataExtractor() {
142 return new MetadataExtractor(extractor, TYPE) {
143 @Override
144 public void extract(Metadata metadata) throws TikaException {
145 super.extract(metadata);
146
147 metadata.set(TikaMetadataKeys.PROTECTED, "false");
148
149 XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
150
151 for (int i = 0; i < document.getNumberOfSheets(); i++) {
152 XSSFSheet sheet = document.getSheetAt(i);
153
154 if (sheet.getProtect()) {
155 metadata.set(TikaMetadataKeys.PROTECTED, "true");
156 }
157 }
158 }
159 };
160 }
161 }