1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.microsoft;
18
19 import java.awt.Point;
20 import java.io.IOException;
21 import java.text.NumberFormat;
22 import java.util.ArrayList;
23 import java.util.Comparator;
24 import java.util.List;
25 import java.util.Locale;
26 import java.util.Map;
27 import java.util.SortedMap;
28 import java.util.TreeMap;
29
30 import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
31 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
32 import org.apache.poi.hssf.eventusermodel.HSSFListener;
33 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
34 import org.apache.poi.hssf.record.BOFRecord;
35 import org.apache.poi.hssf.record.BoundSheetRecord;
36 import org.apache.poi.hssf.record.CellValueRecordInterface;
37 import org.apache.poi.hssf.record.CountryRecord;
38 import org.apache.poi.hssf.record.DateWindow1904Record;
39 import org.apache.poi.hssf.record.EOFRecord;
40 import org.apache.poi.hssf.record.ExtendedFormatRecord;
41 import org.apache.poi.hssf.record.FormatRecord;
42 import org.apache.poi.hssf.record.FormulaRecord;
43 import org.apache.poi.hssf.record.HyperlinkRecord;
44 import org.apache.poi.hssf.record.TextObjectRecord;
45 import org.apache.poi.hssf.record.UnicodeString;
46 import org.apache.poi.hssf.record.LabelRecord;
47 import org.apache.poi.hssf.record.LabelSSTRecord;
48 import org.apache.poi.hssf.record.NumberRecord;
49 import org.apache.poi.hssf.record.RKRecord;
50 import org.apache.poi.hssf.record.Record;
51 import org.apache.poi.hssf.record.SSTRecord;
52 import org.apache.poi.poifs.filesystem.DocumentInputStream;
53 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
54 import org.apache.tika.sax.XHTMLContentHandler;
55 import org.xml.sax.SAXException;
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74 public class ExcelExtractor {
75
76
77
78
79
80
81
82 private boolean listenForAllRecords = false;
83
84
85
86
87
88 public boolean isListenForAllRecords() {
89 return listenForAllRecords;
90 }
91
92
93
94
95
96
97
98
99
100
101
102
103
104 public void setListenForAllRecords(boolean listenForAllRecords) {
105 this.listenForAllRecords = listenForAllRecords;
106 }
107
108
109
110
111
112
113
114
115
116 protected void parse(
117 POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
118 Locale locale) throws IOException, SAXException {
119 TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale);
120 listener.processFile(filesystem, isListenForAllRecords());
121 listener.throwStoredException();
122 }
123
124
125
126
127
128
129 private static class TikaHSSFListener implements HSSFListener {
130
131
132
133
134 private final XHTMLContentHandler handler;
135
136
137
138
139
140
141
142 private SAXException exception = null;
143
144 private SSTRecord sstRecord;
145
146
147
148
149
150 private FormatTrackingHSSFListener formatListener;
151
152
153
154
155 private List<String> sheetNames = new ArrayList<String>();
156
157
158
159
160
161 private short currentSheetIndex;
162
163
164
165
166
167 private SortedMap<Point, Cell> currentSheet = null;
168
169
170
171
172
173
174
175 private final NumberFormat format;
176
177
178
179
180
181
182
183 private TikaHSSFListener(XHTMLContentHandler handler, Locale locale) {
184 this.handler = handler;
185 this.format = NumberFormat.getInstance(locale);
186 }
187
188
189
190
191
192
193
194
195
196
197 public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
198 throws IOException, SAXException {
199
200
201 formatListener = new FormatTrackingHSSFListener(this);
202 HSSFRequest hssfRequest = new HSSFRequest();
203 if (listenForAllRecords) {
204 hssfRequest.addListenerForAllRecords(formatListener);
205 } else {
206 hssfRequest.addListener(formatListener, BOFRecord.sid);
207 hssfRequest.addListener(formatListener, EOFRecord.sid);
208 hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
209 hssfRequest.addListener(formatListener, CountryRecord.sid);
210 hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
211 hssfRequest.addListener(formatListener, SSTRecord.sid);
212 hssfRequest.addListener(formatListener, FormulaRecord.sid);
213 hssfRequest.addListener(formatListener, LabelRecord.sid);
214 hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
215 hssfRequest.addListener(formatListener, NumberRecord.sid);
216 hssfRequest.addListener(formatListener, RKRecord.sid);
217 hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
218 hssfRequest.addListener(formatListener, TextObjectRecord.sid);
219 hssfRequest.addListener(formatListener, FormatRecord.sid);
220 hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
221 }
222
223
224 DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
225 HSSFEventFactory eventFactory = new HSSFEventFactory();
226 eventFactory.processEvents(hssfRequest, documentInputStream);
227 }
228
229
230
231
232
233
234 public void processRecord(Record record) {
235 if (exception == null) {
236 try {
237 internalProcessRecord(record);
238 } catch (SAXException e) {
239 exception = e;
240 }
241 }
242 }
243
244 public void throwStoredException() throws SAXException {
245 if (exception != null) {
246 throw exception;
247 }
248 }
249
250 private void internalProcessRecord(Record record) throws SAXException {
251 switch (record.getSid()) {
252 case BOFRecord.sid:
253 BOFRecord bof = (BOFRecord) record;
254 if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
255 currentSheetIndex = -1;
256 } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
257 currentSheetIndex++;
258 currentSheet =
259 new TreeMap<Point, Cell>(new PointComparator());
260 }
261 break;
262
263 case EOFRecord.sid:
264 if (currentSheet != null) {
265 processSheet();
266 }
267 currentSheet = null;
268 break;
269
270 case BoundSheetRecord.sid:
271 BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
272 sheetNames.add(boundSheetRecord.getSheetname());
273 break;
274
275 case SSTRecord.sid:
276 sstRecord = (SSTRecord) record;
277 break;
278
279 case FormulaRecord.sid:
280 FormulaRecord formula = (FormulaRecord) record;
281 addCell(record, new NumberCell(formula.getValue(), format));
282 break;
283
284 case LabelRecord.sid:
285 LabelRecord label = (LabelRecord) record;
286 addTextCell(record, label.getValue());
287 break;
288
289 case LabelSSTRecord.sid:
290 LabelSSTRecord sst = (LabelSSTRecord) record;
291 UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
292 addTextCell(record, unicode.getString());
293 break;
294
295 case NumberRecord.sid:
296 NumberRecord number = (NumberRecord) record;
297 addTextCell(record, formatListener.formatNumberDateCell(number));
298 break;
299
300 case RKRecord.sid:
301 RKRecord rk = (RKRecord) record;
302 addCell(record, new NumberCell(rk.getRKNumber(), format));
303 break;
304
305 case HyperlinkRecord.sid:
306 if (currentSheet != null) {
307 HyperlinkRecord link = (HyperlinkRecord) record;
308 Point point =
309 new Point(link.getFirstColumn(), link.getFirstRow());
310 Cell cell = currentSheet.get(point);
311 if (cell != null) {
312 addCell(record, new LinkedCell(cell, link.getAddress()));
313 }
314 }
315 break;
316 case TextObjectRecord.sid:
317 TextObjectRecord tor = (TextObjectRecord) record;
318 addTextCell(record, tor.getStr().getString());
319 break;
320 }
321 }
322
323
324
325
326
327
328
329
330 private void addCell(Record record, Cell cell) throws SAXException {
331 if (cell == null) {
332
333 } else if (currentSheet != null
334 && record instanceof CellValueRecordInterface) {
335
336 CellValueRecordInterface value =
337 (CellValueRecordInterface) record;
338 Point point = new Point(value.getColumn(), value.getRow());
339 currentSheet.put(point, cell);
340 } else {
341
342 handler.startElement("div", "class", "outside");
343 cell.render(handler);
344 handler.endElement("div");
345 }
346 }
347
348
349
350
351
352
353
354
355
356 private void addTextCell(Record record, String text) throws SAXException {
357 if (text != null) {
358 text = text.trim();
359 if (text.length() > 0) {
360 addCell(record, new TextCell(text));
361 }
362 }
363 }
364
365
366
367
368
369
370 private void processSheet() throws SAXException {
371
372 handler.startElement("div", "class", "page");
373 if (currentSheetIndex < sheetNames.size()) {
374 handler.element("h1", sheetNames.get(currentSheetIndex));
375 }
376 handler.startElement("table");
377 handler.startElement("tbody");
378
379
380 int currentRow = 0;
381 int currentColumn = 0;
382 handler.startElement("tr");
383 handler.startElement("td");
384 for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
385 while (currentRow < entry.getKey().y) {
386 handler.endElement("td");
387 handler.endElement("tr");
388 handler.startElement("tr");
389 handler.startElement("td");
390 currentRow++;
391 currentColumn = 0;
392 }
393
394 while (currentColumn < entry.getKey().x) {
395 handler.endElement("td");
396 handler.startElement("td");
397 currentColumn++;
398 }
399
400 entry.getValue().render(handler);
401 }
402 handler.endElement("td");
403 handler.endElement("tr");
404
405
406 handler.endElement("tbody");
407 handler.endElement("table");
408 handler.endElement("div");
409 }
410
411 }
412
413
414
415
416 private static class PointComparator implements Comparator<Point> {
417
418 public int compare(Point a, Point b) {
419 int diff = a.y - b.y;
420 if (diff == 0) {
421 diff = a.x - b.x;
422 }
423 return diff;
424 }
425
426 }
427
428 }