View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft;
18  
19  import java.awt.Point;
20  import java.io.IOException;
21  import java.text.NumberFormat;
22  import java.util.ArrayList;
23  import java.util.Comparator;
24  import java.util.List;
25  import java.util.Locale;
26  import java.util.Map;
27  import java.util.SortedMap;
28  import java.util.TreeMap;
29  
30  import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
31  import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
32  import org.apache.poi.hssf.eventusermodel.HSSFListener;
33  import org.apache.poi.hssf.eventusermodel.HSSFRequest;
34  import org.apache.poi.hssf.record.BOFRecord;
35  import org.apache.poi.hssf.record.BoundSheetRecord;
36  import org.apache.poi.hssf.record.CellValueRecordInterface;
37  import org.apache.poi.hssf.record.CountryRecord;
38  import org.apache.poi.hssf.record.DateWindow1904Record;
39  import org.apache.poi.hssf.record.EOFRecord;
40  import org.apache.poi.hssf.record.ExtendedFormatRecord;
41  import org.apache.poi.hssf.record.FormatRecord;
42  import org.apache.poi.hssf.record.FormulaRecord;
43  import org.apache.poi.hssf.record.HyperlinkRecord;
44  import org.apache.poi.hssf.record.TextObjectRecord;
45  import org.apache.poi.hssf.record.UnicodeString;
46  import org.apache.poi.hssf.record.LabelRecord;
47  import org.apache.poi.hssf.record.LabelSSTRecord;
48  import org.apache.poi.hssf.record.NumberRecord;
49  import org.apache.poi.hssf.record.RKRecord;
50  import org.apache.poi.hssf.record.Record;
51  import org.apache.poi.hssf.record.SSTRecord;
52  import org.apache.poi.poifs.filesystem.DocumentInputStream;
53  import org.apache.poi.poifs.filesystem.POIFSFileSystem;
54  import org.apache.tika.sax.XHTMLContentHandler;
55  import org.xml.sax.SAXException;
56  
57  /**
58   * Excel parser implementation which uses POI's Event API
59   * to handle the contents of a Workbook.
60   * <p>
61   * The Event API uses a much smaller memory footprint than
62   * <code>HSSFWorkbook</code> when processing excel files
63   * but at the cost of more complexity.
64   * <p>
65   * With the Event API a <i>listener</i> is registered for
66   * specific record types and those records are created,
67   * fired off to the listener and then discarded as the stream
68   * is being processed.
69   *
70   * @see org.apache.poi.hssf.eventusermodel.HSSFListener
71   * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
72   * POI Event API How To</a>
73   */
74  public class ExcelExtractor {
75  
76      /**
77       * <code>true</code> if the HSSFListener should be registered
78       * to listen for all records or <code>false</code> (the default)
79       * if the listener should be configured to only receive specified
80       * records.
81       */
82      private boolean listenForAllRecords = false;
83  
84      /**
85       * Returns <code>true</code> if this parser is configured to listen
86       * for all records instead of just the specified few.
87       */
88      public boolean isListenForAllRecords() {
89          return listenForAllRecords;
90      }
91  
92      /**
93       * Specifies whether this parser should to listen for all
94       * records or just for the specified few.
95       * <p>
96       * <strong>Note:</strong> Under normal operation this setting should
97       * be <code>false</code> (the default), but you can experiment with
98       * this setting for testing and debugging purposes.
99       *
100      * @param listenForAllRecords <code>true</code> if the HSSFListener
101      * should be registered to listen for all records or <code>false</code>
102      * if the listener should be configured to only receive specified records.
103      */
104     public void setListenForAllRecords(boolean listenForAllRecords) {
105         this.listenForAllRecords = listenForAllRecords;
106     }
107 
108     /**
109      * Extracts text from an Excel Workbook writing the extracted content
110      * to the specified {@link Appendable}.
111      *
112      * @param filesystem POI file system
113      * @throws IOException if an error occurs processing the workbook
114      * or writing the extracted content
115      */
116     protected void parse(
117             POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
118             Locale locale) throws IOException, SAXException {
119     	TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale);
120     	listener.processFile(filesystem, isListenForAllRecords());
121         listener.throwStoredException();
122     }
123 
124     // ======================================================================
125 
126     /**
127      * HSSF Listener implementation which processes the HSSF records.
128      */
129     private static class TikaHSSFListener implements HSSFListener {
130 
131         /**
132          * XHTML content handler to which the document content is rendered.
133          */
134         private final XHTMLContentHandler handler;
135 
136         /**
137          * Potential exception thrown by the content handler. When set to
138          * non-<code>null</code>, causes all subsequent HSSF records to be
139          * ignored and the stored exception to be thrown when
140          * {@link #throwStoredException()} is invoked.
141          */
142         private SAXException exception = null;
143 
144         private SSTRecord sstRecord;
145 
146         /**
147          * Internal <code>FormatTrackingHSSFListener</code> to handle cell
148          * formatting within the extraction.
149          */
150         private FormatTrackingHSSFListener formatListener;
151 
152         /**
153          * List of worksheet names.
154          */
155         private List<String> sheetNames = new ArrayList<String>();
156 
157         /**
158          * Index of the current worksheet within the workbook.
159          * Used to find the worksheet name in the {@link #sheetNames} list.
160          */
161         private short currentSheetIndex;
162 
163         /**
164          * Content of the current worksheet, or <code>null</code> if no
165          * worksheet is currently active.
166          */
167         private SortedMap<Point, Cell> currentSheet = null;
168 
169         /**
170          * Format for rendering numbers in the worksheet. Currently we just
171          * use the platform default formatting.
172          *
173          * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
174          */
175         private final NumberFormat format;
176 
177         /**
178          * Construct a new listener instance outputting parsed data to
179          * the specified XHTML content handler.
180          *
181          * @param handler Destination to write the parsed output to
182          */
183         private TikaHSSFListener(XHTMLContentHandler handler, Locale locale) {
184             this.handler = handler;
185             this.format = NumberFormat.getInstance(locale);
186         }
187 
188         /**
189          * Entry point to listener to start the processing of a file.
190          *
191          * @param filesystem POI file system.
192          * @param listenForAllRecords sets whether the listener is configured to listen
193          * for all records types or not.
194          * @throws IOException on any IO errors.
195          * @throws SAXException on any SAX parsing errors.
196          */
197     	public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
198     		throws IOException,	SAXException {
199 
200     		// Set up listener and register the records we want to process
201     		formatListener = new FormatTrackingHSSFListener(this);
202             HSSFRequest hssfRequest = new HSSFRequest();
203             if (listenForAllRecords) {
204                 hssfRequest.addListenerForAllRecords(formatListener);
205             } else {
206                 hssfRequest.addListener(formatListener, BOFRecord.sid);
207                 hssfRequest.addListener(formatListener, EOFRecord.sid);
208                 hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
209                 hssfRequest.addListener(formatListener, CountryRecord.sid);
210                 hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
211                 hssfRequest.addListener(formatListener, SSTRecord.sid);
212                 hssfRequest.addListener(formatListener, FormulaRecord.sid);
213                 hssfRequest.addListener(formatListener, LabelRecord.sid);
214                 hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
215                 hssfRequest.addListener(formatListener, NumberRecord.sid);
216                 hssfRequest.addListener(formatListener, RKRecord.sid);
217                 hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
218                 hssfRequest.addListener(formatListener, TextObjectRecord.sid);
219                 hssfRequest.addListener(formatListener, FormatRecord.sid);
220                 hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
221             }
222 
223             // Create event factory and process Workbook (fire events)
224             DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
225             HSSFEventFactory eventFactory = new HSSFEventFactory();
226             eventFactory.processEvents(hssfRequest, documentInputStream);
227     	}
228 
229         /**
230          * Process a HSSF record.
231          *
232          * @param record HSSF Record
233          */
234         public void processRecord(Record record) {
235             if (exception == null) {
236                 try {
237                     internalProcessRecord(record);
238                 } catch (SAXException e) {
239                     exception = e;
240                 }
241             }
242         }
243 
244         public void throwStoredException() throws SAXException {
245             if (exception != null) {
246                 throw exception;
247             }
248         }
249 
250         private void internalProcessRecord(Record record) throws SAXException {
251             switch (record.getSid()) {
252             case BOFRecord.sid: // start of workbook, worksheet etc. records
253                 BOFRecord bof = (BOFRecord) record;
254                 if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
255                     currentSheetIndex = -1;
256                 } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
257                     currentSheetIndex++;
258                     currentSheet =
259                         new TreeMap<Point, Cell>(new PointComparator());
260                 }
261                 break;
262 
263             case EOFRecord.sid: // end of workbook, worksheet etc. records
264                 if (currentSheet != null) {
265                     processSheet();
266                 }
267                 currentSheet = null;
268                 break;
269 
270             case BoundSheetRecord.sid: // Worksheet index record
271                 BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
272                 sheetNames.add(boundSheetRecord.getSheetname());
273                 break;
274 
275             case SSTRecord.sid: // holds all the strings for LabelSSTRecords
276                 sstRecord = (SSTRecord) record;
277                 break;
278 
279             case FormulaRecord.sid: // Cell value from a formula
280                 FormulaRecord formula = (FormulaRecord) record;
281                 addCell(record, new NumberCell(formula.getValue(), format));
282                 break;
283 
284             case LabelRecord.sid: // strings stored directly in the cell
285                 LabelRecord label = (LabelRecord) record;
286                 addTextCell(record, label.getValue());
287                 break;
288 
289             case LabelSSTRecord.sid: // Ref. a string in the shared string table
290                 LabelSSTRecord sst = (LabelSSTRecord) record;
291                 UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
292                 addTextCell(record, unicode.getString());
293                 break;
294 
295             case NumberRecord.sid: // Contains a numeric cell value
296                 NumberRecord number = (NumberRecord) record;
297                 addTextCell(record, formatListener.formatNumberDateCell(number));
298                 break;
299 
300             case RKRecord.sid: // Excel internal number record
301                 RKRecord rk = (RKRecord) record;
302                 addCell(record, new NumberCell(rk.getRKNumber(), format));
303                 break;
304 
305             case HyperlinkRecord.sid: // holds a URL associated with a cell
306                 if (currentSheet != null) {
307                     HyperlinkRecord link = (HyperlinkRecord) record;
308                     Point point =
309                         new Point(link.getFirstColumn(), link.getFirstRow());
310                     Cell cell = currentSheet.get(point);
311                     if (cell != null) {
312                         addCell(record, new LinkedCell(cell, link.getAddress()));
313                     }
314                 }
315                 break;
316             case TextObjectRecord.sid:
317                 TextObjectRecord tor = (TextObjectRecord) record;
318                 addTextCell(record, tor.getStr().getString());
319                 break;
320             }
321         }
322 
323         /**
324          * Adds the given cell (unless <code>null</code>) to the current
325          * worksheet (if any) at the position (if any) of the given record.
326          *
327          * @param record record that holds the cell value
328          * @param cell cell value (or <code>null</code>)
329          */
330         private void addCell(Record record, Cell cell) throws SAXException {
331             if (cell == null) {
332                 // Ignore empty cells
333             } else if (currentSheet != null
334                     && record instanceof CellValueRecordInterface) {
335                 // Normal cell inside a worksheet
336                 CellValueRecordInterface value =
337                     (CellValueRecordInterface) record;
338                 Point point = new Point(value.getColumn(), value.getRow());
339                 currentSheet.put(point, cell);
340             } else {
341                 // Cell outside the worksheets
342                 handler.startElement("div", "class", "outside");
343                 cell.render(handler);
344                 handler.endElement("div");
345             }
346         }
347 
348         /**
349          * Adds a text cell with the given text comment. The given text
350          * is trimmed, and ignored if <code>null</code> or empty.
351          *
352          * @param record record that holds the text value
353          * @param text text content, may be <code>null</code>
354          * @throws SAXException
355          */
356         private void addTextCell(Record record, String text) throws SAXException {
357             if (text != null) {
358                 text = text.trim();
359                 if (text.length() > 0) {
360                     addCell(record, new TextCell(text));
361                 }
362             }
363         }
364 
365         /**
366          * Process an excel sheet.
367          *
368          * @throws SAXException if an error occurs
369          */
370         private void processSheet() throws SAXException {
371             // Sheet Start
372             handler.startElement("div", "class", "page");
373             if (currentSheetIndex < sheetNames.size()) {
374                 handler.element("h1", sheetNames.get(currentSheetIndex));
375             }
376             handler.startElement("table");
377             handler.startElement("tbody");
378 
379             // Process Rows
380             int currentRow = 0;
381             int currentColumn = 0;
382             handler.startElement("tr");
383             handler.startElement("td");
384             for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
385                 while (currentRow < entry.getKey().y) {
386                     handler.endElement("td");
387                     handler.endElement("tr");
388                     handler.startElement("tr");
389                     handler.startElement("td");
390                     currentRow++;
391                     currentColumn = 0;
392                 }
393 
394                 while (currentColumn < entry.getKey().x) {
395                     handler.endElement("td");
396                     handler.startElement("td");
397                     currentColumn++;
398                 }
399 
400                 entry.getValue().render(handler);
401             }
402             handler.endElement("td");
403             handler.endElement("tr");
404 
405             // Sheet End
406             handler.endElement("tbody");
407             handler.endElement("table");
408             handler.endElement("div");
409         }
410 
411     }
412 
413     /**
414      * Utility comparator for points.
415      */
416     private static class PointComparator implements Comparator<Point> {
417 
418         public int compare(Point a, Point b) {
419             int diff = a.y - b.y;
420             if (diff == 0) {
421                 diff = a.x - b.x;
422             }
423             return diff;
424         }
425 
426     }
427 
428 }