View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.microsoft.ooxml;
18  
19  import java.io.IOException;
20  import java.util.Iterator;
21  import java.util.Locale;
22  
23  import org.apache.poi.hssf.extractor.ExcelExtractor;
24  import org.apache.poi.ss.usermodel.Cell;
25  import org.apache.poi.ss.usermodel.CellStyle;
26  import org.apache.poi.ss.usermodel.Comment;
27  import org.apache.poi.ss.usermodel.DataFormatter;
28  import org.apache.poi.ss.usermodel.HeaderFooter;
29  import org.apache.poi.ss.usermodel.Row;
30  import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
31  import org.apache.poi.xssf.usermodel.XSSFCell;
32  import org.apache.poi.xssf.usermodel.XSSFSheet;
33  import org.apache.poi.xssf.usermodel.XSSFWorkbook;
34  import org.apache.tika.sax.XHTMLContentHandler;
35  import org.apache.tika.metadata.Metadata;
36  import org.apache.tika.metadata.TikaMetadataKeys;
37  import org.apache.tika.exception.TikaException;
38  import org.apache.xmlbeans.XmlException;
39  import org.xml.sax.SAXException;
40  
41  public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
42  
43      /**
44       * Internal <code>DataFormatter</code> for formatting Numbers.
45       */
46  	private final DataFormatter formatter = new DataFormatter();
47  
48      private final XSSFExcelExtractor extractor;
49      private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
50  
51      public XSSFExcelExtractorDecorator(
52              XSSFExcelExtractor extractor, Locale locale) {
53          super(extractor, TYPE);
54  
55          this.extractor = extractor;
56      }
57  
58      /**
59       * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
60       */
61      @Override
62      protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
63              XmlException, IOException {
64          XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
65  
66          for (int i = 0; i < document.getNumberOfSheets(); i++) {
67              xhtml.startElement("div");
68              XSSFSheet sheet = (XSSFSheet) document.getSheetAt(i);
69              xhtml.element("h1", document.getSheetName(i));
70  
71              // Header(s), if present
72              extractHeaderFooter(sheet.getFirstHeader(), xhtml);
73              extractHeaderFooter(sheet.getOddHeader(), xhtml);
74              extractHeaderFooter(sheet.getEvenHeader(), xhtml);
75  
76              xhtml.startElement("table");
77              xhtml.startElement("tbody");
78  
79              // Rows and cells
80              for (Object rawR : sheet) {
81                  xhtml.startElement("tr");
82                  Row row = (Row) rawR;
83                  for (Iterator<Cell> ri = row.cellIterator(); ri.hasNext();) {
84                      xhtml.startElement("td");
85                      Cell cell = ri.next();
86  
87                      int type = cell.getCellType();
88                      if (type == Cell.CELL_TYPE_FORMULA) {
89                          type = cell.getCachedFormulaResultType();
90                      }
91                      if (type == Cell.CELL_TYPE_STRING) {
92                          xhtml.characters(cell.getRichStringCellValue()
93                                  .getString());
94                      } else if (type == Cell.CELL_TYPE_NUMERIC) {
95                      	CellStyle style = cell.getCellStyle();
96                  	    xhtml.characters(
97                  	    	formatter.formatRawCellContents(cell.getNumericCellValue(),
98  															style.getDataFormat(),
99  															style.getDataFormatString()));
100                     } else {
101                         XSSFCell xc = (XSSFCell) cell;
102                         String rawValue = xc.getRawValue();
103                         if (rawValue != null) {
104                             xhtml.characters(rawValue);
105                         }
106 
107                     }
108 
109                     // Output the comment in the same cell as the content
110                     Comment comment = cell.getCellComment();
111                     if (comment != null) {
112                         xhtml.characters(comment.getString().getString());
113                     }
114 
115                     xhtml.endElement("td");
116                 }
117                 xhtml.endElement("tr");
118             }
119 
120             xhtml.endElement("tbody");
121             xhtml.endElement("table");
122 
123             // Finally footer(s), if present
124             extractHeaderFooter(sheet.getFirstFooter(), xhtml);
125             extractHeaderFooter(sheet.getOddFooter(), xhtml);
126             extractHeaderFooter(sheet.getEvenFooter(), xhtml);
127 
128             xhtml.endElement("div");
129         }
130     }
131 
132     private void extractHeaderFooter(HeaderFooter hf, XHTMLContentHandler xhtml)
133             throws SAXException {
134         String content = ExcelExtractor._extractHeaderFooter(hf);
135         if (content.length() > 0) {
136             xhtml.element("p", content);
137         }
138     }
139 
140     @Override
141     public MetadataExtractor getMetadataExtractor() {
142         return new MetadataExtractor(extractor, TYPE) {
143             @Override
144             public void extract(Metadata metadata) throws TikaException {
145                 super.extract(metadata);
146 
147                 metadata.set(TikaMetadataKeys.PROTECTED, "false");
148 
149                 XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
150 
151                 for (int i = 0; i < document.getNumberOfSheets(); i++) {
152                     XSSFSheet sheet = document.getSheetAt(i);
153 
154                     if (sheet.getProtect()) {
155                         metadata.set(TikaMetadataKeys.PROTECTED, "true");
156                     }
157                 }
158             }
159         };
160     }
161 }