View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.pdf;
18  
19  import java.io.IOException;
20  import java.util.List;
21  
22  import org.apache.pdfbox.pdmodel.PDDocument;
23  import org.apache.pdfbox.pdmodel.PDPage;
24  import org.apache.pdfbox.util.PDFOperator;
25  import org.apache.pdfbox.util.PDFTextStripper;
26  import org.apache.pdfbox.util.TextPosition;
27  import org.apache.pdfbox.util.operator.OperatorProcessor;
28  import org.apache.tika.exception.TikaException;
29  import org.apache.tika.io.IOExceptionWithCause;
30  import org.apache.tika.metadata.Metadata;
31  import org.apache.tika.sax.XHTMLContentHandler;
32  import org.xml.sax.ContentHandler;
33  import org.xml.sax.SAXException;
34  
35  /**
36   * Utility class that overrides the {@link PDFTextStripper} functionality
37   * to produce a semi-structured XHTML SAX events instead of a plain text
38   * stream.
39   */
40  class PDF2XHTML extends PDFTextStripper {
41  
42      /**
43       * Converts the given PDF document (and related metadata) to a stream
44       * of XHTML SAX events sent to the given content handler.
45       *
46       * @param document PDF document
47       * @param handler SAX content handler
48       * @param metadata PDF metadata
49       * @throws SAXException if the content handler fails to process SAX events
50       * @throws TikaException if the PDF document can not be processed
51       */
52      public static void process(
53              PDDocument document, ContentHandler handler, Metadata metadata)
54              throws SAXException, TikaException {
55          try {
56              new PDF2XHTML(handler, metadata).getText(document);
57          } catch (IOException e) {
58              if (e.getCause() instanceof SAXException) {
59                  throw (SAXException) e.getCause();
60              } else {
61                  throw new TikaException("Unable to extract PDF content", e);
62              }
63          }
64      }
65  
66      private final XHTMLContentHandler handler;
67  
68      private PDF2XHTML(ContentHandler handler, Metadata metadata)
69              throws IOException {
70          this.handler = new XHTMLContentHandler(handler, metadata);
71  
72          // TIKA-292: Ignore unneeded PDF operators
73          // TODO: Remove this once PDFBox is no longer so verbose
74          OperatorProcessor ignore = new OperatorProcessor() {
75              @Override @SuppressWarnings("unchecked")
76              public void process(PDFOperator operator, List arguments) {
77              }
78          };
79          registerOperatorProcessor("b", ignore);
80          registerOperatorProcessor("B", ignore);
81          registerOperatorProcessor("b*", ignore);
82          registerOperatorProcessor("B*", ignore);
83          registerOperatorProcessor("BDC", ignore);
84          registerOperatorProcessor("BI", ignore);
85          registerOperatorProcessor("BMC", ignore);
86          registerOperatorProcessor("b", ignore);
87          registerOperatorProcessor("BX", ignore);
88          registerOperatorProcessor("c", ignore);
89          registerOperatorProcessor("CS", ignore);
90          registerOperatorProcessor("cs", ignore);
91          registerOperatorProcessor("d", ignore);
92          registerOperatorProcessor("d0", ignore);
93          registerOperatorProcessor("d1", ignore);
94          registerOperatorProcessor("DP", ignore);
95          registerOperatorProcessor("El", ignore);
96          registerOperatorProcessor("EMC", ignore);
97          registerOperatorProcessor("EX", ignore);
98          registerOperatorProcessor("f", ignore);
99          registerOperatorProcessor("F", ignore);
100         registerOperatorProcessor("f*", ignore);
101         registerOperatorProcessor("G", ignore);
102         registerOperatorProcessor("g", ignore);
103         registerOperatorProcessor("h", ignore);
104         registerOperatorProcessor("i", ignore);
105         registerOperatorProcessor("ID", ignore);
106         registerOperatorProcessor("j", ignore);
107         registerOperatorProcessor("J", ignore);
108         registerOperatorProcessor("K", ignore);
109         registerOperatorProcessor("k", ignore);
110         registerOperatorProcessor("l", ignore);
111         registerOperatorProcessor("m", ignore);
112         registerOperatorProcessor("M", ignore);
113         registerOperatorProcessor("MP", ignore);
114         registerOperatorProcessor("n", ignore);
115         registerOperatorProcessor("re", ignore);
116         registerOperatorProcessor("RG", ignore);
117         registerOperatorProcessor("rg", ignore);
118         registerOperatorProcessor("ri", ignore);
119         registerOperatorProcessor("s", ignore);
120         registerOperatorProcessor("S", ignore);
121         registerOperatorProcessor("SC", ignore);
122         registerOperatorProcessor("sc", ignore);
123         registerOperatorProcessor("SCN", ignore);
124         registerOperatorProcessor("scn", ignore);
125         registerOperatorProcessor("sh", ignore);
126         registerOperatorProcessor("v", ignore);
127         registerOperatorProcessor("W", ignore);
128         registerOperatorProcessor("W*", ignore);
129         registerOperatorProcessor("y", ignore);
130     }
131 
132     @Override
133     protected void startDocument(PDDocument pdf) throws IOException {
134         try {
135             handler.startDocument();
136         } catch (SAXException e) {
137             throw new IOExceptionWithCause("Unable to start a document", e);
138         }
139     }
140 
141     @Override
142     protected void endDocument(PDDocument pdf) throws IOException {
143         try {
144             handler.endDocument();
145         } catch (SAXException e) {
146             throw new IOExceptionWithCause("Unable to end a document", e);
147         }
148     }
149 
150     @Override
151     protected void startPage(PDPage page) throws IOException {
152         try {
153             handler.startElement("div", "class", "page");
154             handler.startElement("p");
155         } catch (SAXException e) {
156             throw new IOExceptionWithCause("Unable to start a page", e);
157         }
158     }
159 
160     @Override
161     protected void endPage(PDPage page) throws IOException {
162         try {
163             handler.endElement("p");
164             handler.endElement("div");
165         } catch (SAXException e) {
166             throw new IOExceptionWithCause("Unable to end a page", e);
167         }
168     }
169 
170     @Override
171     protected void writeString(String text) throws IOException {
172         try {
173             handler.characters(text);
174         } catch (SAXException e) {
175             throw new IOExceptionWithCause(
176                     "Unable to write a string: " + text, e);
177         }
178     }
179 
180     @Override
181     protected void writeCharacters(TextPosition text) throws IOException {
182         try {
183             handler.characters(text.getCharacter());
184         } catch (SAXException e) {
185             throw new IOExceptionWithCause(
186                     "Unable to write a character: " + text.getCharacter(), e);
187         }
188     }
189 
190     // Two methods added to work around lack of support for processWordSeparator
191     // and processLineSeparator in PDFBox-0.7.3. This is fixed in CVS Head (PDFBox-0.7.4)
192     @Override
193     public String getWordSeparator()
194     {
195         try
196         {
197             handler.characters(" ");
198         } catch(SAXException e) {
199 
200         }
201         return super.getWordSeparator();    //To change body of overridden methods use File | Settings | File Templates.
202     }
203 
204     @Override
205     public String getLineSeparator()
206     {
207         try
208         {
209             handler.characters("\n");
210         } catch(SAXException e) {
211 
212         }
213         return super.getLineSeparator();
214     }
215 
216 //    protected void processLineSeparator(TextPosition p) throws IOException {
217 //        try {
218 //            handler.characters("\n");
219 //        } catch (SAXException e) {
220 //            throw new IOExceptionWithCause("Unable to write a newline", e);
221 //        }
222 //    }
223 //
224 //    protected void processWordSeparator(TextPosition a, TextPosition b)
225 //            throws IOException {
226 //        try {
227 //            handler.characters(" ");
228 //        } catch (SAXException e) {
229 //            throw new IOExceptionWithCause("Unable to write a space", e);
230 //        }
231 //    }
232 
233 }