1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.pdf;
18
19 import java.io.IOException;
20 import java.util.List;
21
22 import org.apache.pdfbox.pdmodel.PDDocument;
23 import org.apache.pdfbox.pdmodel.PDPage;
24 import org.apache.pdfbox.util.PDFOperator;
25 import org.apache.pdfbox.util.PDFTextStripper;
26 import org.apache.pdfbox.util.TextPosition;
27 import org.apache.pdfbox.util.operator.OperatorProcessor;
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.io.IOExceptionWithCause;
30 import org.apache.tika.metadata.Metadata;
31 import org.apache.tika.sax.XHTMLContentHandler;
32 import org.xml.sax.ContentHandler;
33 import org.xml.sax.SAXException;
34
35
36
37
38
39
40 class PDF2XHTML extends PDFTextStripper {
41
42
43
44
45
46
47
48
49
50
51
52 public static void process(
53 PDDocument document, ContentHandler handler, Metadata metadata)
54 throws SAXException, TikaException {
55 try {
56 new PDF2XHTML(handler, metadata).getText(document);
57 } catch (IOException e) {
58 if (e.getCause() instanceof SAXException) {
59 throw (SAXException) e.getCause();
60 } else {
61 throw new TikaException("Unable to extract PDF content", e);
62 }
63 }
64 }
65
66 private final XHTMLContentHandler handler;
67
68 private PDF2XHTML(ContentHandler handler, Metadata metadata)
69 throws IOException {
70 this.handler = new XHTMLContentHandler(handler, metadata);
71
72
73
74 OperatorProcessor ignore = new OperatorProcessor() {
75 @Override @SuppressWarnings("unchecked")
76 public void process(PDFOperator operator, List arguments) {
77 }
78 };
79 registerOperatorProcessor("b", ignore);
80 registerOperatorProcessor("B", ignore);
81 registerOperatorProcessor("b*", ignore);
82 registerOperatorProcessor("B*", ignore);
83 registerOperatorProcessor("BDC", ignore);
84 registerOperatorProcessor("BI", ignore);
85 registerOperatorProcessor("BMC", ignore);
86 registerOperatorProcessor("b", ignore);
87 registerOperatorProcessor("BX", ignore);
88 registerOperatorProcessor("c", ignore);
89 registerOperatorProcessor("CS", ignore);
90 registerOperatorProcessor("cs", ignore);
91 registerOperatorProcessor("d", ignore);
92 registerOperatorProcessor("d0", ignore);
93 registerOperatorProcessor("d1", ignore);
94 registerOperatorProcessor("DP", ignore);
95 registerOperatorProcessor("El", ignore);
96 registerOperatorProcessor("EMC", ignore);
97 registerOperatorProcessor("EX", ignore);
98 registerOperatorProcessor("f", ignore);
99 registerOperatorProcessor("F", ignore);
100 registerOperatorProcessor("f*", ignore);
101 registerOperatorProcessor("G", ignore);
102 registerOperatorProcessor("g", ignore);
103 registerOperatorProcessor("h", ignore);
104 registerOperatorProcessor("i", ignore);
105 registerOperatorProcessor("ID", ignore);
106 registerOperatorProcessor("j", ignore);
107 registerOperatorProcessor("J", ignore);
108 registerOperatorProcessor("K", ignore);
109 registerOperatorProcessor("k", ignore);
110 registerOperatorProcessor("l", ignore);
111 registerOperatorProcessor("m", ignore);
112 registerOperatorProcessor("M", ignore);
113 registerOperatorProcessor("MP", ignore);
114 registerOperatorProcessor("n", ignore);
115 registerOperatorProcessor("re", ignore);
116 registerOperatorProcessor("RG", ignore);
117 registerOperatorProcessor("rg", ignore);
118 registerOperatorProcessor("ri", ignore);
119 registerOperatorProcessor("s", ignore);
120 registerOperatorProcessor("S", ignore);
121 registerOperatorProcessor("SC", ignore);
122 registerOperatorProcessor("sc", ignore);
123 registerOperatorProcessor("SCN", ignore);
124 registerOperatorProcessor("scn", ignore);
125 registerOperatorProcessor("sh", ignore);
126 registerOperatorProcessor("v", ignore);
127 registerOperatorProcessor("W", ignore);
128 registerOperatorProcessor("W*", ignore);
129 registerOperatorProcessor("y", ignore);
130 }
131
132 @Override
133 protected void startDocument(PDDocument pdf) throws IOException {
134 try {
135 handler.startDocument();
136 } catch (SAXException e) {
137 throw new IOExceptionWithCause("Unable to start a document", e);
138 }
139 }
140
141 @Override
142 protected void endDocument(PDDocument pdf) throws IOException {
143 try {
144 handler.endDocument();
145 } catch (SAXException e) {
146 throw new IOExceptionWithCause("Unable to end a document", e);
147 }
148 }
149
150 @Override
151 protected void startPage(PDPage page) throws IOException {
152 try {
153 handler.startElement("div", "class", "page");
154 handler.startElement("p");
155 } catch (SAXException e) {
156 throw new IOExceptionWithCause("Unable to start a page", e);
157 }
158 }
159
160 @Override
161 protected void endPage(PDPage page) throws IOException {
162 try {
163 handler.endElement("p");
164 handler.endElement("div");
165 } catch (SAXException e) {
166 throw new IOExceptionWithCause("Unable to end a page", e);
167 }
168 }
169
170 @Override
171 protected void writeString(String text) throws IOException {
172 try {
173 handler.characters(text);
174 } catch (SAXException e) {
175 throw new IOExceptionWithCause(
176 "Unable to write a string: " + text, e);
177 }
178 }
179
180 @Override
181 protected void writeCharacters(TextPosition text) throws IOException {
182 try {
183 handler.characters(text.getCharacter());
184 } catch (SAXException e) {
185 throw new IOExceptionWithCause(
186 "Unable to write a character: " + text.getCharacter(), e);
187 }
188 }
189
190
191
192 @Override
193 public String getWordSeparator()
194 {
195 try
196 {
197 handler.characters(" ");
198 } catch(SAXException e) {
199
200 }
201 return super.getWordSeparator();
202 }
203
204 @Override
205 public String getLineSeparator()
206 {
207 try
208 {
209 handler.characters("\n");
210 } catch(SAXException e) {
211
212 }
213 return super.getLineSeparator();
214 }
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233 }