1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.InputStreamReader;
22 import java.io.OutputStream;
23 import java.io.Reader;
24 import java.util.Collections;
25 import java.util.HashSet;
26 import java.util.Set;
27
28 import org.apache.tika.exception.TikaException;
29 import org.apache.tika.io.IOUtils;
30 import org.apache.tika.io.NullOutputStream;
31 import org.apache.tika.metadata.Metadata;
32 import org.apache.tika.mime.MediaType;
33 import org.apache.tika.sax.XHTMLContentHandler;
34 import org.xml.sax.ContentHandler;
35 import org.xml.sax.SAXException;
36
37
38
39
40
41 public class ExternalParser implements Parser {
42
43
44
45
46 private Set<MediaType> supportedTypes = Collections.emptySet();
47
48
49
50
51
52 private String command = "cat";
53
54 public Set<MediaType> getSupportedTypes(ParseContext context) {
55 return getSupportedTypes();
56 }
57
58 public Set<MediaType> getSupportedTypes() {
59 return supportedTypes;
60 }
61
62 public void setSupportedTypes(Set<MediaType> supportedTypes) {
63 this.supportedTypes =
64 Collections.unmodifiableSet(new HashSet<MediaType>(supportedTypes));
65 }
66
67 public String getCommand() {
68 return command;
69 }
70
71 public void setCommand(String command) {
72 this.command = command;
73 }
74
75
76
77
78
79
80 public void parse(
81 final InputStream stream, ContentHandler handler,
82 Metadata metadata, ParseContext context)
83 throws IOException, SAXException, TikaException {
84 XHTMLContentHandler xhtml =
85 new XHTMLContentHandler(handler, metadata);
86
87 Process process = Runtime.getRuntime().exec(command);
88 try {
89 sendInput(process, stream);
90 ignoreError(process);
91 extractOutput(process, xhtml);
92 } finally {
93 try {
94 process.waitFor();
95 } catch (InterruptedException ignore) {
96 }
97 }
98 }
99
100
101
102
103 public void parse(
104 InputStream stream, ContentHandler handler, Metadata metadata)
105 throws IOException, SAXException, TikaException {
106 parse(stream, handler, metadata, new ParseContext());
107 }
108
109
110
111
112
113
114
115
116
117
118
119 private void extractOutput(Process process, XHTMLContentHandler xhtml)
120 throws SAXException, IOException {
121 Reader reader = new InputStreamReader(process.getInputStream());
122 try {
123 xhtml.startDocument();
124 xhtml.startElement("p");
125 char[] buffer = new char[1024];
126 for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
127 xhtml.characters(buffer, 0, n);
128 }
129 xhtml.endElement("p");
130 xhtml.endDocument();
131 } finally {
132 reader.close();
133 }
134 }
135
136
137
138
139
140
141
142
143
144
145
146 private void sendInput(final Process process, final InputStream stream) {
147 new Thread() {
148 public void run() {
149 OutputStream stdin = process.getOutputStream();
150 try {
151 IOUtils.copy(stream, stdin);
152 } catch (IOException e) {
153 } finally {
154 IOUtils.closeQuietly(stdin);
155 }
156 }
157 }.start();
158 }
159
160
161
162
163
164
165
166
167 private void ignoreError(final Process process) {
168 new Thread() {
169 public void run() {
170 InputStream error = process.getErrorStream();
171 try {
172 IOUtils.copy(error, new NullOutputStream());
173 } catch (IOException e) {
174 } finally {
175 IOUtils.closeQuietly(error);
176 }
177 }
178 }.start();
179 }
180
181 }