View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.io.InputStreamReader;
22  import java.io.OutputStream;
23  import java.io.Reader;
24  import java.util.Collections;
25  import java.util.HashSet;
26  import java.util.Set;
27  
28  import org.apache.tika.exception.TikaException;
29  import org.apache.tika.io.IOUtils;
30  import org.apache.tika.io.NullOutputStream;
31  import org.apache.tika.metadata.Metadata;
32  import org.apache.tika.mime.MediaType;
33  import org.apache.tika.sax.XHTMLContentHandler;
34  import org.xml.sax.ContentHandler;
35  import org.xml.sax.SAXException;
36  
37  /**
38   * Parser that uses an external program (like catdoc or pdf2txt) to extract
39   * text content from a given document.
40   */
41  public class ExternalParser implements Parser {
42  
43      /**
44       * Media types supported by the external program.
45       */
46      private Set<MediaType> supportedTypes = Collections.emptySet();
47  
48      /**
49       * The external command to invoke.
50       * @see Runtime#exec(String)
51       */
52      private String command = "cat";
53  
54      public Set<MediaType> getSupportedTypes(ParseContext context) {
55          return getSupportedTypes();
56      }
57  
58      public Set<MediaType> getSupportedTypes() {
59          return supportedTypes;
60      }
61  
62      public void setSupportedTypes(Set<MediaType> supportedTypes) {
63          this.supportedTypes =
64              Collections.unmodifiableSet(new HashSet<MediaType>(supportedTypes));
65      }
66  
67      public String getCommand() {
68          return command;
69      }
70  
71      public void setCommand(String command) {
72          this.command = command;
73      }
74  
75      /**
76       * Executes the configured external command and passes the given document
77       * stream as a simple XHTML document to the given SAX content handler.
78       * No metadata is extracted.
79       */
80      public void parse(
81              final InputStream stream, ContentHandler handler,
82              Metadata metadata, ParseContext context)
83              throws IOException, SAXException, TikaException {
84          XHTMLContentHandler xhtml =
85              new XHTMLContentHandler(handler, metadata);
86  
87          Process process = Runtime.getRuntime().exec(command);
88          try {
89              sendInput(process, stream);
90              ignoreError(process);
91              extractOutput(process, xhtml);
92          } finally {
93              try {
94                  process.waitFor();
95              } catch (InterruptedException ignore) {
96              }
97          }
98      }
99  
100     /**
101      * @deprecated This method will be removed in Apache Tika 1.0.
102      */
103     public void parse(
104             InputStream stream, ContentHandler handler, Metadata metadata)
105             throws IOException, SAXException, TikaException {
106         parse(stream, handler, metadata, new ParseContext());
107     }
108 
109     /**
110      * Starts a thread that extracts the contents of the standard output
111      * stream of the given process to the given XHTML content handler.
112      * The standard output stream is closed once fully processed.
113      *
114      * @param process process
115      * @param xhtml XHTML content handler
116      * @throws SAXException if the XHTML SAX events could not be handled
117      * @throws IOException if an input error occurred
118      */
119     private void extractOutput(Process process, XHTMLContentHandler xhtml)
120             throws SAXException, IOException {
121         Reader reader = new InputStreamReader(process.getInputStream());
122         try {
123             xhtml.startDocument();
124             xhtml.startElement("p");
125             char[] buffer = new char[1024];
126             for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
127                 xhtml.characters(buffer, 0, n);
128             }
129             xhtml.endElement("p");
130             xhtml.endDocument();
131         } finally {
132             reader.close();
133         }
134     }
135 
136     /**
137      * Starts a thread that sends the contents of the given input stream
138      * to the standard input stream of the given process. Potential
139      * exceptions are ignored, and the standard input stream is closed
140      * once fully processed. Note that the given input stream is <em>not</em>
141      * closed by this method.
142      *
143      * @param process process
144      * @param stream input stream
145      */
146     private void sendInput(final Process process, final InputStream stream) {
147         new Thread() {
148             public void run() {
149                 OutputStream stdin = process.getOutputStream();
150                 try {
151                     IOUtils.copy(stream, stdin);
152                 } catch (IOException e) {
153                 } finally {
154                     IOUtils.closeQuietly(stdin);
155                 }
156             }
157         }.start();
158     }
159 
160     /**
161      * Starts a thread that reads and discards the contents of the
162      * standard error stream of the given process. Potential exceptions
163      * are ignored, and the error stream is closed once fully processed.
164      *
165      * @param process process
166      */
167     private void ignoreError(final Process process) {
168         new Thread() {
169             public void run() {
170                 InputStream error = process.getErrorStream();
171                 try {
172                     IOUtils.copy(error, new NullOutputStream());
173                 } catch (IOException e) {
174                 } finally {
175                     IOUtils.closeQuietly(error);
176                 }
177             }
178         }.start();
179     }
180 
181 }