View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser;
18  
19  import java.io.BufferedReader;
20  import java.io.File;
21  import java.io.FileInputStream;
22  import java.io.FileNotFoundException;
23  import java.io.IOException;
24  import java.io.InputStream;
25  import java.io.PipedReader;
26  import java.io.PipedWriter;
27  import java.io.Reader;
28  import java.io.Writer;
29  import java.util.concurrent.Executor;
30  
31  import org.apache.tika.metadata.Metadata;
32  import org.apache.tika.sax.BodyContentHandler;
33  import org.xml.sax.ContentHandler;
34  
35  /**
36   * Reader for the text content from a given binary stream. This class
37   * uses a background parsing task with a {@link Parser}
38   * ({@link AutoDetectParser} by default) to parse the text content from
39   * a given input stream. The {@link BodyContentHandler} class and a pipe
40   * is used to convert the push-based SAX event stream to the pull-based
41   * character stream defined by the {@link Reader} interface.
42   *
43   * @since Apache Tika 0.2
44   */
45  public class ParsingReader extends Reader {
46  
47      /**
48       * Parser instance used for parsing the given binary stream.
49       */
50      private final Parser parser;
51  
52      /**
53       * Buffered read end of the pipe.
54       */
55      private final Reader reader;
56  
57      /**
58       * Write end of the pipe.
59       */
60      private final Writer writer;
61  
62      /**
63       * The binary stream being parsed.
64       */
65      private final InputStream stream;
66  
67      /**
68       * Metadata associated with the document being parsed.
69       */
70      private final Metadata metadata;
71  
72      /**
73       * The parse context.
74       */
75      private final ParseContext context;
76  
77      /**
78       * An exception (if any) thrown by the parsing thread.
79       */
80      private transient Throwable throwable;
81  
82      /**
83       * Utility method that returns a {@link Metadata} instance
84       * for a document with the given name.
85       *
86       * @param name resource name (or <code>null</code>)
87       * @return metadata instance
88       */
89      private static Metadata getMetadata(String name) {
90          Metadata metadata = new Metadata();
91          if (name != null && name.length() > 0) {
92              metadata.set(Metadata.RESOURCE_NAME_KEY, name);
93          }
94          return metadata;
95      }
96  
97      /**
98       * Creates a reader for the text content of the given binary stream.
99       *
100      * @param stream binary stream
101      * @throws IOException if the document can not be parsed
102      */
103     public ParsingReader(InputStream stream) throws IOException {
104         this(new AutoDetectParser(), stream, new Metadata());
105     }
106 
107     /**
108      * Creates a reader for the text content of the given binary stream
109      * with the given name.
110      *
111      * @param stream binary stream
112      * @param name document name
113      * @throws IOException if the document can not be parsed
114      */
115     public ParsingReader(InputStream stream, String name) throws IOException {
116         this(new AutoDetectParser(), stream, getMetadata(name));
117     }
118 
119     /**
120      * Creates a reader for the text content of the given file.
121      *
122      * @param file file
123      * @throws FileNotFoundException if the given file does not exist
124      * @throws IOException if the document can not be parsed
125      */
126     public ParsingReader(File file) throws FileNotFoundException, IOException {
127         this(new FileInputStream(file), file.getName());
128     }
129 
130     /**
131      * Creates a reader for the text content of the given binary stream
132      * with the given document metadata. The given parser is used for
133      * parsing. A new background thread is started for the parsing task.
134      *
135      * @param parser parser instance
136      * @param stream binary stream
137      * @param metadata document metadata
138      * @throws IOException if the document can not be parsed
139      */
140     public ParsingReader(
141             Parser parser, InputStream stream, final Metadata metadata,
142             ParseContext context) throws IOException {
143         this(parser, stream, metadata, context, new Executor() {
144             public void execute(Runnable command) {
145                 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
146                 if (name != null) {
147                     name = "Apache Tika: " + name;
148                 } else {
149                     name = "Apache Tika";
150                 }
151                 Thread thread = new Thread(command, name);
152                 thread.setDaemon(true);
153                 thread.start();
154             }
155         });
156     }
157 
158     /**
159      * Creates a reader for the text content of the given binary stream
160      * with the given document metadata. The given parser is used for the
161      * parsing task that is run with the given executor. The given executor
162      * <em>must</em> run the parsing task asynchronously in a separate thread,
163      * since the current thread must return to the caller that can then
164      * consume the parsed text through the {@link Reader} interface.
165      *
166      * @param parser parser instance
167      * @param stream binary stream
168      * @param metadata document metadata
169      * @param context parsing context
170      * @param executor executor for the parsing task
171      * @throws IOException if the document can not be parsed
172      * @since Apache Tika 0.4
173      */
174     public ParsingReader(
175             Parser parser, InputStream stream, Metadata metadata,
176             ParseContext context, Executor executor) throws IOException {
177         this.parser = parser;
178         PipedReader pipedReader = new PipedReader();
179         this.reader = new BufferedReader(pipedReader);
180         try {
181             this.writer = new PipedWriter(pipedReader);
182         } catch (IOException e) {
183             throw new IllegalStateException(e); // Should never happen
184         }
185         this.stream = stream;
186         this.metadata = metadata;
187         this.context = context;
188 
189         executor.execute(new ParsingTask());
190 
191         // TIKA-203: Buffer first character to force metadata extraction
192         reader.mark(1);
193         reader.read();
194         reader.reset();
195     }
196 
197     /**
198      * @deprecated This method will be removed in Apache Tika 1.0
199      * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
200      */
201     public ParsingReader(Parser parser, InputStream stream, Metadata metadata)
202             throws IOException {
203         this(parser, stream, metadata, new ParseContext());
204         context.set(Parser.class, parser);
205     }
206 
207     /**
208      * @deprecated This method will be removed in Apache Tika 1.0
209      * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
210      */
211     public ParsingReader(
212             Parser parser, InputStream stream, Metadata metadata,
213             Executor executor) throws IOException {
214         this(parser, stream, metadata, new ParseContext(), executor);
215         context.set(Parser.class, parser);
216     }
217 
218     /**
219      * The background parsing task.
220      */
221     private class ParsingTask implements Runnable {
222 
223         /**
224          * Parses the given binary stream and writes the text content
225          * to the write end of the pipe. Potential exceptions (including
226          * the one caused if the read end is closed unexpectedly) are
227          * stored before the input stream is closed and processing is stopped.
228          */
229         public void run() {
230             try {
231                 ContentHandler handler = new BodyContentHandler(writer);
232                 parser.parse(stream, handler, metadata, context);
233             } catch (Throwable t) {
234                 throwable = t;
235             }
236 
237             try {
238                 stream.close();
239             } catch (Throwable t) {
240                 if (throwable == null) {
241                     throwable = t;
242                 }
243             }
244 
245             try {
246                 writer.close();
247             } catch (Throwable t) {
248                 if (throwable == null) {
249                     throwable = t;
250                 }
251             }
252         }
253 
254     }
255 
256     /**
257      * Reads parsed text from the pipe connected to the parsing thread.
258      * Fails if the parsing thread has thrown an exception.
259      *
260      * @param cbuf character buffer
261      * @param off start offset within the buffer
262      * @param len maximum number of characters to read
263      * @throws IOException if the parsing thread has failed or
264      *                     if for some reason the pipe does not work properly
265      */
266     @Override
267     public int read(char[] cbuf, int off, int len) throws IOException {
268         if (throwable instanceof IOException) {
269             throw (IOException) throwable;
270         } else if (throwable != null) {
271             IOException exception = new IOException("");
272             exception.initCause(throwable);
273             throw exception;
274         }
275         return reader.read(cbuf, off, len);
276     }
277 
278     /**
279      * Closes the read end of the pipe. If the parsing thread is still
280      * running, next write to the pipe will fail and cause the thread
281      * to stop. Thus there is no need to explicitly terminate the thread.
282      *
283      * @throws IOException if the pipe can not be closed
284      */
285     @Override
286     public void close() throws IOException {
287         reader.close();
288     }
289 
290 }