1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.tika.parser; 18 19 import java.io.BufferedReader; 20 import java.io.File; 21 import java.io.FileInputStream; 22 import java.io.FileNotFoundException; 23 import java.io.IOException; 24 import java.io.InputStream; 25 import java.io.PipedReader; 26 import java.io.PipedWriter; 27 import java.io.Reader; 28 import java.io.Writer; 29 import java.util.concurrent.Executor; 30 31 import org.apache.tika.metadata.Metadata; 32 import org.apache.tika.sax.BodyContentHandler; 33 import org.xml.sax.ContentHandler; 34 35 /** 36 * Reader for the text content from a given binary stream. This class 37 * uses a background parsing task with a {@link Parser} 38 * ({@link AutoDetectParser} by default) to parse the text content from 39 * a given input stream. The {@link BodyContentHandler} class and a pipe 40 * is used to convert the push-based SAX event stream to the pull-based 41 * character stream defined by the {@link Reader} interface. 42 * 43 * @since Apache Tika 0.2 44 */ 45 public class ParsingReader extends Reader { 46 47 /** 48 * Parser instance used for parsing the given binary stream. 49 */ 50 private final Parser parser; 51 52 /** 53 * Buffered read end of the pipe. 54 */ 55 private final Reader reader; 56 57 /** 58 * Write end of the pipe. 59 */ 60 private final Writer writer; 61 62 /** 63 * The binary stream being parsed. 64 */ 65 private final InputStream stream; 66 67 /** 68 * Metadata associated with the document being parsed. 69 */ 70 private final Metadata metadata; 71 72 /** 73 * The parse context. 74 */ 75 private final ParseContext context; 76 77 /** 78 * An exception (if any) thrown by the parsing thread. 79 */ 80 private transient Throwable throwable; 81 82 /** 83 * Utility method that returns a {@link Metadata} instance 84 * for a document with the given name. 85 * 86 * @param name resource name (or <code>null</code>) 87 * @return metadata instance 88 */ 89 private static Metadata getMetadata(String name) { 90 Metadata metadata = new Metadata(); 91 if (name != null && name.length() > 0) { 92 metadata.set(Metadata.RESOURCE_NAME_KEY, name); 93 } 94 return metadata; 95 } 96 97 /** 98 * Creates a reader for the text content of the given binary stream. 99 * 100 * @param stream binary stream 101 * @throws IOException if the document can not be parsed 102 */ 103 public ParsingReader(InputStream stream) throws IOException { 104 this(new AutoDetectParser(), stream, new Metadata()); 105 } 106 107 /** 108 * Creates a reader for the text content of the given binary stream 109 * with the given name. 110 * 111 * @param stream binary stream 112 * @param name document name 113 * @throws IOException if the document can not be parsed 114 */ 115 public ParsingReader(InputStream stream, String name) throws IOException { 116 this(new AutoDetectParser(), stream, getMetadata(name)); 117 } 118 119 /** 120 * Creates a reader for the text content of the given file. 121 * 122 * @param file file 123 * @throws FileNotFoundException if the given file does not exist 124 * @throws IOException if the document can not be parsed 125 */ 126 public ParsingReader(File file) throws FileNotFoundException, IOException { 127 this(new FileInputStream(file), file.getName()); 128 } 129 130 /** 131 * Creates a reader for the text content of the given binary stream 132 * with the given document metadata. The given parser is used for 133 * parsing. A new background thread is started for the parsing task. 134 * 135 * @param parser parser instance 136 * @param stream binary stream 137 * @param metadata document metadata 138 * @throws IOException if the document can not be parsed 139 */ 140 public ParsingReader( 141 Parser parser, InputStream stream, final Metadata metadata, 142 ParseContext context) throws IOException { 143 this(parser, stream, metadata, context, new Executor() { 144 public void execute(Runnable command) { 145 String name = metadata.get(Metadata.RESOURCE_NAME_KEY); 146 if (name != null) { 147 name = "Apache Tika: " + name; 148 } else { 149 name = "Apache Tika"; 150 } 151 Thread thread = new Thread(command, name); 152 thread.setDaemon(true); 153 thread.start(); 154 } 155 }); 156 } 157 158 /** 159 * Creates a reader for the text content of the given binary stream 160 * with the given document metadata. The given parser is used for the 161 * parsing task that is run with the given executor. The given executor 162 * <em>must</em> run the parsing task asynchronously in a separate thread, 163 * since the current thread must return to the caller that can then 164 * consume the parsed text through the {@link Reader} interface. 165 * 166 * @param parser parser instance 167 * @param stream binary stream 168 * @param metadata document metadata 169 * @param context parsing context 170 * @param executor executor for the parsing task 171 * @throws IOException if the document can not be parsed 172 * @since Apache Tika 0.4 173 */ 174 public ParsingReader( 175 Parser parser, InputStream stream, Metadata metadata, 176 ParseContext context, Executor executor) throws IOException { 177 this.parser = parser; 178 PipedReader pipedReader = new PipedReader(); 179 this.reader = new BufferedReader(pipedReader); 180 try { 181 this.writer = new PipedWriter(pipedReader); 182 } catch (IOException e) { 183 throw new IllegalStateException(e); // Should never happen 184 } 185 this.stream = stream; 186 this.metadata = metadata; 187 this.context = context; 188 189 executor.execute(new ParsingTask()); 190 191 // TIKA-203: Buffer first character to force metadata extraction 192 reader.mark(1); 193 reader.read(); 194 reader.reset(); 195 } 196 197 /** 198 * @deprecated This method will be removed in Apache Tika 1.0 199 * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> 200 */ 201 public ParsingReader(Parser parser, InputStream stream, Metadata metadata) 202 throws IOException { 203 this(parser, stream, metadata, new ParseContext()); 204 context.set(Parser.class, parser); 205 } 206 207 /** 208 * @deprecated This method will be removed in Apache Tika 1.0 209 * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> 210 */ 211 public ParsingReader( 212 Parser parser, InputStream stream, Metadata metadata, 213 Executor executor) throws IOException { 214 this(parser, stream, metadata, new ParseContext(), executor); 215 context.set(Parser.class, parser); 216 } 217 218 /** 219 * The background parsing task. 220 */ 221 private class ParsingTask implements Runnable { 222 223 /** 224 * Parses the given binary stream and writes the text content 225 * to the write end of the pipe. Potential exceptions (including 226 * the one caused if the read end is closed unexpectedly) are 227 * stored before the input stream is closed and processing is stopped. 228 */ 229 public void run() { 230 try { 231 ContentHandler handler = new BodyContentHandler(writer); 232 parser.parse(stream, handler, metadata, context); 233 } catch (Throwable t) { 234 throwable = t; 235 } 236 237 try { 238 stream.close(); 239 } catch (Throwable t) { 240 if (throwable == null) { 241 throwable = t; 242 } 243 } 244 245 try { 246 writer.close(); 247 } catch (Throwable t) { 248 if (throwable == null) { 249 throwable = t; 250 } 251 } 252 } 253 254 } 255 256 /** 257 * Reads parsed text from the pipe connected to the parsing thread. 258 * Fails if the parsing thread has thrown an exception. 259 * 260 * @param cbuf character buffer 261 * @param off start offset within the buffer 262 * @param len maximum number of characters to read 263 * @throws IOException if the parsing thread has failed or 264 * if for some reason the pipe does not work properly 265 */ 266 @Override 267 public int read(char[] cbuf, int off, int len) throws IOException { 268 if (throwable instanceof IOException) { 269 throw (IOException) throwable; 270 } else if (throwable != null) { 271 IOException exception = new IOException(""); 272 exception.initCause(throwable); 273 throw exception; 274 } 275 return reader.read(cbuf, off, len); 276 } 277 278 /** 279 * Closes the read end of the pipe. If the parsing thread is still 280 * running, next write to the pipe will fail and cause the thread 281 * to stop. Thus there is no need to explicitly terminate the thread. 282 * 283 * @throws IOException if the pipe can not be closed 284 */ 285 @Override 286 public void close() throws IOException { 287 reader.close(); 288 } 289 290 }