1 /** 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.tika.parser; 18 19 import java.io.IOException; 20 import java.io.InputStream; 21 import java.util.Set; 22 23 import org.apache.tika.exception.TikaException; 24 import org.apache.tika.metadata.Metadata; 25 import org.apache.tika.mime.MediaType; 26 import org.xml.sax.ContentHandler; 27 import org.xml.sax.SAXException; 28 29 /** 30 * Tika parser interface. 31 */ 32 public interface Parser { 33 34 /** 35 * Returns the set of media types supported by this parser when used 36 * with the given parse context. 37 * 38 * @since Apache Tika 0.7 39 * @param context parse context 40 * @return immutable set of media types 41 */ 42 Set<MediaType> getSupportedTypes(ParseContext context); 43 44 /** 45 * Parses a document stream into a sequence of XHTML SAX events. 46 * Fills in related document metadata in the given metadata object. 47 * <p> 48 * The given document stream is consumed but not closed by this method. 49 * The responsibility to close the stream remains on the caller. 50 * <p> 51 * Information about the parsing context can be passed in the context 52 * parameter. See the parser implementations for the kinds of context 53 * information they expect. 54 * 55 * @since Apache Tika 0.5 56 * @param stream the document stream (input) 57 * @param handler handler for the XHTML SAX events (output) 58 * @param metadata document metadata (input and output) 59 * @param context parse context 60 * @throws IOException if the document stream could not be read 61 * @throws SAXException if the SAX events could not be processed 62 * @throws TikaException if the document could not be parsed 63 */ 64 void parse( 65 InputStream stream, ContentHandler handler, 66 Metadata metadata, ParseContext context) 67 throws IOException, SAXException, TikaException; 68 69 /** 70 * The parse() method from Tika 0.4 and earlier. Please use the 71 * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} 72 * method instead in new code. Calls to this backwards compatibility 73 * method are forwarded to the new parse() method with an empty parse 74 * context. 75 * 76 * @deprecated This method will be removed in Apache Tika 1.0. 77 */ 78 void parse(InputStream stream, ContentHandler handler, Metadata metadata) 79 throws IOException, SAXException, TikaException; 80 81 }