View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser;
18  
19  import java.io.BufferedInputStream;
20  import java.io.IOException;
21  import java.io.InputStream;
22  
23  import org.apache.tika.config.TikaConfig;
24  import org.apache.tika.detect.Detector;
25  import org.apache.tika.exception.TikaException;
26  import org.apache.tika.io.CountingInputStream;
27  import org.apache.tika.metadata.Metadata;
28  import org.apache.tika.mime.MediaType;
29  import org.apache.tika.sax.SecureContentHandler;
30  import org.xml.sax.ContentHandler;
31  import org.xml.sax.SAXException;
32  
33  public class AutoDetectParser extends CompositeParser {
34  
35      /**
36       * The type detector used by this parser to auto-detect the type
37       * of a document.
38       */
39      private Detector detector; // always set in the constructor
40  
41      /**
42       * Creates an auto-detecting parser instance using the default Tika
43       * configuration.
44       */
45      public AutoDetectParser() {
46          this(TikaConfig.getDefaultConfig());
47      }
48  
49      public AutoDetectParser(TikaConfig config) {
50          setConfig(config);
51      }
52  
53      public void setConfig(TikaConfig config) {
54          setParsers(config.getParsers());
55          setDetector(config.getMimeRepository());
56      }
57  
58      /**
59       * Returns the type detector used by this parser to auto-detect the type
60       * of a document.
61       *
62       * @return type detector
63       * @since Apache Tika 0.4
64       */
65      public Detector getDetector() {
66          return detector;
67      }
68  
69      /**
70       * Sets the type detector used by this parser to auto-detect the type
71       * of a document. Note that calling the {@link #setConfig(TikaConfig)}
72       * method will override the type detector setting with the type settings
73       * included in the given configuration.
74       *
75       * @param detector type detector
76       * @since Apache Tika 0.4
77       */
78      public void setDetector(Detector detector) {
79          this.detector = detector;
80      }
81  
82      public void parse(
83              InputStream stream, ContentHandler handler,
84              Metadata metadata, ParseContext context)
85              throws IOException, SAXException, TikaException {
86          // We need (reliable!) mark support for type detection before parsing
87          stream = new BufferedInputStream(stream);
88  
89          // Automatically detect the MIME type of the document 
90          MediaType type = detector.detect(stream, metadata);
91          metadata.set(Metadata.CONTENT_TYPE, type.toString());
92  
93          // TIKA-216: Zip bomb prevention
94          CountingInputStream count = new CountingInputStream(stream);
95          SecureContentHandler secure = new SecureContentHandler(handler, count);
96  
97          // Parse the document
98          try {
99              super.parse(count, secure, metadata, context);
100         } catch (SAXException e) {
101             // Convert zip bomb exceptions to TikaExceptions
102             secure.throwIfCauseOf(e);
103             throw e;
104         }
105     }
106 
107     public void parse(
108             InputStream stream, ContentHandler handler, Metadata metadata)
109             throws IOException, SAXException, TikaException {
110         ParseContext context = new ParseContext();
111         context.set(Parser.class, this);
112         parse(stream, handler, metadata, context);
113     }
114 
115 }