View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.sax;
18  
19  import org.apache.tika.exception.TikaException;
20  import org.apache.tika.io.CountingInputStream;
21  import org.xml.sax.ContentHandler;
22  import org.xml.sax.SAXException;
23  
24  /**
25   * Content handler decorator that attempts to prevent denial of service
26   * attacks against Tika parsers.
27   * <p>
28   * Currently this class simply compares the number of output characters
29   * to to the number of input bytes, and throws an exception if the output
30   * is truly excessive when compared to the input. This is a strong indication
31   * of a zip bomb.
32   *
33   * @since Apache Tika 0.4
34   * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
35   */
36  public class SecureContentHandler extends ContentHandlerDecorator {
37  
38      /**
39       * The input stream that Tika is parsing.
40       */
41      private final CountingInputStream stream;
42  
43      /**
44       * Number of output characters that Tika has produced so far.
45       */
46      private long characterCount = 0;
47  
48      /**
49       * Output threshold.
50       */
51      private long threshold = 1000000;
52  
53      /**
54       * Maximum compression ratio.
55       */
56      private long ratio = 100;
57  
58      /**
59       * Decorates the given content handler with zip bomb prevention based
60       * on the count of bytes read from the given counting input stream.
61       * The resulting decorator can be passed to a Tika parser along with
62       * the given counting input stream.
63       *
64       * @param handler the content handler to be decorated
65       * @param stream the input stream to be parsed, wrapped into
66       *        a {@link CountingInputStream} decorator
67       */
68      public SecureContentHandler(
69              ContentHandler handler, CountingInputStream stream) {
70          super(handler);
71          this.stream = stream;
72      }
73  
74      /**
75       * Returns the configured output threshold.
76       *
77       * @return output threshold
78       */
79      public long getOutputThreshold() {
80          return threshold;
81      }
82  
83  
84      /**
85       * Sets the threshold for output characters before the zip bomb prevention
86       * is activated. This avoids false positives in cases where an otherwise
87       * normal document for some reason starts with a highly compressible
88       * sequence of bytes.
89       *
90       * @param threshold new output threshold
91       */
92      public void setOutputThreshold(long threshold) {
93          this.threshold = threshold;
94      }
95  
96  
97      /**
98       * Returns the maximum compression ratio.
99       *
100      * @return maximum compression ratio
101      */
102     public long getMaximumCompressionRatio() {
103         return ratio;
104     }
105 
106 
107     /**
108      * Sets the ratio between output characters and input bytes. If this
109      * ratio is exceeded (after the output threshold has been reached) then
110      * an exception gets thrown.
111      *
112      * @param ratio new maximum compression ratio
113      */
114     public void setMaximumCompressionRatio(long ratio) {
115         this.ratio = ratio;
116     }
117 
118     /**
119      * Converts the given {@link SAXException} to a corresponding
120      * {@link TikaException} if it's caused by this instance detecting
121      * a zip bomb.
122      *
123      * @param e SAX exception
124      * @throws TikaException zip bomb exception
125      */
126     public void throwIfCauseOf(SAXException e) throws TikaException {
127         if (e instanceof SecureSAXException
128                 && ((SecureSAXException) e).isCausedBy(this)) {
129             throw new TikaException("Zip bomb detected!", e);
130         }
131     }
132 
133     /**
134      * Records the given number of output characters (or more accurately
135      * UTF-16 code units). Throws an exception if the recorded number of
136      * characters highly exceeds the number of input bytes read.
137      *
138      * @param length number of new output characters produced
139      * @throws SAXException if a zip bomb is detected
140      */
141     private void advance(int length) throws SAXException {
142         characterCount += length;
143         if (characterCount > threshold
144                 && characterCount > stream.getByteCount() * ratio) {
145             throw new SecureSAXException();
146         }
147     }
148 
149     @Override
150     public void characters(char[] ch, int start, int length)
151             throws SAXException {
152         advance(length);
153         super.characters(ch, start, length);
154     }
155 
156     @Override
157     public void ignorableWhitespace(char[] ch, int start, int length)
158             throws SAXException {
159         advance(length);
160         super.ignorableWhitespace(ch, start, length);
161     }
162 
163     /**
164      * Private exception class used to indicate a suspected zip bomb.
165      *
166      * @see SecureContentHandler#throwIfCauseOf(SAXException)
167      */
168     private class SecureSAXException extends SAXException {
169 
170         public SecureSAXException() {
171             super("Suspected zip bomb: "
172                     + stream.getByteCount() + " input bytes produced "
173                     + characterCount + " output characters");
174         }
175 
176         public boolean isCausedBy(SecureContentHandler handler) {
177             return SecureContentHandler.this == handler;
178         }
179 
180     }
181 
182 }