1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.tika.sax; 18 19 import org.apache.tika.exception.TikaException; 20 import org.apache.tika.io.CountingInputStream; 21 import org.xml.sax.ContentHandler; 22 import org.xml.sax.SAXException; 23 24 /** 25 * Content handler decorator that attempts to prevent denial of service 26 * attacks against Tika parsers. 27 * <p> 28 * Currently this class simply compares the number of output characters 29 * to to the number of input bytes, and throws an exception if the output 30 * is truly excessive when compared to the input. This is a strong indication 31 * of a zip bomb. 32 * 33 * @since Apache Tika 0.4 34 * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a> 35 */ 36 public class SecureContentHandler extends ContentHandlerDecorator { 37 38 /** 39 * The input stream that Tika is parsing. 40 */ 41 private final CountingInputStream stream; 42 43 /** 44 * Number of output characters that Tika has produced so far. 45 */ 46 private long characterCount = 0; 47 48 /** 49 * Output threshold. 50 */ 51 private long threshold = 1000000; 52 53 /** 54 * Maximum compression ratio. 55 */ 56 private long ratio = 100; 57 58 /** 59 * Decorates the given content handler with zip bomb prevention based 60 * on the count of bytes read from the given counting input stream. 61 * The resulting decorator can be passed to a Tika parser along with 62 * the given counting input stream. 63 * 64 * @param handler the content handler to be decorated 65 * @param stream the input stream to be parsed, wrapped into 66 * a {@link CountingInputStream} decorator 67 */ 68 public SecureContentHandler( 69 ContentHandler handler, CountingInputStream stream) { 70 super(handler); 71 this.stream = stream; 72 } 73 74 /** 75 * Returns the configured output threshold. 76 * 77 * @return output threshold 78 */ 79 public long getOutputThreshold() { 80 return threshold; 81 } 82 83 84 /** 85 * Sets the threshold for output characters before the zip bomb prevention 86 * is activated. This avoids false positives in cases where an otherwise 87 * normal document for some reason starts with a highly compressible 88 * sequence of bytes. 89 * 90 * @param threshold new output threshold 91 */ 92 public void setOutputThreshold(long threshold) { 93 this.threshold = threshold; 94 } 95 96 97 /** 98 * Returns the maximum compression ratio. 99 * 100 * @return maximum compression ratio 101 */ 102 public long getMaximumCompressionRatio() { 103 return ratio; 104 } 105 106 107 /** 108 * Sets the ratio between output characters and input bytes. If this 109 * ratio is exceeded (after the output threshold has been reached) then 110 * an exception gets thrown. 111 * 112 * @param ratio new maximum compression ratio 113 */ 114 public void setMaximumCompressionRatio(long ratio) { 115 this.ratio = ratio; 116 } 117 118 /** 119 * Converts the given {@link SAXException} to a corresponding 120 * {@link TikaException} if it's caused by this instance detecting 121 * a zip bomb. 122 * 123 * @param e SAX exception 124 * @throws TikaException zip bomb exception 125 */ 126 public void throwIfCauseOf(SAXException e) throws TikaException { 127 if (e instanceof SecureSAXException 128 && ((SecureSAXException) e).isCausedBy(this)) { 129 throw new TikaException("Zip bomb detected!", e); 130 } 131 } 132 133 /** 134 * Records the given number of output characters (or more accurately 135 * UTF-16 code units). Throws an exception if the recorded number of 136 * characters highly exceeds the number of input bytes read. 137 * 138 * @param length number of new output characters produced 139 * @throws SAXException if a zip bomb is detected 140 */ 141 private void advance(int length) throws SAXException { 142 characterCount += length; 143 if (characterCount > threshold 144 && characterCount > stream.getByteCount() * ratio) { 145 throw new SecureSAXException(); 146 } 147 } 148 149 @Override 150 public void characters(char[] ch, int start, int length) 151 throws SAXException { 152 advance(length); 153 super.characters(ch, start, length); 154 } 155 156 @Override 157 public void ignorableWhitespace(char[] ch, int start, int length) 158 throws SAXException { 159 advance(length); 160 super.ignorableWhitespace(ch, start, length); 161 } 162 163 /** 164 * Private exception class used to indicate a suspected zip bomb. 165 * 166 * @see SecureContentHandler#throwIfCauseOf(SAXException) 167 */ 168 private class SecureSAXException extends SAXException { 169 170 public SecureSAXException() { 171 super("Suspected zip bomb: " 172 + stream.getByteCount() + " input bytes produced " 173 + characterCount + " output characters"); 174 } 175 176 public boolean isCausedBy(SecureContentHandler handler) { 177 return SecureContentHandler.this == handler; 178 } 179 180 } 181 182 }