1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.tika.sax; 18 19 import org.xml.sax.ContentHandler; 20 import org.xml.sax.SAXException; 21 22 /** 23 * Content handler decorator that makes sure that the character events 24 * ({@link #characters(char[], int, int)} or 25 * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated 26 * content handler contain only valid XML characters. All invalid characters 27 * are replaced with spaces. 28 * <p> 29 * The XML standard defines the following Unicode character ranges as 30 * valid XML characters: 31 * <pre> 32 * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] 33 * </pre> 34 * <p> 35 * Note that currently this class only detects those invalid characters whose 36 * UTF-16 representation fits a single char. Also, this class does not ensure 37 * that the UTF-16 encoding of incoming characters is correct. 38 */ 39 public class SafeContentHandler extends ContentHandlerDecorator { 40 41 /** 42 * Replacement for invalid characters. 43 */ 44 private static final char[] REPLACEMENT = new char[] { ' ' }; 45 46 /** 47 * Internal interface that allows both character and 48 * ignorable whitespace content to be filtered the same way. 49 */ 50 protected interface Output { 51 void write(char[] ch, int start, int length) throws SAXException; 52 } 53 54 /** 55 * Output through the {@link ContentHandler#characters(char[], int, int)} 56 * method of the decorated content handler. 57 */ 58 private final Output charactersOutput = new Output() { 59 public void write(char[] ch, int start, int length) 60 throws SAXException { 61 SafeContentHandler.super.characters(ch, start, length); 62 } 63 }; 64 65 /** 66 * Output through the 67 * {@link ContentHandler#ignorableWhitespace(char[], int, int)} 68 * method of the decorated content handler. 69 */ 70 private final Output ignorableWhitespaceOutput = new Output() { 71 public void write(char[] ch, int start, int length) 72 throws SAXException { 73 SafeContentHandler.super.ignorableWhitespace(ch, start, length); 74 } 75 }; 76 77 public SafeContentHandler(ContentHandler handler) { 78 super(handler); 79 } 80 81 /** 82 * Filters and outputs the contents of the given input buffer. Any 83 * invalid characters in the input buffer area handled by sending a 84 * replacement (a space character) to the given output. Any sequences 85 * of valid characters are passed as-is to the given output. 86 * 87 * @param ch input buffer 88 * @param start start offset within the buffer 89 * @param length number of characters to read from the buffer 90 * @param output output channel 91 * @throws SAXException if the filtered characters could not be written out 92 */ 93 private void filter(char[] ch, int start, int length, Output output) 94 throws SAXException { 95 int end = start + length; 96 97 for (int i = start; i < end; i++) { 98 if (isInvalid(ch[i])) { 99 // Output any preceding valid characters 100 if (i > start) { 101 output.write(ch, start, i - start); 102 } 103 104 // Output the replacement for this invalid character 105 writeReplacement(output); 106 107 // Continue with the rest of the array 108 start = i + 1; 109 } 110 } 111 112 // Output any remaining valid characters 113 output.write(ch, start, end - start); 114 } 115 116 /** 117 * Checks whether the given character (more accurately a UTF-16 code unit) 118 * is an invalid XML character and should be replaced for output. 119 * Subclasses can override this method to use an alternative definition 120 * of which characters should be replaced in the XML output. 121 * 122 * @param ch character 123 * @return <code>true</code> if the character should be replaced, 124 * <code>false</code> otherwise 125 */ 126 protected boolean isInvalid(char ch) { 127 // TODO: Correct handling of multi-word characters 128 if (ch < 0x20) { 129 return ch != 0x09 && ch != 0x0A && ch != 0x0D; 130 } else { 131 return ch >= 0xFFFE; 132 } 133 } 134 135 /** 136 * Outputs the replacement for an invalid character. Subclasses can 137 * override this method to use a custom replacement. 138 * 139 * @param output where the replacement is written to 140 * @throws SAXException if the replacement could not be written 141 */ 142 protected void writeReplacement(Output output) throws SAXException { 143 output.write(REPLACEMENT, 0, REPLACEMENT.length); 144 } 145 146 //------------------------------------------------------< ContentHandler > 147 148 @Override 149 public void characters(char[] ch, int start, int length) 150 throws SAXException { 151 filter(ch, start, length, charactersOutput); 152 } 153 154 @Override 155 public void ignorableWhitespace(char[] ch, int start, int length) 156 throws SAXException { 157 filter(ch, start, length, ignorableWhitespaceOutput); 158 } 159 160 }