View Javadoc

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  package org.apache.tika.sax;
18  
19  import org.xml.sax.ContentHandler;
20  import org.xml.sax.SAXException;
21  
22  /**
23   * Content handler decorator that makes sure that the character events
24   * ({@link #characters(char[], int, int)} or
25   * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
26   * content handler contain only valid XML characters. All invalid characters
27   * are replaced with spaces.
28   * <p>
29   * The XML standard defines the following Unicode character ranges as
30   * valid XML characters:
31   * <pre>
32   * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
33   * </pre>
34   * <p>
35   * Note that currently this class only detects those invalid characters whose
36   * UTF-16 representation fits a single char. Also, this class does not ensure
37   * that the UTF-16 encoding of incoming characters is correct.
38   */
39  public class SafeContentHandler extends ContentHandlerDecorator {
40  
41      /**
42       * Replacement for invalid characters.
43       */
44      private static final char[] REPLACEMENT = new char[] { ' ' };
45  
46      /**
47       * Internal interface that allows both character and
48       * ignorable whitespace content to be filtered the same way.
49       */
50      protected interface Output {
51          void write(char[] ch, int start, int length) throws SAXException;
52      }
53  
54      /**
55       * Output through the {@link ContentHandler#characters(char[], int, int)}
56       * method of the decorated content handler.
57       */
58      private final Output charactersOutput = new Output() {
59          public void write(char[] ch, int start, int length)
60                  throws SAXException {
61              SafeContentHandler.super.characters(ch, start, length);
62          }
63      };
64  
65      /**
66       * Output through the
67       * {@link ContentHandler#ignorableWhitespace(char[], int, int)}
68       * method of the decorated content handler.
69       */
70      private final Output ignorableWhitespaceOutput = new Output() {
71          public void write(char[] ch, int start, int length)
72                  throws SAXException {
73              SafeContentHandler.super.ignorableWhitespace(ch, start, length);
74          }
75      };
76  
77      public SafeContentHandler(ContentHandler handler) {
78          super(handler);
79      }
80  
81      /**
82       * Filters and outputs the contents of the given input buffer. Any
83       * invalid characters in the input buffer area handled by sending a
84       * replacement (a space character) to the given output. Any sequences
85       * of valid characters are passed as-is to the given output. 
86       * 
87       * @param ch input buffer
88       * @param start start offset within the buffer
89       * @param length number of characters to read from the buffer
90       * @param output output channel
91       * @throws SAXException if the filtered characters could not be written out
92       */
93      private void filter(char[] ch, int start, int length, Output output)
94              throws SAXException {
95          int end = start + length;
96  
97          for (int i = start; i < end; i++) {
98              if (isInvalid(ch[i])) {
99                  // Output any preceding valid characters
100                 if (i > start) {
101                     output.write(ch, start, i - start);
102                 }
103 
104                 // Output the replacement for this invalid character
105                 writeReplacement(output);
106 
107                 // Continue with the rest of the array
108                 start = i + 1;
109             }
110         }
111 
112         // Output any remaining valid characters
113         output.write(ch, start, end - start);
114     }
115 
116     /**
117      * Checks whether the given character (more accurately a UTF-16 code unit)
118      * is an invalid XML character and should be replaced for output.
119      * Subclasses can override this method to use an alternative definition
120      * of which characters should be replaced in the XML output.
121      *
122      * @param ch character
123      * @return <code>true</code> if the character should be replaced,
124      *         <code>false</code> otherwise
125      */
126     protected boolean isInvalid(char ch) {
127         // TODO: Correct handling of multi-word characters
128         if (ch < 0x20) {
129             return ch != 0x09 && ch != 0x0A && ch != 0x0D;
130         } else {
131             return ch >= 0xFFFE;
132         }
133     }
134 
135     /**
136      * Outputs the replacement for an invalid character. Subclasses can
137      * override this method to use a custom replacement.
138      *
139      * @param output where the replacement is written to
140      * @throws SAXException if the replacement could not be written
141      */
142     protected void writeReplacement(Output output) throws SAXException {
143         output.write(REPLACEMENT, 0, REPLACEMENT.length);
144     }
145 
146     //------------------------------------------------------< ContentHandler >
147 
148     @Override
149     public void characters(char[] ch, int start, int length)
150             throws SAXException {
151         filter(ch, start, length, charactersOutput);
152     }
153 
154     @Override
155     public void ignorableWhitespace(char[] ch, int start, int length)
156             throws SAXException {
157         filter(ch, start, length, ignorableWhitespaceOutput);
158     }
159 
160 }