/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.sax;

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.Writer;
import java.util.UUID;

import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX event handler that writes all character content out to
 * a {@link Writer} character stream.
 * <p>
 * An optional write limit bounds the number of characters that are
 * written. When the limit is hit, the characters that still fit are
 * written and a {@link SAXException} is thrown; use
 * {@link #isWriteLimitReached(Throwable)} to recognize that exception.
 * <p>
 * Instances keep a running character count without any synchronization.
 */
public class WriteOutContentHandler extends DefaultHandler {

    /**
     * Write limit value that disables the limit.
     */
    private static final int NO_LIMIT = -1;

    /**
     * Write limit used by the no-argument constructor (100k characters).
     */
    private static final int DEFAULT_WRITE_LIMIT = 100 * 1000;

    /**
     * Unique tag of this handler instance. It is copied into every write
     * limit exception thrown by this handler so that the exception can be
     * matched back to the handler without holding a reference to it.
     */
    private final Serializable tag = UUID.randomUUID();

    /**
     * The character stream.
     */
    private final Writer writer;

    /**
     * The maximum number of characters to write to the character stream.
     * Set to -1 for no limit.
     */
    private final int writeLimit;

    /**
     * Number of characters written so far.
     */
    private int writeCount = 0;

    /**
     * Creates a content handler that writes to the given writer and
     * stops after the given number of characters.
     *
     * @param writer writer that receives the character content
     * @param writeLimit maximum number of characters to write,
     *                   or -1 to disable the write limit
     */
    private WriteOutContentHandler(Writer writer, int writeLimit) {
        this.writer = writer;
        this.writeLimit = writeLimit;
    }

    /**
     * Creates a content handler that writes character events to
     * the given writer. No write limit is applied.
     *
     * @param writer writer
     */
    public WriteOutContentHandler(Writer writer) {
        this(writer, NO_LIMIT);
    }

    /**
     * Creates a content handler that writes character events to
     * the given output stream using the platform default encoding.
     * No write limit is applied.
     *
     * @param stream output stream
     */
    public WriteOutContentHandler(OutputStream stream) {
        this(new OutputStreamWriter(stream));
    }

    /**
     * Creates a content handler that writes character events
     * to an internal string buffer. Use the {@link #toString()}
     * method to access the collected character content.
     * <p>
     * The internal string buffer is bounded at the given number of characters.
     * If this write limit is reached, then a {@link SAXException} is thrown.
     * The {@link #isWriteLimitReached(Throwable)} method can be used to
     * detect this case.
     *
     * @since Apache Tika 0.7
     * @param writeLimit maximum number of characters to include in the string,
     *                   or -1 to disable the write limit
     */
    public WriteOutContentHandler(int writeLimit) {
        this(new StringWriter(), writeLimit);
    }

    /**
     * Creates a content handler that writes character events
     * to an internal string buffer. Use the {@link #toString()}
     * method to access the collected character content.
     * <p>
     * The internal string buffer is bounded at 100k characters. If this
     * write limit is reached, then a {@link SAXException} is thrown. The
     * {@link #isWriteLimitReached(Throwable)} method can be used to detect
     * this case.
     */
    public WriteOutContentHandler() {
        this(DEFAULT_WRITE_LIMIT);
    }

    /**
     * Writes the given characters to the character stream.
     * <p>
     * If writing all of them would exceed the write limit, only the
     * characters that still fit are written and an exception is thrown.
     *
     * @param ch array holding the characters
     * @param start index of the first character to write
     * @param length number of characters to write
     * @throws SAXException if the write limit is reached or the
     *                      underlying writer fails
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        try {
            // Compare against the remaining capacity instead of summing
            // writeCount + length, which could overflow an int.
            if (writeLimit == NO_LIMIT || length <= writeLimit - writeCount) {
                writer.write(ch, start, length);
                writeCount += length;
            } else {
                // Emit the part that still fits, then signal the limit.
                writer.write(ch, start, writeLimit - writeCount);
                writeCount = writeLimit;
                throw new WriteLimitReachedException(
                        "Write limit of " + writeLimit
                        + " characters reached; the content has been"
                        + " truncated at the limit",
                        tag);
            }
        } catch (IOException e) {
            throw new SAXException("Error writing out character content", e);
        }
    }

    /**
     * Writes the given ignorable characters to the character stream.
     * They are handled exactly like regular character content, so they
     * count towards the write limit as well.
     *
     * @param ch array holding the characters
     * @param start index of the first character to write
     * @param length number of characters to write
     * @throws SAXException if the write limit is reached or the
     *                      underlying writer fails
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        characters(ch, start, length);
    }

    /**
     * Flushes the character stream so that no characters are forgotten
     * in internal buffers.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-179">TIKA-179</a>
     * @throws SAXException if the stream can not be flushed
     */
    @Override
    public void endDocument() throws SAXException {
        try {
            writer.flush();
        } catch (IOException e) {
            throw new SAXException("Error flushing character output", e);
        }
    }

    /**
     * Returns the contents of the internal string buffer where
     * all the received characters have been collected. Only works
     * when this object was constructed using the empty default
     * constructor or by passing a {@link StringWriter} to the
     * other constructor. In all cases the result is whatever the
     * underlying writer's own {@code toString()} returns.
     *
     * @return string representation of the underlying writer
     */
    @Override
    public String toString() {
        return writer.toString();
    }

    /**
     * Checks whether the given exception (or any of its root causes) was
     * thrown by this handler as a signal of reaching the write limit.
     *
     * @since Apache Tika 0.7
     * @param t throwable, may be <code>null</code>
     * @return <code>true</code> if the write limit was reached,
     *         <code>false</code> otherwise
     */
    public boolean isWriteLimitReached(Throwable t) {
        for (Throwable cause = t; cause != null; cause = cause.getCause()) {
            if (cause instanceof WriteLimitReachedException) {
                // The first write limit exception in the chain decides;
                // it belongs to this handler only if the tags match.
                return tag.equals(((WriteLimitReachedException) cause).tag);
            }
        }
        return false;
    }

    /**
     * The exception used as a signal when the write limit has been reached.
     * It is a static nested class that carries only the serializable tag of
     * the handler that threw it, so the exception does not keep the handler
     * (and its writer) reachable and can be serialized.
     */
    private static class WriteLimitReachedException extends SAXException {

        private static final long serialVersionUID = 1L;

        /**
         * Tag of the handler that threw this exception.
         */
        private final Serializable tag;

        WriteLimitReachedException(String message, Serializable tag) {
            super(message);
            this.tag = tag;
        }

    }

}