View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.sax;
18  
19  import java.io.IOException;
20  import java.io.OutputStream;
21  import java.io.OutputStreamWriter;
22  import java.io.StringWriter;
23  import java.io.Writer;
24  
25  import org.xml.sax.SAXException;
26  import org.xml.sax.helpers.DefaultHandler;
27  
/**
 * SAX event handler that writes all character content out to
 * a {@link Writer} character stream.
 * <p>
 * The handler can optionally be bounded by a write limit. Once the limit
 * is reached, the characters that still fit are written and a
 * {@link SAXException} is thrown to abort further processing; use
 * {@link #isWriteLimitReached(Throwable)} to recognize that exception.
 * <p>
 * Instances keep a running character count without any synchronization.
 */
public class WriteOutContentHandler extends DefaultHandler {

    /**
     * The character stream.
     */
    private final Writer writer;

    /**
     * The maximum number of characters to write to the character stream.
     * Any negative value (conventionally -1) means no limit.
     */
    private final int writeLimit;

    /**
     * Number of characters written so far. When a limit is in effect this
     * never exceeds {@link #writeLimit}.
     */
    private int writeCount = 0;

    private WriteOutContentHandler(Writer writer, int writeLimit) {
        this.writer = writer;
        this.writeLimit = writeLimit;
    }

    /**
     * Creates a content handler that writes character events to
     * the given writer. No write limit is applied.
     *
     * @param writer writer
     */
    public WriteOutContentHandler(Writer writer) {
        this(writer, -1);
    }

    /**
     * Creates a content handler that writes character events to
     * the given output stream using the default encoding.
     * No write limit is applied.
     *
     * @param stream output stream
     */
    public WriteOutContentHandler(OutputStream stream) {
        this(new OutputStreamWriter(stream));
    }

    /**
     * Creates a content handler that writes character events
     * to an internal string buffer. Use the {@link #toString()}
     * method to access the collected character content.
     * <p>
     * The internal string buffer is bounded at the given number of characters.
     * If this write limit is reached, then a {@link SAXException} is thrown.
     * The {@link #isWriteLimitReached(Throwable)} method can be used to
     * detect this case.
     *
     * @since Apache Tika 0.7
     * @param writeLimit maximum number of characters to include in the string,
     *                   or -1 (any negative value) to disable the write limit
     */
    public WriteOutContentHandler(int writeLimit) {
        this(new StringWriter(), writeLimit);
    }

    /**
     * Creates a content handler that writes character events
     * to an internal string buffer. Use the {@link #toString()}
     * method to access the collected character content.
     * <p>
     * The internal string buffer is bounded at 100k characters. If this
     * write limit is reached, then a {@link SAXException} is thrown. The
     * {@link #isWriteLimitReached(Throwable)} method can be used to detect
     * this case.
     */
    public WriteOutContentHandler() {
        this(100 * 1000);
    }

    /**
     * Writes the given characters to the character stream.
     * <p>
     * If writing all of them would exceed the write limit, only the
     * characters that still fit are written and an exception recognizable
     * through {@link #isWriteLimitReached(Throwable)} is thrown.
     *
     * @param ch     character buffer
     * @param start  offset of the first character to write
     * @param length number of characters to write
     * @throws SAXException if the write limit is reached or the
     *                      underlying stream can not be written to
     */
    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        try {
            // Compare against the remaining budget rather than computing
            // writeCount + length, which could overflow int for large limits.
            if (writeLimit < 0 || length <= writeLimit - writeCount) {
                writer.write(ch, start, length);
                writeCount += length;
            } else {
                // Emit the part that still fits before signalling the limit.
                writer.write(ch, start, writeLimit - writeCount);
                writeCount = writeLimit;
                throw new WriteLimitReachedException(
                        this,
                        "Write limit of " + writeLimit
                        + " characters has been reached");
            }
        } catch (IOException e) {
            throw new SAXException("Error writing out character content", e);
        }
    }


    /**
     * Writes the given ignorable characters to the character stream.
     * They count towards the write limit like any other characters.
     *
     * @param ch     character buffer
     * @param start  offset of the first character to write
     * @param length number of characters to write
     * @throws SAXException if the write limit is reached or the
     *                      underlying stream can not be written to
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        characters(ch, start, length);
    }

    /**
     * Flushes the character stream so that no characters are forgotten
     * in internal buffers.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-179">TIKA-179</a>
     * @throws SAXException if the stream can not be flushed
     */
    @Override
    public void endDocument() throws SAXException {
        try {
            writer.flush();
        } catch (IOException e) {
            throw new SAXException("Error flushing character output", e);
        }
    }

    /**
     * Returns the contents of the internal string buffer where
     * all the received characters have been collected. Only meaningful
     * when this object was constructed using the empty default
     * constructor, the write limit constructor, or by passing a
     * {@link StringWriter} to the writer constructor; otherwise the
     * result is whatever the underlying writer's {@code toString()} returns.
     */
    @Override
    public String toString() {
        return writer.toString();
    }

    /**
     * Checks whether the given exception (or any of its root causes) was
     * thrown by this handler as a signal of reaching the write limit.
     * <p>
     * The first write limit exception found in the cause chain decides the
     * result: it must originate from this very handler instance.
     *
     * @since Apache Tika 0.7
     * @param t throwable, may be <code>null</code>
     * @return <code>true</code> if the write limit was reached,
     *         <code>false</code> otherwise
     */
    public boolean isWriteLimitReached(Throwable t) {
        for (Throwable current = t; current != null; current = current.getCause()) {
            if (current instanceof WriteLimitReachedException) {
                return this == ((WriteLimitReachedException) current).getSource();
            }
        }
        return false;
    }

    /**
     * The exception used as a signal when the write limit has been reached.
     * <p>
     * This is a static nested class with a transient reference to the
     * originating handler, so that the exception stays serializable like
     * any other {@link SAXException} even though the handler itself is not.
     * After deserialization the source is <code>null</code> and the
     * exception no longer matches any handler.
     */
    private static final class WriteLimitReachedException extends SAXException {

        private static final long serialVersionUID = 1L;

        /** The handler whose write limit was reached. */
        private final transient WriteOutContentHandler source;

        WriteLimitReachedException(WriteOutContentHandler source, String message) {
            super(message);
            this.source = source;
        }

        WriteOutContentHandler getSource() {
            return source;
        }

    }

}