View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.sax;
18  
19  import java.io.OutputStream;
20  import java.io.Writer;
21  
22  import org.apache.tika.sax.xpath.Matcher;
23  import org.apache.tika.sax.xpath.MatchingContentHandler;
24  import org.apache.tika.sax.xpath.XPathParser;
25  import org.xml.sax.ContentHandler;
26  import org.xml.sax.SAXException;
27  
28  /**
29   * Content handler decorator that only passes everything inside
30   * the XHTML &lt;body/&gt; tag to the underlying handler. Note that
31   * the &lt;body/&gt; tag itself is <em>not</em> passed on.
32   */
33  public class BodyContentHandler extends ContentHandlerDecorator {
34  
35      /**
36       * XPath parser built from the {@code xhtml} prefix and {@link XHTMLContentHandler#XHTML}.
37       */
38      private static final XPathParser PARSER =
39          new XPathParser("xhtml", XHTMLContentHandler.XHTML);
40  
41      /**
42       * Shared XPath matcher that selects the descendant nodes of the XHTML body.
43       */
44      private static final Matcher MATCHER =
45          PARSER.parse("/xhtml:html/xhtml:body/descendant:node()");
46  
47      /**
48       * Wraps the given handler in a {@link MatchingContentHandler} so that only
49       * the XHTML body events selected by the matcher are passed on to it.
50       *
51       * @param handler content handler that receives the body events
52       */
53      public BodyContentHandler(ContentHandler handler) {
54          super(new MatchingContentHandler(handler, MATCHER));
55      }
56  
57      /**
58       * Creates a content handler that writes XHTML body character events to
59       * the given writer.
60       *
61       * @param writer writer that receives the character events
62       */
63      public BodyContentHandler(Writer writer) {
64          this(new WriteOutContentHandler(writer));
65      }
66  
67      /**
68       * Creates a content handler that writes XHTML body character events to
69       * the given output stream using the default encoding.
70       *
71       * @param stream output stream that receives the character events
72       */
73      public BodyContentHandler(OutputStream stream) {
74          this(new WriteOutContentHandler(stream));
75      }
76  
77      /**
78       * Creates a content handler that writes XHTML body character events to
79       * an internal string buffer. The contents of the buffer can be retrieved
80       * using the {@link #toString()} method.
81       * <p>
82       * The internal string buffer is bounded at the given number of characters.
83       * If this write limit is reached, then a {@link SAXException} is thrown.
84       *
85       * @param writeLimit maximum number of characters to include in the string,
86       *                   or -1 to disable the write limit
87       * @since Apache Tika 0.7
88       */
89      public BodyContentHandler(int writeLimit) {
90          this(new WriteOutContentHandler(writeLimit));
91      }
92  
93      /**
94       * Creates a content handler that writes XHTML body character events to
95       * an internal string buffer. The contents of the buffer can be retrieved
96       * using the {@link #toString()} method.
97       * <p>
98       * The internal string buffer is bounded at 100k characters. If this write
99       * limit is reached, then a {@link SAXException} is thrown.
100      */
101     public BodyContentHandler() {
102         this(new WriteOutContentHandler());
103     }
104 
105 }