/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.html;

/**
 * HTML mapper used to make incoming HTML documents easier to handle by
 * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
 * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
 * that wants to customize this mapping can place a custom HtmlMapper instance
 * into the parse context.
 *
 * @since Apache Tika 0.6
 */
public interface HtmlMapper {

    /**
     * Maps a "safe" HTML element name to its semantic XHTML equivalent.
     * <p>
     * If the given element is unknown or deemed unsafe for inclusion in the
     * parse output, this method returns {@code null}; the element itself is
     * then ignored, but the content inside it is still processed. See
     * {@link #isDiscardElement(String)} for a way to discard the entire
     * contents of an element.
     *
     * @param name HTML element name (upper case)
     * @return XHTML element name (lower case), or {@code null} if the
     *         element is unknown or unsafe
     */
    String mapSafeElement(String name);

    /**
     * Checks whether all content within the given HTML element should be
     * discarded instead of being included in the parse output.
     *
     * @param name HTML element name (upper case)
     * @return {@code true} if content inside the named element should be
     *         ignored, {@code false} otherwise
     */
    boolean isDiscardElement(String name);

}