1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.tika.parser.html; 18 19 /** 20 * HTML mapper used to make incoming HTML documents easier to handle by 21 * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from 22 * the parse context and uses it to map parsed HTML to "safe" XHTML. A client 23 * that wants to customize this mapping can place a custom HtmlMapper instance 24 * into the parse context. 25 * 26 * @since Apache Tika 0.6 27 */ 28 public interface HtmlMapper { 29 30 /** 31 * Maps "safe" HTML element names to semantic XHTML equivalents. If the 32 * given element is unknown or deemed unsafe for inclusion in the parse 33 * output, then this method returns <code>null</code> and the element 34 * will be ignored but the content inside it is still processed. See 35 * the {@link #isDiscardElement(String)} method for a way to discard 36 * the entire contents of an element. 37 * 38 * @param name HTML element name (upper case) 39 * @return XHTML element name (lower case), or 40 * <code>null</code> if the element is unsafe 41 */ 42 String mapSafeElement(String name); 43 44 /** 45 * Checks whether all content within the given HTML element should be 46 * discarded instead of including it in the parse output. 47 * 48 * @param name HTML element name (upper case) 49 * @return <code>true</code> if content inside the named element 50 * should be ignored, <code>false</code> otherwise 51 */ 52 boolean isDiscardElement(String name); 53 54 }