View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.html;
18  
19  import java.io.BufferedInputStream;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.InputStreamReader;
23  import java.nio.charset.Charset;
24  import java.util.Arrays;
25  import java.util.Collections;
26  import java.util.HashSet;
27  import java.util.Set;
28  import java.util.regex.Matcher;
29  import java.util.regex.Pattern;
30  
31  import org.apache.tika.exception.TikaException;
32  import org.apache.tika.io.CloseShieldInputStream;
33  import org.apache.tika.metadata.Metadata;
34  import org.apache.tika.mime.MediaType;
35  import org.apache.tika.parser.ParseContext;
36  import org.apache.tika.parser.Parser;
37  import org.apache.tika.parser.txt.CharsetDetector;
38  import org.apache.tika.parser.txt.CharsetMatch;
39  import org.xml.sax.ContentHandler;
40  import org.xml.sax.InputSource;
41  import org.xml.sax.SAXException;
42  
43  /**
44   * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
45   * and post-processes the events to produce XHTML and metadata expected by
46   * Tika clients.
47   */
48  public class HtmlParser implements Parser {
49  
50      private static final Set<MediaType> SUPPORTED_TYPES =
51          Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
52                  MediaType.text("html"),
53                  MediaType.application("xhtml+xml"),
54                  MediaType.application("vnd.wap.xhtml+xml"),
55                  MediaType.application("x-asp"))));
56  
57      /**
58       * The default HTML mapping.
59       */
60      private static final HtmlMapper mapper = new DefaultHtmlMapper();
61  
62      // Use the widest, most common charset as our default.
63      private static final String DEFAULT_CHARSET = "windows-1252";
64      // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
65      private static final int META_TAG_BUFFER_SIZE = 8192;
66      private static final Pattern HTTP_EQUIV_PATTERN = Pattern.compile(
67                      "(?is)<meta\\s+http-equiv\\s*=\\s*['\\\"]\\s*" +
68                      "Content-Type['\\\"]\\s+content\\s*=\\s*['\\\"]" +
69                      "([^'\\\"]+)['\\\"]");
70  
71      public Set<MediaType> getSupportedTypes(ParseContext context) {
72          return SUPPORTED_TYPES;
73      }
74  
75      /**
76       * TIKA-332: Check for meta http-equiv tag with charset info in
77       * HTML content.
78       * <p>
79       * TODO: Move this into core, along with CharsetDetector
80       */ 
81      private String getEncoding(InputStream stream, Metadata metadata) throws IOException {
82          stream.mark(META_TAG_BUFFER_SIZE);
83          char[] buffer = new char[META_TAG_BUFFER_SIZE];
84          InputStreamReader isr = new InputStreamReader(stream, "us-ascii");
85          int bufferSize = isr.read(buffer);
86          stream.reset();
87  
88          if (bufferSize != -1) {
89              String metaString = new String(buffer, 0, bufferSize);
90              Matcher m = HTTP_EQUIV_PATTERN.matcher(metaString);
91              if (m.find()) {
92                  // TIKA-349: flexible handling of attributes
93                  // We have one or more x or x=y attributes, separated by ';'
94                  String[] attrs = m.group(1).split(";");
95                  for (String attr : attrs) {
96                      String[] keyValue = attr.trim().split("=");
97                      if ((keyValue.length == 2) && keyValue[0].equalsIgnoreCase("charset")) {
98                          String charset = keyValue[1];
99                          if (Charset.isSupported(charset)) {
100                             metadata.set(Metadata.CONTENT_ENCODING, charset);
101                             return charset;
102                         }
103                     }
104                 }
105             }
106         }
107 
108         // No charset in a meta http-equiv tag, see if it's in the passed content-encoding
109         // hint, or the passed content-type hint.
110         CharsetDetector detector = new CharsetDetector();
111         String incomingCharset = metadata.get(Metadata.CONTENT_ENCODING);
112         String incomingType = metadata.get(Metadata.CONTENT_TYPE);
113         if (incomingCharset == null && incomingType != null) {
114             // TIKA-341: Use charset in content-type
115             MediaType mt = MediaType.parse(incomingType);
116             if (mt != null) {
117                 String charset = mt.getParameters().get("charset");
118                 if ((charset != null) && Charset.isSupported(charset)) {
119                     incomingCharset = charset;
120                 }
121             }
122         }
123 
124         if (incomingCharset != null) {
125             detector.setDeclaredEncoding(incomingCharset);
126         }
127 
128         // TIKA-341 without enabling input filtering (stripping of tags) the
129         // short HTML tests don't work well.
130         detector.enableInputFilter(true);
131         detector.setText(stream);
132         for (CharsetMatch match : detector.detectAll()) {
133             if (Charset.isSupported(match.getName())) {
134                 metadata.set(Metadata.CONTENT_ENCODING, match.getName());
135 
136                 // TIKA-339: Don't set language, as it's typically not a very good
137                 // guess, and it can create ambiguity if another (better) language
138                 // value is specified by a meta tag in the HTML (or via HTTP response
139                 // header).
140                 /*
141                 String language = match.getLanguage();
142                 if (language != null) {
143                     metadata.set(Metadata.CONTENT_LANGUAGE, match.getLanguage());
144                     metadata.set(Metadata.LANGUAGE, match.getLanguage());
145                 }
146                 */
147                 
148                 break;
149             }
150         }
151 
152         String encoding = metadata.get(Metadata.CONTENT_ENCODING);
153         if (encoding == null) {
154             if (Charset.isSupported(DEFAULT_CHARSET)) {
155                 encoding = DEFAULT_CHARSET;
156             } else {
157                 encoding = Charset.defaultCharset().name();
158             }
159             
160             metadata.set(Metadata.CONTENT_ENCODING, encoding);
161         }
162 
163         return encoding;
164     }
165 
166     public void parse(
167             InputStream stream, ContentHandler handler,
168             Metadata metadata, ParseContext context)
169             throws IOException, SAXException, TikaException {
170         // The getEncoding() method depends on the mark feature
171         if (!stream.markSupported()) {
172             stream = new BufferedInputStream(stream);
173         }
174 
175         // Protect the stream from being closed by CyberNeko
176         // TODO: Is this still needed, given our use of TagSoup?
177         stream = new CloseShieldInputStream(stream);
178 
179         // Prepare the input source using the encoding hint if available
180         InputSource source = new InputSource(stream); 
181         source.setEncoding(getEncoding(stream, metadata));
182 
183         // Get the HTML mapper from the parse context
184         HtmlMapper mapper =
185             context.get(HtmlMapper.class, new HtmlParserMapper());
186 
187         // Parse the HTML document
188         org.ccil.cowan.tagsoup.Parser parser =
189             new org.ccil.cowan.tagsoup.Parser();
190         parser.setContentHandler(new XHTMLDowngradeHandler(
191                 new HtmlHandler(mapper, handler, metadata)));
192         parser.parse(source);
193     }
194 
195     /**
196      * @deprecated This method will be removed in Apache Tika 1.0.
197      */
198     public void parse(
199             InputStream stream, ContentHandler handler, Metadata metadata)
200     throws IOException, SAXException, TikaException {
201         parse(stream, handler, metadata, new ParseContext());
202     }
203 
204     /**
205      * Maps "safe" HTML element names to semantic XHTML equivalents. If the
206      * given element is unknown or deemed unsafe for inclusion in the parse
207      * output, then this method returns <code>null</code> and the element
208      * will be ignored but the content inside it is still processed. See
209      * the {@link #isDiscardElement(String)} method for a way to discard
210      * the entire contents of an element.
211      * <p>
212      * Subclasses can override this method to customize the default mapping.
213      *
214      * @deprecated Use the {@link HtmlMapper} mechanism to customize
215      *             the HTML mapping. This method will be removed in Tika 1.0.
216      * @since Apache Tika 0.5
217      * @param name HTML element name (upper case)
218      * @return XHTML element name (lower case), or
219      *         <code>null</code> if the element is unsafe 
220      */
221     protected String mapSafeElement(String name) {
222         return mapper.mapSafeElement(name);
223     }
224 
225     /**
226      * Checks whether all content within the given HTML element should be
227      * discarded instead of including it in the parse output. Subclasses
228      * can override this method to customize the set of discarded elements.
229      *
230      * @deprecated Use the {@link HtmlMapper} mechanism to customize
231      *             the HTML mapping. This method will be removed in Tika 1.0.
232      * @since Apache Tika 0.5
233      * @param name HTML element name (upper case)
234      * @return <code>true</code> if content inside the named element
235      *         should be ignored, <code>false</code> otherwise
236      */
237     protected boolean isDiscardElement(String name) {
238         return "STYLE".equals(name) || "SCRIPT".equals(name);
239     }
240 
241     /**
242      * Adapter class that maintains backwards compatibility with the
243      * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
244      * directly would require those methods to be public, which would break
245      * backwards compatibility with subclasses.
246      *
247      * @deprecated Use the {@link HtmlMapper} mechanism to customize
248      *             the HTML mapping. This class will be removed in Tika 1.0.
249      */
250     private class HtmlParserMapper implements HtmlMapper {
251         public String mapSafeElement(String name) {
252             return HtmlParser.this.mapSafeElement(name);
253         }
254         public boolean isDiscardElement(String name) {
255             return HtmlParser.this.isDiscardElement(name);
256         }
257     }
258 
259 }