View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.tika.parser.html;
18  
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Locale;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
28  
29  class HtmlHandler extends TextContentHandler {
30  
31      private final HtmlMapper mapper;
32  
33      private final XHTMLContentHandler xhtml;
34  
35      private final Metadata metadata;
36  
37      private int bodyLevel = 0;
38  
39      private int discardLevel = 0;
40  
41      private int titleLevel = 0;
42  
43      private final StringBuilder title = new StringBuilder();
44  
45      private HtmlHandler(
46              HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
47          super(xhtml);
48          this.mapper = mapper;
49          this.xhtml = xhtml;
50          this.metadata = metadata;
51  
52          // Try to determine the default base URL, if one has not been given
53          if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
54              String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
55              if (name != null) {
56                  name = name.trim();
57                  try {
58                      new URL(name); // test URL format
59                      metadata.set(Metadata.CONTENT_LOCATION, name);
60                  } catch (MalformedURLException e) {
61                      // The resource name is not a valid URL, ignore it
62                  }
63              }
64          }
65      }
66  
67      public HtmlHandler(
68              HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
69          this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
70      }
71  
72      @Override
73      public void startElement(
74              String uri, String local, String name, Attributes atts)
75              throws SAXException {
76          if ("TITLE".equals(name) || titleLevel > 0) {
77              titleLevel++;
78          }
79          if ("BODY".equals(name) || bodyLevel > 0) {
80              bodyLevel++;
81          }
82          if (mapper.isDiscardElement(name) || discardLevel > 0) {
83              discardLevel++;
84          }
85  
86          if (bodyLevel == 0 && discardLevel == 0) {
87              if ("META".equals(name) && atts.getValue("content") != null) {
88                  if (atts.getValue("http-equiv") != null) {
89                      metadata.set(
90                              atts.getValue("http-equiv"),
91                              atts.getValue("content"));
92                  }
93                  if (atts.getValue("name") != null) {
94                      metadata.set(
95                              atts.getValue("name"),
96                              atts.getValue("content"));
97                  }
98              } else if ("BASE".equals(name) && atts.getValue("href") != null) {
99                  metadata.set(
100                         Metadata.CONTENT_LOCATION,
101                         resolve(atts.getValue("href").trim()));
102             }
103         }
104 
105         if (bodyLevel > 0 && discardLevel == 0) {
106             String safe = mapper.mapSafeElement(name);
107             if (safe != null) {
108                 xhtml.startElement(safe);
109             } else if ("A".equals(name)) {
110                 String href = atts.getValue("href");
111                 if (href != null) {
112                     xhtml.startElement("a", "href", resolve(href.trim()));
113                 } else {
114                     String anchor = atts.getValue("name");
115                     if (anchor != null) {
116                         xhtml.startElement("a", "name", anchor.trim());
117                     } else {
118                         xhtml.startElement("a");
119                     }
120                 }
121             }
122         }
123 
124         title.setLength(0);
125     }
126 
127     @Override
128     public void endElement(
129             String uri, String local, String name) throws SAXException {
130         if (bodyLevel > 0 && discardLevel == 0) {
131             String safe = mapper.mapSafeElement(name);
132             if (safe != null) {
133                 xhtml.endElement(safe);
134             } else if ("A".equals(name)) {
135                 xhtml.endElement("a");
136             } else if (XHTMLContentHandler.ENDLINE.contains(
137                     name.toLowerCase())) {
138                 // TIKA-343: Replace closing block tags (and <br/>) with a
139                 // newline unless the HtmlMapper above has already mapped
140                 // them to something else
141                 xhtml.newline();
142             }
143         }
144 
145         if (titleLevel > 0) {
146             titleLevel--;
147             if (titleLevel == 0) {
148                 metadata.set(Metadata.TITLE, title.toString().trim());
149             }
150         }
151         if (bodyLevel > 0) {
152             bodyLevel--;
153         }
154         if (discardLevel > 0) {
155             discardLevel--;
156         }
157     }
158 
159     @Override
160     public void characters(char[] ch, int start, int length)
161             throws SAXException {
162         if (titleLevel > 0 && bodyLevel == 0) {
163             title.append(ch, start, length);
164         }
165         if (bodyLevel > 0 && discardLevel == 0) {
166             super.characters(ch, start, length);
167         }
168     }
169 
170     @Override
171     public void ignorableWhitespace(char[] ch, int start, int length)
172             throws SAXException {
173         if (bodyLevel > 0 && discardLevel == 0) {
174             super.ignorableWhitespace(ch, start, length);
175         }
176     }
177 
178     private String resolve(String url) {
179         // Return the URL as-is if no base URL is available
180         if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
181             return url;
182         }
183 
184         // Check for common non-hierarchical and pseudo URI prefixes
185         String lower = url.toLowerCase();
186         if (lower.startsWith("urn:")
187                 || lower.startsWith("mailto:")
188                 || lower.startsWith("tel:")
189                 || lower.startsWith("data:")
190                 || lower.startsWith("javascript:")
191                 || lower.startsWith("about:")) {
192             return url;
193         }
194 
195         try {
196             URL base = new URL(metadata.get(Metadata.CONTENT_LOCATION).trim());
197 
198             // We need to handle one special case, where the relativeUrl is
199             // just a query string (like "?pid=1"), and the baseUrl doesn't
200             // end with a '/'. In that case, the URL class removes the last
201             // portion of the path, which we don't want.
202             String path = base.getPath();
203             if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
204                 return new URL(
205                         base.getProtocol(), base.getHost(), base.getPort(),
206                         base.getPath() + url).toExternalForm();
207             } else {
208                 return new URL(base, url).toExternalForm();
209             }
210         } catch (MalformedURLException e) {
211             // Unknown or broken format; just return the URL as received.
212             return url;
213         }
214     }
215 
216 }