1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.tika.parser.html;
18
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Locale;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
28
29 class HtmlHandler extends TextContentHandler {
30
31 private final HtmlMapper mapper;
32
33 private final XHTMLContentHandler xhtml;
34
35 private final Metadata metadata;
36
37 private int bodyLevel = 0;
38
39 private int discardLevel = 0;
40
41 private int titleLevel = 0;
42
43 private final StringBuilder title = new StringBuilder();
44
45 private HtmlHandler(
46 HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
47 super(xhtml);
48 this.mapper = mapper;
49 this.xhtml = xhtml;
50 this.metadata = metadata;
51
52
53 if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
54 String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
55 if (name != null) {
56 name = name.trim();
57 try {
58 new URL(name);
59 metadata.set(Metadata.CONTENT_LOCATION, name);
60 } catch (MalformedURLException e) {
61
62 }
63 }
64 }
65 }
66
67 public HtmlHandler(
68 HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
69 this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
70 }
71
72 @Override
73 public void startElement(
74 String uri, String local, String name, Attributes atts)
75 throws SAXException {
76 if ("TITLE".equals(name) || titleLevel > 0) {
77 titleLevel++;
78 }
79 if ("BODY".equals(name) || bodyLevel > 0) {
80 bodyLevel++;
81 }
82 if (mapper.isDiscardElement(name) || discardLevel > 0) {
83 discardLevel++;
84 }
85
86 if (bodyLevel == 0 && discardLevel == 0) {
87 if ("META".equals(name) && atts.getValue("content") != null) {
88 if (atts.getValue("http-equiv") != null) {
89 metadata.set(
90 atts.getValue("http-equiv"),
91 atts.getValue("content"));
92 }
93 if (atts.getValue("name") != null) {
94 metadata.set(
95 atts.getValue("name"),
96 atts.getValue("content"));
97 }
98 } else if ("BASE".equals(name) && atts.getValue("href") != null) {
99 metadata.set(
100 Metadata.CONTENT_LOCATION,
101 resolve(atts.getValue("href").trim()));
102 }
103 }
104
105 if (bodyLevel > 0 && discardLevel == 0) {
106 String safe = mapper.mapSafeElement(name);
107 if (safe != null) {
108 xhtml.startElement(safe);
109 } else if ("A".equals(name)) {
110 String href = atts.getValue("href");
111 if (href != null) {
112 xhtml.startElement("a", "href", resolve(href.trim()));
113 } else {
114 String anchor = atts.getValue("name");
115 if (anchor != null) {
116 xhtml.startElement("a", "name", anchor.trim());
117 } else {
118 xhtml.startElement("a");
119 }
120 }
121 }
122 }
123
124 title.setLength(0);
125 }
126
127 @Override
128 public void endElement(
129 String uri, String local, String name) throws SAXException {
130 if (bodyLevel > 0 && discardLevel == 0) {
131 String safe = mapper.mapSafeElement(name);
132 if (safe != null) {
133 xhtml.endElement(safe);
134 } else if ("A".equals(name)) {
135 xhtml.endElement("a");
136 } else if (XHTMLContentHandler.ENDLINE.contains(
137 name.toLowerCase())) {
138
139
140
141 xhtml.newline();
142 }
143 }
144
145 if (titleLevel > 0) {
146 titleLevel--;
147 if (titleLevel == 0) {
148 metadata.set(Metadata.TITLE, title.toString().trim());
149 }
150 }
151 if (bodyLevel > 0) {
152 bodyLevel--;
153 }
154 if (discardLevel > 0) {
155 discardLevel--;
156 }
157 }
158
159 @Override
160 public void characters(char[] ch, int start, int length)
161 throws SAXException {
162 if (titleLevel > 0 && bodyLevel == 0) {
163 title.append(ch, start, length);
164 }
165 if (bodyLevel > 0 && discardLevel == 0) {
166 super.characters(ch, start, length);
167 }
168 }
169
170 @Override
171 public void ignorableWhitespace(char[] ch, int start, int length)
172 throws SAXException {
173 if (bodyLevel > 0 && discardLevel == 0) {
174 super.ignorableWhitespace(ch, start, length);
175 }
176 }
177
178 private String resolve(String url) {
179
180 if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
181 return url;
182 }
183
184
185 String lower = url.toLowerCase();
186 if (lower.startsWith("urn:")
187 || lower.startsWith("mailto:")
188 || lower.startsWith("tel:")
189 || lower.startsWith("data:")
190 || lower.startsWith("javascript:")
191 || lower.startsWith("about:")) {
192 return url;
193 }
194
195 try {
196 URL base = new URL(metadata.get(Metadata.CONTENT_LOCATION).trim());
197
198
199
200
201
202 String path = base.getPath();
203 if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
204 return new URL(
205 base.getProtocol(), base.getHost(), base.getPort(),
206 base.getPath() + url).toExternalForm();
207 } else {
208 return new URL(base, url).toExternalForm();
209 }
210 } catch (MalformedURLException e) {
211
212 return url;
213 }
214 }
215
216 }