1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 package org.apache.commons.httpclient;
32
33 import java.io.IOException;
34 import java.io.ObjectInputStream;
35 import java.io.ObjectOutputStream;
36 import java.io.Serializable;
37 import java.util.Arrays;
38 import java.util.Locale;
39 import java.util.BitSet;
40 import java.util.Hashtable;
41
42 import org.apache.commons.codec.DecoderException;
43 import org.apache.commons.codec.net.URLCodec;
44 import org.apache.commons.httpclient.util.EncodingUtil;
45
46 /***
47 * The interface for the URI(Uniform Resource Identifiers) version of RFC 2396.
48 * This class supports parsing a URI reference so that it can be extended
49 * for specific protocols, taking into account the character encoding of the
50 * protocol to be transported and the charset of the document.
51 * <p>
52 * A URI is always in an "escaped" form, since escaping or unescaping a
53 * completed URI might change its semantics.
54 * <p>
55 * Implementers should be careful not to escape or unescape the same string
56 * more than once, since unescaping an already unescaped string might lead to
57 * misinterpreting a percent data character as another escaped character,
58 * or vice versa in the case of escaping an already escaped string.
59 * <p>
60 * In order to avoid these problems, data types are used as follows:
61 * <p><blockquote><pre>
62 * URI character sequence: char
63 * octet sequence: byte
64 * original character sequence: String
65 * </pre></blockquote><p>
66 *
67 * So, a URI is a sequence of characters as an array of a char type, which
68 * is not always represented as a sequence of octets as an array of byte.
69 * <p>
70 *
71 * URI Syntactic Components
72 * <p><blockquote><pre>
73 * - In general, written as follows:
74 * Absolute URI = <scheme>:<scheme-specific-part>
75 * Generic URI = <scheme>://<authority><path>?<query>
76 *
77 * - Syntax
78 * absoluteURI = scheme ":" ( hier_part | opaque_part )
79 * hier_part = ( net_path | abs_path ) [ "?" query ]
80 * net_path = "//" authority [ abs_path ]
81 * abs_path = "/" path_segments
82 * </pre></blockquote><p>
83 *
84 * The following examples illustrate URI that are in common use.
85 * <pre>
86 * ftp://ftp.is.co.za/rfc/rfc1808.txt
87 * -- ftp scheme for File Transfer Protocol services
88 * gopher://spinaltap.micro.umn.edu/00/Weather/California/Los%20Angeles
89 * -- gopher scheme for Gopher and Gopher+ Protocol services
90 * http://www.math.uio.no/faq/compression-faq/part1.html
91 * -- http scheme for Hypertext Transfer Protocol services
92 * mailto:mduerst@ifi.unizh.ch
93 * -- mailto scheme for electronic mail addresses
94 * news:comp.infosystems.www.servers.unix
95 * -- news scheme for USENET news groups and articles
96 * telnet://melvyl.ucop.edu/
97 * -- telnet scheme for interactive services via the TELNET Protocol
98 * </pre>
99 * Please, notice that there are many modifications from URL(RFC 1738) and
100 * relative URL(RFC 1808).
101 * <p>
102 * <b>The expressions for a URI</b>
103 * <p><pre>
104 * For escaped URI forms
105 * - URI(char[]) // constructor
106 * - char[] getRawXxx() // method
107 * - String getEscapedXxx() // method
108 * - String toString() // method
109 * <p>
110 * For unescaped URI forms
111 * - URI(String) // constructor
112 * - String getXXX() // method
113 * </pre><p>
114 *
115 * @author <a href="mailto:jericho@apache.org">Sung-Gu</a>
116 * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
117 * @version $Revision: 510581 $ $Date: 2002/03/14 15:14:01
118 */
119 public class URI implements Cloneable, Comparable, Serializable {
120
121
122
123
/**
 * Creates an instance for internal use. Nothing is parsed here, so every
 * component keeps the default value given at its declaration.
 */
protected URI() {
    // intentionally empty
}
127
/**
 * Builds a URI by parsing a string, with an explicit protocol charset.
 * The string may be given in escaped or in unescaped form.
 *
 * @param s the URI character sequence
 * @param escaped <code>true</code> if <code>s</code> is already in escaped
 *                form, <code>false</code> otherwise
 * @param charset the charset name used for escape encoding, if required
 *
 * @throws URIException if the URI cannot be created
 * @throws NullPointerException if the input string is <code>null</code>
 *
 * @see #getProtocolCharset
 *
 * @since 3.0
 */
public URI(String s, boolean escaped, String charset)
    throws URIException, NullPointerException {
    // Store the protocol charset first, then parse the reference.
    this.protocolCharset = charset;
    parseUriReference(s, escaped);
}
149
/**
 * Builds a URI by parsing a string. No protocol charset is assigned by
 * this constructor. The string may be given in escaped or in unescaped
 * form.
 *
 * @param s the URI character sequence
 * @param escaped <code>true</code> if <code>s</code> is already in escaped
 *                form, <code>false</code> otherwise
 *
 * @throws URIException if the URI cannot be created
 * @throws NullPointerException if the input string is <code>null</code>
 *
 * @see #getProtocolCharset
 *
 * @since 3.0
 */
public URI(String s, boolean escaped)
    throws URIException, NullPointerException {
    parseUriReference(s, escaped);
}
169
170 /***
171 * Construct a URI as an escaped form of a character array with the given
172 * charset.
173 *
174 * @param escaped the URI character sequence
175 * @param charset the charset string to do escape encoding
176 * @throws URIException If the URI cannot be created.
177 * @throws NullPointerException if <code>escaped</code> is <code>null</code>
178 * @see #getProtocolCharset
179 *
180 * @deprecated Use #URI(String, boolean, String)
181 */
182 public URI(char[] escaped, String charset)
183 throws URIException, NullPointerException {
184 protocolCharset = charset;
185 parseUriReference(new String(escaped), true);
186 }
187
188
189 /***
190 * Construct a URI as an escaped form of a character array.
191 * An URI can be placed within double-quotes or angle brackets like
192 * "http://test.com/" and <http://test.com/>
193 *
194 * @param escaped the URI character sequence
195 * @throws URIException If the URI cannot be created.
196 * @throws NullPointerException if <code>escaped</code> is <code>null</code>
197 * @see #getDefaultProtocolCharset
198 *
199 * @deprecated Use #URI(String, boolean)
200 */
201 public URI(char[] escaped)
202 throws URIException, NullPointerException {
203 parseUriReference(new String(escaped), true);
204 }
205
206
/**
 * Builds a URI from an unescaped string, with an explicit protocol
 * charset.
 *
 * @param original the string to be turned into a URI character sequence;
 *                 either an absoluteURI or a relativeURI
 * @param charset the charset name used for escape encoding
 * @throws URIException if the URI cannot be created
 * @see #getProtocolCharset
 *
 * @deprecated Use #URI(String, boolean, String)
 */
public URI(String original, String charset) throws URIException {
    this.protocolCharset = charset;
    // "false": the input is treated as not yet escaped.
    parseUriReference(original, false);
}
222
223
/**
 * Builds a URI from an unescaped string.
 * <p><blockquote><pre>
 *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 * </pre></blockquote><p>
 * A URI can be placed within double-quotes or angle brackets like
 * "http://test.com/" and &lt;http://test.com/&gt;
 *
 * @param original the string to be turned into a URI character sequence;
 *                 either an absoluteURI or a relativeURI
 * @throws URIException if the URI cannot be created
 * @see #getDefaultProtocolCharset
 *
 * @deprecated Use #URI(String, boolean)
 */
public URI(String original) throws URIException {
    // "false": the input is treated as not yet escaped.
    parseUriReference(original, false);
}
242
243
/**
 * Construct an opaque absolute URI from the given components.
 * <p><blockquote><pre>
 *   URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 *   absoluteURI   = scheme ":" ( hier_part | opaque_part )
 *   opaque_part   = uric_no_slash *uric
 * </pre></blockquote><p>
 * It's for absolute URI =
 * &lt;scheme&gt;:&lt;scheme-specific-part&gt;#&lt;fragment&gt;.
 * <p>
 * The scheme is lower-cased with a fixed locale, so the result does not
 * depend on the default locale of the JVM.
 *
 * @param scheme the scheme string; must not be <code>null</code>
 * @param schemeSpecificPart scheme_specific_part; it is encoded against
 *        <code>allowed_opaque_part</code> using the protocol charset
 * @param fragment the fragment string, may be <code>null</code>; its
 *        characters are stored as given
 * @throws URIException if <code>scheme</code> is <code>null</code>, if it
 *         fails validation against the scheme character set, or if the URI
 *         cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String schemeSpecificPart, String fragment)
    throws URIException {

    if (scheme == null) {
        throw new URIException(URIException.PARSING, "scheme required");
    }
    // Scheme names are ASCII. A locale-sensitive toLowerCase() would turn
    // 'I' into a dotless i under e.g. a Turkish default locale and make a
    // perfectly valid scheme such as "FILE" fail validation.
    char[] s = scheme.toLowerCase(Locale.ENGLISH).toCharArray();
    // The parameter shadows the static BitSet, hence the URI. qualifier.
    if (!validate(s, URI.scheme)) {
        throw new URIException(URIException.PARSING, "incorrect scheme");
    }
    _scheme = s;
    _opaque = encode(schemeSpecificPart, allowed_opaque_part,
            getProtocolCharset());
    _is_opaque_part = true;
    _fragment = fragment == null ? null : fragment.toCharArray();
    setURI();
}
280
281
282 /***
283 * Construct a general URI from the given components.
284 * <p><blockquote><pre>
285 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
286 * absoluteURI = scheme ":" ( hier_part | opaque_part )
287 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
288 * hier_part = ( net_path | abs_path ) [ "?" query ]
289 * </pre></blockquote><p>
290 * It's for absolute URI = <scheme>:<path>?<query>#<
291 * fragment> and relative URI = <path>?<query>#<fragment
292 * >.
293 *
294 * @param scheme the scheme string
295 * @param authority the authority string
296 * @param path the path string
297 * @param query the query string
298 * @param fragment the fragment string
299 * @throws URIException If the new URI cannot be created.
300 * @see #getDefaultProtocolCharset
301 */
302 public URI(String scheme, String authority, String path, String query,
303 String fragment) throws URIException {
304
305
306 StringBuffer buff = new StringBuffer();
307 if (scheme != null) {
308 buff.append(scheme);
309 buff.append(':');
310 }
311 if (authority != null) {
312 buff.append("//");
313 buff.append(authority);
314 }
315 if (path != null) {
316 if ((scheme != null || authority != null)
317 && !path.startsWith("/")) {
318 throw new URIException(URIException.PARSING,
319 "abs_path requested");
320 }
321 buff.append(path);
322 }
323 if (query != null) {
324 buff.append('?');
325 buff.append(query);
326 }
327 if (fragment != null) {
328 buff.append('#');
329 buff.append(fragment);
330 }
331 parseUriReference(buff.toString(), false);
332 }
333
334
/**
 * Construct a general URI from scheme, userinfo, host and port.
 * Path, query and fragment are passed on as <code>null</code>.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port)
    throws URIException {
    // Delegates to the seven-argument constructor.
    this(scheme, userinfo, host, port, null, null, null);
}
350
351
/**
 * Construct a general URI from scheme, userinfo, host, port and path.
 * Query and fragment are passed on as <code>null</code>.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @param path the path string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port,
    String path) throws URIException {
    // Delegates to the seven-argument constructor.
    this(scheme, userinfo, host, port, path, null, null);
}
368
369
/**
 * Construct a general URI from scheme, userinfo, host, port, path and
 * query. The fragment is passed on as <code>null</code>.
 *
 * @param scheme the scheme string
 * @param userinfo the userinfo string
 * @param host the host string
 * @param port the port number
 * @param path the path string
 * @param query the query string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String userinfo, String host, int port,
    String path, String query) throws URIException {
    // Delegates to the seven-argument constructor.
    this(scheme, userinfo, host, port, path, query, null);
}
387
388
389 /***
390 * Construct a general URI from the given components.
391 *
392 * @param scheme the scheme string
393 * @param userinfo the userinfo string
394 * @param host the host string
395 * @param port the port number
396 * @param path the path string
397 * @param query the query string
398 * @param fragment the fragment string
399 * @throws URIException If the new URI cannot be created.
400 * @see #getDefaultProtocolCharset
401 */
402 public URI(String scheme, String userinfo, String host, int port,
403 String path, String query, String fragment) throws URIException {
404
405 this(scheme, (host == null) ? null
406 : ((userinfo != null) ? userinfo + '@' : "") + host
407 + ((port != -1) ? ":" + port : ""), path, query, fragment);
408 }
409
410
/**
 * Construct a general URI from scheme, host, path and fragment.
 * The host is handed on in the authority position and the query is passed
 * on as <code>null</code>.
 *
 * @param scheme the scheme string
 * @param host the host string
 * @param path the path string
 * @param fragment the fragment string
 * @throws URIException if the new URI cannot be created
 * @see #getDefaultProtocolCharset
 */
public URI(String scheme, String host, String path, String fragment)
    throws URIException {
    // Delegates to (scheme, authority, path, query, fragment).
    this(scheme, host, path, null, fragment);
}
426
427
/**
 * Construct a general URI by resolving a relative URI string against a
 * base URI. The string is first turned into a URI of its own through the
 * single-string constructor.
 *
 * @param base the base URI
 * @param relative the relative URI string
 * @throws URIException if the new URI cannot be created
 *
 * @deprecated Use #URI(URI, String, boolean)
 */
public URI(URI base, String relative) throws URIException {
    // Delegates to (URI base, URI relative).
    this(base, new URI(relative));
}
440
441
/**
 * Construct a general URI by resolving a relative URI string against a
 * base URI. The string is first turned into a URI of its own, honouring
 * the <code>escaped</code> flag.
 *
 * @param base the base URI
 * @param relative the relative URI string
 * @param escaped <code>true</code> if the relative string is already in
 *                escaped form, <code>false</code> otherwise
 *
 * @throws URIException if the new URI cannot be created
 *
 * @since 3.0
 */
public URI(URI base, String relative, boolean escaped) throws URIException {
    // Delegates to (URI base, URI relative).
    this(base, new URI(relative, escaped));
}
457
458
459 /***
460 * Construct a general URI with the given relative URI.
461 * <p><blockquote><pre>
462 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
463 * relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
464 * </pre></blockquote><p>
465 * Resolving Relative References to Absolute Form.
466 *
467 * <strong>Examples of Resolving Relative URI References</strong>
468 *
469 * Within an object with a well-defined base URI of
470 * <p><blockquote><pre>
471 * http://a/b/c/d;p?q
472 * </pre></blockquote><p>
473 * the relative URI would be resolved as follows:
474 *
475 * Normal Examples
476 *
477 * <p><blockquote><pre>
478 * g:h = g:h
479 * g = http://a/b/c/g
480 * ./g = http://a/b/c/g
481 * g/ = http://a/b/c/g/
482 * /g = http://a/g
483 * //g = http://g
484 * ?y = http://a/b/c/?y
485 * g?y = http://a/b/c/g?y
486 * #s = (current document)#s
487 * g#s = http://a/b/c/g#s
488 * g?y#s = http://a/b/c/g?y#s
489 * ;x = http://a/b/c/;x
490 * g;x = http://a/b/c/g;x
491 * g;x?y#s = http://a/b/c/g;x?y#s
492 * . = http://a/b/c/
493 * ./ = http://a/b/c/
494 * .. = http://a/b/
495 * ../ = http://a/b/
496 * ../g = http://a/b/g
497 * ../.. = http://a/
498 * ../../ = http://a/
499 * ../../g = http://a/g
500 * </pre></blockquote><p>
501 *
502 * Some URI schemes do not allow a hierarchical syntax matching the
503 * <hier_part> syntax, and thus cannot use relative references.
504 *
505 * @param base the base URI
506 * @param relative the relative URI
507 * @throws URIException If the new URI cannot be created.
508 */
509 public URI(URI base, URI relative) throws URIException {
510
511 if (base._scheme == null) {
512 throw new URIException(URIException.PARSING, "base URI required");
513 }
514 if (base._scheme != null) {
515 this._scheme = base._scheme;
516 this._authority = base._authority;
517 this._is_net_path = base._is_net_path;
518 }
519 if (base._is_opaque_part || relative._is_opaque_part) {
520 this._scheme = base._scheme;
521 this._is_opaque_part = base._is_opaque_part
522 || relative._is_opaque_part;
523 this._opaque = relative._opaque;
524 this._fragment = relative._fragment;
525 this.setURI();
526 return;
527 }
528 boolean schemesEqual = Arrays.equals(base._scheme,relative._scheme);
529 if (relative._scheme != null
530 && (!schemesEqual || relative._authority != null)) {
531 this._scheme = relative._scheme;
532 this._is_net_path = relative._is_net_path;
533 this._authority = relative._authority;
534 if (relative._is_server) {
535 this._is_server = relative._is_server;
536 this._userinfo = relative._userinfo;
537 this._host = relative._host;
538 this._port = relative._port;
539 } else if (relative._is_reg_name) {
540 this._is_reg_name = relative._is_reg_name;
541 }
542 this._is_abs_path = relative._is_abs_path;
543 this._is_rel_path = relative._is_rel_path;
544 this._path = relative._path;
545 } else if (base._authority != null && relative._scheme == null) {
546 this._is_net_path = base._is_net_path;
547 this._authority = base._authority;
548 if (base._is_server) {
549 this._is_server = base._is_server;
550 this._userinfo = base._userinfo;
551 this._host = base._host;
552 this._port = base._port;
553 } else if (base._is_reg_name) {
554 this._is_reg_name = base._is_reg_name;
555 }
556 }
557 if (relative._authority != null) {
558 this._is_net_path = relative._is_net_path;
559 this._authority = relative._authority;
560 if (relative._is_server) {
561 this._is_server = relative._is_server;
562 this._userinfo = relative._userinfo;
563 this._host = relative._host;
564 this._port = relative._port;
565 } else if (relative._is_reg_name) {
566 this._is_reg_name = relative._is_reg_name;
567 }
568 this._is_abs_path = relative._is_abs_path;
569 this._is_rel_path = relative._is_rel_path;
570 this._path = relative._path;
571 }
572
573 if (relative._authority == null
574 && (relative._scheme == null || schemesEqual)) {
575 if ((relative._path == null || relative._path.length == 0)
576 && relative._query == null) {
577
578
579 this._path = base._path;
580 this._query = base._query;
581 } else {
582 this._path = resolvePath(base._path, relative._path);
583 }
584 }
585
586 if (relative._query != null) {
587 this._query = relative._query;
588 }
589
590 if (relative._fragment != null) {
591 this._fragment = relative._fragment;
592 }
593 this.setURI();
594
595
596 parseUriReference(new String(_uri), true);
597 }
598
599
600
601 /*** Version ID for serialization */
602 static final long serialVersionUID = 604752400577948726L;
603
604
605 /***
606 * Cache the hash code for this URI.
607 */
608 protected int hash = 0;
609
610
611 /***
612 * This Uniform Resource Identifier (URI).
613 * The URI is always in an "escaped" form, since escaping or unescaping
614 * a completed URI might change its semantics.
615 */
616 protected char[] _uri = null;
617
618
619 /***
620 * The charset of the protocol used by this URI instance.
621 */
622 protected String protocolCharset = null;
623
624
625 /***
626 * The default charset of the protocol. RFC 2277, 2396
627 */
628 protected static String defaultProtocolCharset = "UTF-8";
629
630
631 /***
632 * The default charset of the document. RFC 2277, 2396
633 * The platform's charset is used for the document by default.
634 */
635 protected static String defaultDocumentCharset = null;
636 protected static String defaultDocumentCharsetByLocale = null;
637 protected static String defaultDocumentCharsetByPlatform = null;
638
639 static {
640 Locale locale = Locale.getDefault();
641
642 if (locale != null) {
643 defaultDocumentCharsetByLocale =
644 LocaleToCharsetMap.getCharset(locale);
645
646 defaultDocumentCharset = defaultDocumentCharsetByLocale;
647 }
648
649 try {
650 defaultDocumentCharsetByPlatform = System.getProperty("file.encoding");
651 } catch (SecurityException ignore) {
652 }
653 if (defaultDocumentCharset == null) {
654
655 defaultDocumentCharset = defaultDocumentCharsetByPlatform;
656 }
657 }
658
659
660 /***
661 * The scheme.
662 */
663 protected char[] _scheme = null;
664
665
666 /***
667 * The opaque.
668 */
669 protected char[] _opaque = null;
670
671
672 /***
673 * The authority.
674 */
675 protected char[] _authority = null;
676
677
678 /***
679 * The userinfo.
680 */
681 protected char[] _userinfo = null;
682
683
684 /***
685 * The host.
686 */
687 protected char[] _host = null;
688
689
690 /***
691 * The port.
692 */
693 protected int _port = -1;
694
695
696 /***
697 * The path.
698 */
699 protected char[] _path = null;
700
701
702 /***
703 * The query.
704 */
705 protected char[] _query = null;
706
707
708 /***
709 * The fragment.
710 */
711 protected char[] _fragment = null;
712
713
714 /***
715 * The root path.
716 */
717 protected static final char[] rootPath = { '/' };
718
719
720
/**
 * The percent "%" character always has the reserved purpose of being the
 * escape indicator, it must be escaped as "%25" in order to be used as
 * data within a URI.
 */
protected static final BitSet percent = new BitSet(256);

static {
    percent.set('%');
}


/**
 * BitSet for digit.
 * <p><blockquote><pre>
 * digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
 *            "8" | "9"
 * </pre></blockquote><p>
 */
protected static final BitSet digit = new BitSet(256);

static {
    for (char c = '0'; c <= '9'; c++) {
        digit.set(c);
    }
}


/**
 * BitSet for alpha.
 * <p><blockquote><pre>
 * alpha         = lowalpha | upalpha
 * </pre></blockquote><p>
 */
protected static final BitSet alpha = new BitSet(256);

static {
    // One pass sets each lower-case letter and its upper-case twin.
    for (char lower = 'a'; lower <= 'z'; lower++) {
        alpha.set(lower);
        alpha.set('A' + (lower - 'a'));
    }
}


/**
 * BitSet for alphanum (join of alpha &amp; digit).
 * <p><blockquote><pre>
 *  alphanum      = alpha | digit
 * </pre></blockquote><p>
 */
protected static final BitSet alphanum = new BitSet(256);

static {
    alphanum.or(digit);
    alphanum.or(alpha);
}


/**
 * BitSet for hex.
 * <p><blockquote><pre>
 * hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
 *                         "a" | "b" | "c" | "d" | "e" | "f"
 * </pre></blockquote><p>
 */
protected static final BitSet hex = new BitSet(256);

static {
    final String hexChars = "0123456789abcdefABCDEF";
    for (int i = 0; i < hexChars.length(); i++) {
        hex.set(hexChars.charAt(i));
    }
}


/**
 * BitSet for escaped: the characters that can occur in an escape
 * sequence.
 * <p><blockquote><pre>
 * escaped       = "%" hex hex
 * </pre></blockquote><p>
 */
protected static final BitSet escaped = new BitSet(256);

static {
    escaped.or(hex);
    escaped.or(percent);
}
813
814
/**
 * BitSet for mark.
 * <p><blockquote><pre>
 * mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
 *                 "(" | ")"
 * </pre></blockquote><p>
 */
protected static final BitSet mark = new BitSet(256);

static {
    final String markChars = "-_.!~*'()";
    for (int i = 0; i < markChars.length(); i++) {
        mark.set(markChars.charAt(i));
    }
}


/**
 * Data characters that are allowed in a URI but do not have a reserved
 * purpose are called unreserved.
 * <p><blockquote><pre>
 * unreserved    = alphanum | mark
 * </pre></blockquote><p>
 */
protected static final BitSet unreserved = new BitSet(256);

static {
    unreserved.or(mark);
    unreserved.or(alphanum);
}


/**
 * BitSet for reserved.
 * <p><blockquote><pre>
 * reserved      = ";" | "/" | "?" | ":" | "@" | "&amp;" | "=" | "+" |
 *                 "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet reserved = new BitSet(256);

static {
    final String reservedChars = ";/?:@&=+$,";
    for (int i = 0; i < reservedChars.length(); i++) {
        reserved.set(reservedChars.charAt(i));
    }
}


/**
 * BitSet for uric.
 * <p><blockquote><pre>
 * uric          = reserved | unreserved | escaped
 * </pre></blockquote><p>
 */
protected static final BitSet uric = new BitSet(256);

static {
    uric.or(escaped);
    uric.or(unreserved);
    uric.or(reserved);
}
888
889
/**
 * BitSet for fragment; this is the same object as <code>uric</code>, not
 * a copy.
 * <p><blockquote><pre>
 * fragment      = *uric
 * </pre></blockquote><p>
 */
protected static final BitSet fragment = uric;


/**
 * BitSet for query; this is the same object as <code>uric</code>, not a
 * copy.
 * <p><blockquote><pre>
 * query         = *uric
 * </pre></blockquote><p>
 */
protected static final BitSet query = uric;


/**
 * BitSet for pchar.
 * <p><blockquote><pre>
 * pchar         = unreserved | escaped |
 *                 ":" | "@" | "&amp;" | "=" | "+" | "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet pchar = new BitSet(256);

static {
    pchar.or(unreserved);
    pchar.or(escaped);
    final String extraChars = ":@&=+$,";
    for (int i = 0; i < extraChars.length(); i++) {
        pchar.set(extraChars.charAt(i));
    }
}


/**
 * BitSet for param; this is the same object as <code>pchar</code>, not a
 * copy.
 * <p><blockquote><pre>
 * param         = *pchar
 * </pre></blockquote><p>
 */
protected static final BitSet param = pchar;


/**
 * BitSet for segment.
 * <p><blockquote><pre>
 * segment       = *pchar *( ";" param )
 * </pre></blockquote><p>
 */
protected static final BitSet segment = new BitSet(256);

static {
    segment.set(';');
    segment.or(pchar);
    // param is an alias of pchar; kept to mirror the grammar rule.
    segment.or(param);
}


/**
 * BitSet for path segments.
 * <p><blockquote><pre>
 * path_segments = segment *( "/" segment )
 * </pre></blockquote><p>
 */
protected static final BitSet path_segments = new BitSet(256);

static {
    path_segments.or(segment);
    path_segments.set('/');
}


/**
 * URI absolute path.
 * <p><blockquote><pre>
 * abs_path      = "/"  path_segments
 * </pre></blockquote><p>
 */
protected static final BitSet abs_path = new BitSet(256);

static {
    abs_path.or(path_segments);
    abs_path.set('/');
}
980
981
/**
 * URI bitset for encoding typical non-slash characters.
 * <p><blockquote><pre>
 * uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
 *                 "&amp;" | "=" | "+" | "$" | ","
 * </pre></blockquote><p>
 */
protected static final BitSet uric_no_slash = new BitSet(256);

static {
    uric_no_slash.or(unreserved);
    uric_no_slash.or(escaped);
    uric_no_slash.set(';');
    uric_no_slash.set('?');
    // This used to be a second set(';'), which left ':' out of the set
    // although the grammar above lists it.
    uric_no_slash.set(':');
    uric_no_slash.set('@');
    uric_no_slash.set('&');
    uric_no_slash.set('=');
    uric_no_slash.set('+');
    uric_no_slash.set('$');
    uric_no_slash.set(',');
}
1004
1005
/**
 * URI bitset that combines uric_no_slash and uric.
 * <p><blockquote><pre>
 * opaque_part   = uric_no_slash *uric
 * </pre></blockquote><p>
 */
protected static final BitSet opaque_part = new BitSet(256);

static {
    // Union of both sets; the order of the two unions does not matter.
    opaque_part.or(uric);
    opaque_part.or(uric_no_slash);
}


/**
 * URI bitset that combines absolute path and opaque part.
 * <p><blockquote><pre>
 * path          = [ abs_path | opaque_part ]
 * </pre></blockquote><p>
 */
protected static final BitSet path = new BitSet(256);

static {
    path.or(opaque_part);
    path.or(abs_path);
}
1033
1034
/**
 * Port, a logical alias for digit; this is the same object as
 * <code>digit</code>, not a copy.
 */
protected static final BitSet port = digit;


/**
 * Bitset that combines digit and dot for IPv4address.
 * <p><blockquote><pre>
 * IPv4address   = 1*digit "." 1*digit "." 1*digit "." 1*digit
 * </pre></blockquote><p>
 */
protected static final BitSet IPv4address = new BitSet(256);

static {
    IPv4address.set('.');
    IPv4address.or(digit);
}


/**
 * RFC 2373.
 * <p><blockquote><pre>
 * IPv6address = hexpart [ ":" IPv4address ]
 * </pre></blockquote><p>
 */
protected static final BitSet IPv6address = new BitSet(256);

static {
    IPv6address.set(':');
    IPv6address.or(hex);
    IPv6address.or(IPv4address);
}


/**
 * RFC 2732, 2373.
 * <p><blockquote><pre>
 * IPv6reference   = "[" IPv6address "]"
 * </pre></blockquote><p>
 */
protected static final BitSet IPv6reference = new BitSet(256);

static {
    IPv6reference.or(IPv6address);
    IPv6reference.set('[');
    IPv6reference.set(']');
}
1083
1084
/**
 * BitSet for toplabel.
 * <p><blockquote><pre>
 * toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
 * </pre></blockquote><p>
 */
protected static final BitSet toplabel = new BitSet(256);

static {
    toplabel.set('-');
    toplabel.or(alphanum);
}


/**
 * BitSet for domainlabel; this is the same object as
 * <code>toplabel</code>, not a copy.
 * <p><blockquote><pre>
 * domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
 * </pre></blockquote><p>
 */
protected static final BitSet domainlabel = toplabel;


/**
 * BitSet for hostname.
 * <p><blockquote><pre>
 * hostname      = *( domainlabel "." ) toplabel [ "." ]
 * </pre></blockquote><p>
 */
protected static final BitSet hostname = new BitSet(256);

static {
    // domainlabel is an alias of toplabel, so one union covers both.
    hostname.set('.');
    hostname.or(toplabel);
}


/**
 * BitSet for host.
 * <p><blockquote><pre>
 * host          = hostname | IPv4address | IPv6reference
 * </pre></blockquote><p>
 */
protected static final BitSet host = new BitSet(256);

static {
    // No separate union with IPv4address: its digits and "." are already
    // part of hostname.
    host.or(IPv6reference);
    host.or(hostname);
}


/**
 * BitSet for hostport.
 * <p><blockquote><pre>
 * hostport      = host [ ":" port ]
 * </pre></blockquote><p>
 */
protected static final BitSet hostport = new BitSet(256);

static {
    hostport.set(':');
    hostport.or(host);
    hostport.or(port);
}
1151
1152
/**
 * Bitset for userinfo.
 * <p><blockquote><pre>
 * userinfo      = *( unreserved | escaped |
 *                    ";" | ":" | "&amp;" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet userinfo = new BitSet(256);

static {
    userinfo.or(unreserved);
    userinfo.or(escaped);
    final String extraChars = ";:&=+$,";
    for (int i = 0; i < extraChars.length(); i++) {
        userinfo.set(extraChars.charAt(i));
    }
}


/**
 * BitSet for within the userinfo component like user and password:
 * userinfo with the characters ";", ":", "@", "?" and "/" cleared.
 */
public static final BitSet within_userinfo = new BitSet(256);

static {
    within_userinfo.or(userinfo);
    final String cleared = ";:@?/";
    for (int i = 0; i < cleared.length(); i++) {
        within_userinfo.clear(cleared.charAt(i));
    }
}


/**
 * Bitset for server.
 * <p><blockquote><pre>
 * server        = [ [ userinfo "@" ] hostport ]
 * </pre></blockquote><p>
 */
protected static final BitSet server = new BitSet(256);

static {
    server.set('@');
    server.or(userinfo);
    server.or(hostport);
}
1203
1204
/**
 * Characters that can occur in a reg_name (registry-based naming
 * authority) production.
 * <p><blockquote><pre>
 * reg_name      = 1*( unreserved | escaped | "$" | "," |
 *                     ";" | ":" | "@" | "&amp;" | "=" | "+" )
 * </pre></blockquote><p>
 */
protected static final BitSet reg_name = new BitSet(256);

static {
    // The punctuation explicitly listed by the grammar rule.
    final String punctuation = "$,;:@&=+";
    for (int i = 0; i < punctuation.length(); i++) {
        reg_name.set(punctuation.charAt(i));
    }
    reg_name.or(unreserved);
    reg_name.or(escaped);
}
1226
1227
/**
 * Characters that can occur in an authority production.
 * <p><blockquote><pre>
 * authority     = server | reg_name
 * </pre></blockquote><p>
 * Built as the union of the <code>server</code> and <code>reg_name</code>
 * sets.
 */
protected static final BitSet authority = new BitSet(256);

static {
    authority.or(reg_name);
    authority.or(server);
}
1240
1241
/**
 * Characters that can occur in a scheme production.
 * <p><blockquote><pre>
 * scheme        = alpha *( alpha | digit | "+" | "-" | "." )
 * </pre></blockquote><p>
 * The set does not capture the rule that the first character must be
 * an alpha; it only lists the characters permitted anywhere in a scheme.
 */
protected static final BitSet scheme = new BitSet(256);

static {
    final String punctuation = "+-.";
    for (int i = 0; i < punctuation.length(); i++) {
        scheme.set(punctuation.charAt(i));
    }
    scheme.or(alpha);
    scheme.or(digit);
}
1257
1258
/**
 * Characters that can occur in a rel_segment production.
 * <p><blockquote><pre>
 * rel_segment   = 1*( unreserved | escaped |
 *                     ";" | "@" | "&amp;" | "=" | "+" | "$" | "," )
 * </pre></blockquote><p>
 */
protected static final BitSet rel_segment = new BitSet(256);

static {
    // The punctuation explicitly listed by the grammar rule.
    final String punctuation = ";@&=+$,";
    for (int i = 0; i < punctuation.length(); i++) {
        rel_segment.set(punctuation.charAt(i));
    }
    rel_segment.or(unreserved);
    rel_segment.or(escaped);
}
1279
1280
/**
 * Characters that can occur in a rel_path production.
 * <p><blockquote><pre>
 * rel_path      = rel_segment [ abs_path ]
 * </pre></blockquote><p>
 * Built as the union of the <code>rel_segment</code> and
 * <code>abs_path</code> sets.
 */
protected static final BitSet rel_path = new BitSet(256);

static {
    rel_path.or(abs_path);
    rel_path.or(rel_segment);
}
1293
1294
/**
 * Characters that can occur in a net_path production.
 * <p><blockquote><pre>
 * net_path      = "//" authority [ abs_path ]
 * </pre></blockquote><p>
 * Built as the union of the <code>authority</code> and
 * <code>abs_path</code> sets plus the '/' of the leading "//".
 */
protected static final BitSet net_path = new BitSet(256);

static {
    net_path.or(abs_path);
    net_path.or(authority);
    net_path.set('/');
}
1308
1309
/**
 * Characters that can occur in a hier_part production.
 * <p><blockquote><pre>
 * hier_part     = ( net_path | abs_path ) [ "?" query ]
 * </pre></blockquote><p>
 * Built as the union of the <code>net_path</code>, <code>abs_path</code>
 * and <code>query</code> sets; the '?' separator is not set explicitly
 * here.
 */
protected static final BitSet hier_part = new BitSet(256);

static {
    hier_part.or(query);
    hier_part.or(abs_path);
    hier_part.or(net_path);
}
1324
1325
/**
 * Characters that can occur in a relativeURI production.
 * <p><blockquote><pre>
 * relativeURI   = ( net_path | abs_path | rel_path ) [ "?" query ]
 * </pre></blockquote><p>
 * Built as the union of the <code>net_path</code>, <code>abs_path</code>,
 * <code>rel_path</code> and <code>query</code> sets; the '?' separator is
 * not set explicitly here.
 */
protected static final BitSet relativeURI = new BitSet(256);

static {
    relativeURI.or(query);
    relativeURI.or(rel_path);
    relativeURI.or(abs_path);
    relativeURI.or(net_path);
}
1341
1342
/**
 * Characters that can occur in an absoluteURI production.
 * <p><blockquote><pre>
 * absoluteURI   = scheme ":" ( hier_part | opaque_part )
 * </pre></blockquote><p>
 * Built as the union of the <code>scheme</code>, <code>hier_part</code>
 * and <code>opaque_part</code> sets plus the ':' separator.
 */
protected static final BitSet absoluteURI = new BitSet(256);

static {
    absoluteURI.set(':');
    absoluteURI.or(opaque_part);
    absoluteURI.or(hier_part);
    absoluteURI.or(scheme);
}
1357
1358
/**
 * Characters that can occur in a URI-reference production.
 * <p><blockquote><pre>
 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 * </pre></blockquote><p>
 * Built as the union of the <code>absoluteURI</code>,
 * <code>relativeURI</code> and <code>fragment</code> sets plus the '#'
 * separator.
 */
protected static final BitSet URI_reference = new BitSet(256);

static {
    URI_reference.set('#');
    URI_reference.or(fragment);
    URI_reference.or(relativeURI);
    URI_reference.or(absoluteURI);
}
1373
1374
1375
1376
/**
 * The US-ASCII control characters: codes 0x00 through 0x1F, and 0x7F.
 */
public static final BitSet control = new BitSet(256);

static {
    int code = 0x1F;
    while (code >= 0) {
        control.set(code);
        code--;
    }
    control.set(0x7F);
}
1388
/**
 * The US-ASCII space character (code 0x20).
 */
public static final BitSet space = new BitSet(256);

static {
    space.set(' ');
}
1397
1398
/**
 * The delimiter characters: '&lt;', '&gt;', '#', '%' and the double quote.
 */
public static final BitSet delims = new BitSet(256);

static {
    final String delimiters = "<>#%\"";
    for (int i = 0; i < delimiters.length(); i++) {
        delims.set(delimiters.charAt(i));
    }
}
1411
1412
/**
 * The "unwise" characters of RFC 2396, section 2.4.3.
 * <p><blockquote><pre>
 * unwise        = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
 * </pre></blockquote><p>
 */
public static final BitSet unwise = new BitSet(256);

static {
    unwise.set('{');
    unwise.set('}');
    unwise.set('|');
    // The backslash must be written as an escaped char literal; the former
    // '//' was not a valid character literal and did not name the backslash.
    unwise.set('\\');
    unwise.set('^');
    unwise.set('[');
    unwise.set(']');
    unwise.set('`');
}
1428
1429
/**
 * Characters disallowed in a rel_path before escaping: every
 * <code>uric</code> character that is not also a <code>rel_path</code>
 * character.
 */
public static final BitSet disallowed_rel_path = new BitSet(256);

static {
    // Compute (uric minus rel_path) on a scratch copy, then publish it.
    final BitSet outsideRelPath = (BitSet) uric.clone();
    outsideRelPath.andNot(rel_path);
    disallowed_rel_path.or(outsideRelPath);
}
1439
1440
/**
 * Characters disallowed in an opaque_part before escaping: every
 * <code>uric</code> character that is not also an <code>opaque_part</code>
 * character.
 */
public static final BitSet disallowed_opaque_part = new BitSet(256);

static {
    // Compute (uric minus opaque_part) on a scratch copy, then publish it.
    final BitSet outsideOpaquePart = (BitSet) uric.clone();
    outsideOpaquePart.andNot(opaque_part);
    disallowed_opaque_part.or(outsideOpaquePart);
}
1450
1451
1452
/**
 * Characters allowed for the authority component: the
 * <code>authority</code> set with the escape marker '%' removed.
 */
public static final BitSet allowed_authority = new BitSet(256);

static {
    final char escapeMarker = '%';
    allowed_authority.or(authority);
    allowed_authority.clear(escapeMarker);
}
1462
1463
/**
 * Characters allowed for the opaque_part: the <code>opaque_part</code>
 * set with the escape marker '%' removed.
 */
public static final BitSet allowed_opaque_part = new BitSet(256);

static {
    final char escapeMarker = '%';
    allowed_opaque_part.or(opaque_part);
    allowed_opaque_part.clear(escapeMarker);
}
1473
1474
/**
 * Characters allowed for the reg_name: the <code>reg_name</code> set with
 * the escape marker '%' removed. Passed to <code>encode</code> when an
 * unescaped registry-based authority is parsed.
 */
public static final BitSet allowed_reg_name = new BitSet(256);

static {
    final char escapeMarker = '%';
    allowed_reg_name.or(reg_name);
    allowed_reg_name.clear(escapeMarker);
}
1485
1486
/**
 * Characters allowed for the userinfo component: the
 * <code>userinfo</code> set with the escape marker '%' removed. Passed to
 * <code>encode</code> when unescaped userinfo is parsed.
 */
public static final BitSet allowed_userinfo = new BitSet(256);

static {
    final char escapeMarker = '%';
    allowed_userinfo.or(userinfo);
    allowed_userinfo.clear(escapeMarker);
}
1497
1498
/**
 * Characters allowed within one part of the userinfo component: the
 * <code>within_userinfo</code> set with the escape marker '%' removed.
 */
public static final BitSet allowed_within_userinfo = new BitSet(256);

static {
    final char escapeMarker = '%';
    allowed_within_userinfo.or(within_userinfo);
    allowed_within_userinfo.clear(escapeMarker);
}
1508
1509
/**
 * Characters allowed for the IPv6reference component: the
 * <code>IPv6reference</code> set with the enclosing brackets '[' and ']'
 * removed.
 */
public static final BitSet allowed_IPv6reference = new BitSet(256);

static {
    allowed_IPv6reference.or(IPv6reference);
    final String brackets = "[]";
    for (int i = 0; i < brackets.length(); i++) {
        allowed_IPv6reference.clear(brackets.charAt(i));
    }
}
1522
1523
/**
 * Characters allowed for the host component: the union of the
 * <code>hostname</code> set and <code>allowed_IPv6reference</code> (which
 * already excludes the brackets '[' and ']').
 */
public static final BitSet allowed_host = new BitSet(256);

static {
    allowed_host.or(allowed_IPv6reference);
    allowed_host.or(hostname);
}
1534
1535
/**
 * Characters allowed within the authority component: the union of the
 * <code>server</code> and <code>reg_name</code> sets with the characters
 * ';', ':', '@', '?' and '/' removed.
 */
public static final BitSet allowed_within_authority = new BitSet(256);

static {
    allowed_within_authority.or(reg_name);
    allowed_within_authority.or(server);
    // Removals must follow the unions above.
    final String separators = ";:@?/";
    for (int i = 0; i < separators.length(); i++) {
        allowed_within_authority.clear(separators.charAt(i));
    }
}
1550
1551
/**
 * Characters allowed for the abs_path: the <code>abs_path</code> set with
 * the members of <code>percent</code> and the '+' character removed.
 */
public static final BitSet allowed_abs_path = new BitSet(256);

static {
    allowed_abs_path.or(abs_path);
    // The two removals are independent of each other.
    allowed_abs_path.clear('+');
    allowed_abs_path.andNot(percent);
}
1563
1564
/**
 * Characters allowed for the rel_path: the <code>rel_path</code> set with
 * the characters '%' and '+' removed.
 */
public static final BitSet allowed_rel_path = new BitSet(256);

static {
    allowed_rel_path.or(rel_path);
    final String excluded = "%+";
    for (int i = 0; i < excluded.length(); i++) {
        allowed_rel_path.clear(excluded.charAt(i));
    }
}
1575
1576
/**
 * Characters allowed within the path: the <code>abs_path</code> set with
 * the characters '/', ';', '=' and '?' removed.
 */
public static final BitSet allowed_within_path = new BitSet(256);

static {
    allowed_within_path.or(abs_path);
    final String excluded = "/;=?";
    for (int i = 0; i < excluded.length(); i++) {
        allowed_within_path.clear(excluded.charAt(i));
    }
}
1589
1590
/**
 * Characters allowed for the query component: the <code>uric</code> set
 * with the escape marker '%' removed. Passed to <code>encode</code> when
 * an unescaped query is parsed.
 */
public static final BitSet allowed_query = new BitSet(256);

static {
    final char escapeMarker = '%';
    allowed_query.or(uric);
    allowed_query.clear(escapeMarker);
}
1600
1601
/**
 * Characters allowed within the query component: the
 * <code>allowed_query</code> set with every <code>reserved</code>
 * character removed.
 */
public static final BitSet allowed_within_query = new BitSet(256);

static {
    // Compute (allowed_query minus reserved) on a scratch copy, then publish it.
    final BitSet unreservedQuery = (BitSet) allowed_query.clone();
    unreservedQuery.andNot(reserved);
    allowed_within_query.or(unreservedQuery);
}
1611
1612
/**
 * Characters allowed for the fragment component: the <code>uric</code>
 * set with the escape marker '%' removed. Passed to <code>encode</code>
 * when an unescaped fragment is parsed.
 */
public static final BitSet allowed_fragment = new BitSet(256);

static {
    final char escapeMarker = '%';
    allowed_fragment.or(uric);
    allowed_fragment.clear(escapeMarker);
}
1622
1623
1624
1625
1626
1627
1628
// Classification flags recorded while a URI reference is parsed; each one
// is exposed through the matching is...() predicate of this class.

// Which form the scheme-specific part takes: hier_part or opaque_part.
protected boolean _is_hier_part;
protected boolean _is_opaque_part;


// Which kind of path was recognised: net_path, abs_path or rel_path.
// All three are reset at the start of parseUriReference.
protected boolean _is_net_path;
protected boolean _is_abs_path;
protected boolean _is_rel_path;


// Which kind of authority was recognised: reg_name or server.
// Both are reset at the start of parseAuthority.
protected boolean _is_reg_name;
protected boolean _is_server;


// Which kind of host a server-based authority carries.
// All three are reset at the start of parseAuthority.
protected boolean _is_hostname;
protected boolean _is_IPv4address;
protected boolean _is_IPv6reference;
1645
1646
1647
1648 /***
1649 * Encodes URI string.
1650 *
1651 * This is a two mapping, one from original characters to octets, and
1652 * subsequently a second from octets to URI characters:
1653 * <p><blockquote><pre>
1654 * original character sequence->octet sequence->URI character sequence
1655 * </pre></blockquote><p>
1656 *
1657 * An escaped octet is encoded as a character triplet, consisting of the
1658 * percent character "%" followed by the two hexadecimal digits
1659 * representing the octet code. For example, "%20" is the escaped
1660 * encoding for the US-ASCII space character.
1661 * <p>
1662 * Conversion from the local filesystem character set to UTF-8 will
1663 * normally involve a two step process. First convert the local character
1664 * set to the UCS; then convert the UCS to UTF-8.
1665 * The first step in the process can be performed by maintaining a mapping
1666 * table that includes the local character set code and the corresponding
1667 * UCS code.
1668 * The next step is to convert the UCS character code to the UTF-8 encoding.
1669 * <p>
1670 * Mapping between vendor codepages can be done in a very similar manner
1671 * as described above.
1672 * <p>
1673 * The only time escape encodings can allowedly be made is when a URI is
1674 * being created from its component parts. The escape and validate methods
1675 * are internally performed within this method.
1676 *
1677 * @param original the original character sequence
1678 * @param allowed those characters that are allowed within a component
1679 * @param charset the protocol charset
1680 * @return URI character sequence
1681 * @throws URIException null component or unsupported character encoding
1682 */
1683
1684 protected static char[] encode(String original, BitSet allowed,
1685 String charset) throws URIException {
1686 if (original == null) {
1687 throw new IllegalArgumentException("Original string may not be null");
1688 }
1689 if (allowed == null) {
1690 throw new IllegalArgumentException("Allowed bitset may not be null");
1691 }
1692 byte[] rawdata = URLCodec.encodeUrl(allowed, EncodingUtil.getBytes(original, charset));
1693 return EncodingUtil.getAsciiString(rawdata).toCharArray();
1694 }
1695
1696 /***
1697 * Decodes URI encoded string.
1698 *
1699 * This is a two mapping, one from URI characters to octets, and
1700 * subsequently a second from octets to original characters:
1701 * <p><blockquote><pre>
1702 * URI character sequence->octet sequence->original character sequence
1703 * </pre></blockquote><p>
1704 *
1705 * A URI must be separated into its components before the escaped
1706 * characters within those components can be allowedly decoded.
1707 * <p>
1708 * Notice that there is a chance that URI characters that are non UTF-8
1709 * may be parsed as valid UTF-8. A recent non-scientific analysis found
1710 * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1711 * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1712 * false reading.
1713 * <p>
1714 * The percent "%" character always has the reserved purpose of being
1715 * the escape indicator, it must be escaped as "%25" in order to be used
1716 * as data within a URI.
1717 * <p>
1718 * The unescape method is internally performed within this method.
1719 *
1720 * @param component the URI character sequence
1721 * @param charset the protocol charset
1722 * @return original character sequence
1723 * @throws URIException incomplete trailing escape pattern or unsupported
1724 * character encoding
1725 */
1726 protected static String decode(char[] component, String charset)
1727 throws URIException {
1728 if (component == null) {
1729 throw new IllegalArgumentException("Component array of chars may not be null");
1730 }
1731 return decode(new String(component), charset);
1732 }
1733
1734 /***
1735 * Decodes URI encoded string.
1736 *
1737 * This is a two mapping, one from URI characters to octets, and
1738 * subsequently a second from octets to original characters:
1739 * <p><blockquote><pre>
1740 * URI character sequence->octet sequence->original character sequence
1741 * </pre></blockquote><p>
1742 *
1743 * A URI must be separated into its components before the escaped
1744 * characters within those components can be allowedly decoded.
1745 * <p>
1746 * Notice that there is a chance that URI characters that are non UTF-8
1747 * may be parsed as valid UTF-8. A recent non-scientific analysis found
1748 * that EUC encoded Japanese words had a 2.7% false reading; SJIS had a
1749 * 0.0005% false reading; other encoding such as ASCII or KOI-8 have a 0%
1750 * false reading.
1751 * <p>
1752 * The percent "%" character always has the reserved purpose of being
1753 * the escape indicator, it must be escaped as "%25" in order to be used
1754 * as data within a URI.
1755 * <p>
1756 * The unescape method is internally performed within this method.
1757 *
1758 * @param component the URI character sequence
1759 * @param charset the protocol charset
1760 * @return original character sequence
1761 * @throws URIException incomplete trailing escape pattern or unsupported
1762 * character encoding
1763 *
1764 * @since 3.0
1765 */
1766 protected static String decode(String component, String charset)
1767 throws URIException {
1768 if (component == null) {
1769 throw new IllegalArgumentException("Component array of chars may not be null");
1770 }
1771 byte[] rawdata = null;
1772 try {
1773 rawdata = URLCodec.decodeUrl(EncodingUtil.getAsciiBytes(component));
1774 } catch (DecoderException e) {
1775 throw new URIException(e.getMessage());
1776 }
1777 return EncodingUtil.getString(rawdata, charset);
1778 }
1779 /***
1780 * Pre-validate the unescaped URI string within a specific component.
1781 *
1782 * @param component the component string within the component
1783 * @param disallowed those characters disallowed within the component
1784 * @return if true, it doesn't have the disallowed characters
1785 * if false, the component is undefined or an incorrect one
1786 */
1787 protected boolean prevalidate(String component, BitSet disallowed) {
1788
1789 if (component == null) {
1790 return false;
1791 }
1792 char[] target = component.toCharArray();
1793 for (int i = 0; i < target.length; i++) {
1794 if (disallowed.get(target[i])) {
1795 return false;
1796 }
1797 }
1798 return true;
1799 }
1800
1801
1802 /***
1803 * Validate the URI characters within a specific component.
1804 * The component must be performed after escape encoding. Or it doesn't
1805 * include escaped characters.
1806 *
1807 * @param component the characters sequence within the component
1808 * @param generous those characters that are allowed within a component
1809 * @return if true, it's the correct URI character sequence
1810 */
1811 protected boolean validate(char[] component, BitSet generous) {
1812
1813 return validate(component, 0, -1, generous);
1814 }
1815
1816
1817 /***
1818 * Validate the URI characters within a specific component.
1819 * The component must be performed after escape encoding. Or it doesn't
1820 * include escaped characters.
1821 * <p>
1822 * It's not that much strict, generous. The strict validation might be
1823 * performed before being called this method.
1824 *
1825 * @param component the characters sequence within the component
1826 * @param soffset the starting offset of the given component
1827 * @param eoffset the ending offset of the given component
1828 * if -1, it means the length of the component
1829 * @param generous those characters that are allowed within a component
1830 * @return if true, it's the correct URI character sequence
1831 */
1832 protected boolean validate(char[] component, int soffset, int eoffset,
1833 BitSet generous) {
1834
1835 if (eoffset == -1) {
1836 eoffset = component.length - 1;
1837 }
1838 for (int i = soffset; i <= eoffset; i++) {
1839 if (!generous.get(component[i])) {
1840 return false;
1841 }
1842 }
1843 return true;
1844 }
1845
1846
1847 /***
1848 * In order to avoid any possilbity of conflict with non-ASCII characters,
1849 * Parse a URI reference as a <code>String</code> with the character
1850 * encoding of the local system or the document.
1851 * <p>
1852 * The following line is the regular expression for breaking-down a URI
1853 * reference into its components.
1854 * <p><blockquote><pre>
1855 * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
1856 * 12 3 4 5 6 7 8 9
1857 * </pre></blockquote><p>
1858 * For example, matching the above expression to
1859 * http://jakarta.apache.org/ietf/uri/#Related
1860 * results in the following subexpression matches:
1861 * <p><blockquote><pre>
1862 * $1 = http:
1863 * scheme = $2 = http
1864 * $3 = //jakarta.apache.org
1865 * authority = $4 = jakarta.apache.org
1866 * path = $5 = /ietf/uri/
1867 * $6 = <undefined>
1868 * query = $7 = <undefined>
1869 * $8 = #Related
1870 * fragment = $9 = Related
1871 * </pre></blockquote><p>
1872 *
1873 * @param original the original character sequence
1874 * @param escaped <code>true</code> if <code>original</code> is escaped
1875 * @throws URIException If an error occurs.
1876 */
1877 protected void parseUriReference(String original, boolean escaped)
1878 throws URIException {
1879
1880
1881 if (original == null) {
1882 throw new URIException("URI-Reference required");
1883 }
1884
1885
1886
1887
1888 String tmp = original.trim();
1889
1890
1891
1892
1893
1894 int length = tmp.length();
1895
1896
1897
1898
1899 if (length > 0) {
1900 char[] firstDelimiter = { tmp.charAt(0) };
1901 if (validate(firstDelimiter, delims)) {
1902 if (length >= 2) {
1903 char[] lastDelimiter = { tmp.charAt(length - 1) };
1904 if (validate(lastDelimiter, delims)) {
1905 tmp = tmp.substring(1, length - 1);
1906 length = length - 2;
1907 }
1908 }
1909 }
1910 }
1911
1912
1913
1914
1915 int from = 0;
1916
1917
1918
1919
1920 boolean isStartedFromPath = false;
1921 int atColon = tmp.indexOf(':');
1922 int atSlash = tmp.indexOf('/');
1923 if ((atColon <= 0 && !tmp.startsWith("//"))
1924 || (atSlash >= 0 && atSlash < atColon)) {
1925 isStartedFromPath = true;
1926 }
1927
1928
1929
1930
1931
1932
1933
1934 int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from);
1935 if (at == -1) {
1936 at = 0;
1937 }
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947 if (at > 0 && at < length && tmp.charAt(at) == ':') {
1948 char[] target = tmp.substring(0, at).toLowerCase().toCharArray();
1949 if (validate(target, scheme)) {
1950 _scheme = target;
1951 } else {
1952 throw new URIException("incorrect scheme");
1953 }
1954 from = ++at;
1955 }
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966 _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false;
1967 if (0 <= at && at < length && tmp.charAt(at) == '/') {
1968
1969 _is_hier_part = true;
1970 if (at + 2 < length && tmp.charAt(at + 1) == '/'
1971 && !isStartedFromPath) {
1972
1973 int next = indexFirstOf(tmp, "/?#", at + 2);
1974 if (next == -1) {
1975 next = (tmp.substring(at + 2).length() == 0) ? at + 2
1976 : tmp.length();
1977 }
1978 parseAuthority(tmp.substring(at + 2, next), escaped);
1979 from = at = next;
1980
1981 _is_net_path = true;
1982 }
1983 if (from == at) {
1984
1985 _is_abs_path = true;
1986 }
1987 }
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997 if (from < length) {
1998
1999 int next = indexFirstOf(tmp, "?#", from);
2000 if (next == -1) {
2001 next = tmp.length();
2002 }
2003 if (!_is_abs_path) {
2004 if (!escaped
2005 && prevalidate(tmp.substring(from, next), disallowed_rel_path)
2006 || escaped
2007 && validate(tmp.substring(from, next).toCharArray(), rel_path)) {
2008
2009 _is_rel_path = true;
2010 } else if (!escaped
2011 && prevalidate(tmp.substring(from, next), disallowed_opaque_part)
2012 || escaped
2013 && validate(tmp.substring(from, next).toCharArray(), opaque_part)) {
2014
2015 _is_opaque_part = true;
2016 } else {
2017
2018 _path = null;
2019 }
2020 }
2021 String s = tmp.substring(from, next);
2022 if (escaped) {
2023 setRawPath(s.toCharArray());
2024 } else {
2025 setPath(s);
2026 }
2027 at = next;
2028 }
2029
2030
2031 String charset = getProtocolCharset();
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041 if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') {
2042 int next = tmp.indexOf('#', at + 1);
2043 if (next == -1) {
2044 next = tmp.length();
2045 }
2046 if (escaped) {
2047 _query = tmp.substring(at + 1, next).toCharArray();
2048 if (!validate(_query, uric)) {
2049 throw new URIException("Invalid query");
2050 }
2051 } else {
2052 _query = encode(tmp.substring(at + 1, next), allowed_query, charset);
2053 }
2054 at = next;
2055 }
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065 if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') {
2066 if (at + 1 == length) {
2067 _fragment = "".toCharArray();
2068 } else {
2069 _fragment = (escaped) ? tmp.substring(at + 1).toCharArray()
2070 : encode(tmp.substring(at + 1), allowed_fragment, charset);
2071 }
2072 }
2073
2074
2075 setURI();
2076 }
2077
2078
2079 /***
2080 * Get the earlier index that to be searched for the first occurrance in
2081 * one of any of the given string.
2082 *
2083 * @param s the string to be indexed
2084 * @param delims the delimiters used to index
2085 * @return the earlier index if there are delimiters
2086 */
2087 protected int indexFirstOf(String s, String delims) {
2088 return indexFirstOf(s, delims, -1);
2089 }
2090
2091
2092 /***
2093 * Get the earlier index that to be searched for the first occurrance in
2094 * one of any of the given string.
2095 *
2096 * @param s the string to be indexed
2097 * @param delims the delimiters used to index
2098 * @param offset the from index
2099 * @return the earlier index if there are delimiters
2100 */
2101 protected int indexFirstOf(String s, String delims, int offset) {
2102 if (s == null || s.length() == 0) {
2103 return -1;
2104 }
2105 if (delims == null || delims.length() == 0) {
2106 return -1;
2107 }
2108
2109 if (offset < 0) {
2110 offset = 0;
2111 } else if (offset > s.length()) {
2112 return -1;
2113 }
2114
2115 int min = s.length();
2116 char[] delim = delims.toCharArray();
2117 for (int i = 0; i < delim.length; i++) {
2118 int at = s.indexOf(delim[i], offset);
2119 if (at >= 0 && at < min) {
2120 min = at;
2121 }
2122 }
2123 return (min == s.length()) ? -1 : min;
2124 }
2125
2126
2127 /***
2128 * Get the earlier index that to be searched for the first occurrance in
2129 * one of any of the given array.
2130 *
2131 * @param s the character array to be indexed
2132 * @param delim the delimiter used to index
2133 * @return the ealier index if there are a delimiter
2134 */
2135 protected int indexFirstOf(char[] s, char delim) {
2136 return indexFirstOf(s, delim, 0);
2137 }
2138
2139
2140 /***
2141 * Get the earlier index that to be searched for the first occurrance in
2142 * one of any of the given array.
2143 *
2144 * @param s the character array to be indexed
2145 * @param delim the delimiter used to index
2146 * @param offset The offset.
2147 * @return the ealier index if there is a delimiter
2148 */
2149 protected int indexFirstOf(char[] s, char delim, int offset) {
2150 if (s == null || s.length == 0) {
2151 return -1;
2152 }
2153
2154 if (offset < 0) {
2155 offset = 0;
2156 } else if (offset > s.length) {
2157 return -1;
2158 }
2159 for (int i = offset; i < s.length; i++) {
2160 if (s[i] == delim) {
2161 return i;
2162 }
2163 }
2164 return -1;
2165 }
2166
2167
2168 /***
2169 * Parse the authority component.
2170 *
2171 * @param original the original character sequence of authority component
2172 * @param escaped <code>true</code> if <code>original</code> is escaped
2173 * @throws URIException If an error occurs.
2174 */
2175 protected void parseAuthority(String original, boolean escaped)
2176 throws URIException {
2177
2178
2179 _is_reg_name = _is_server =
2180 _is_hostname = _is_IPv4address = _is_IPv6reference = false;
2181
2182
2183 String charset = getProtocolCharset();
2184
2185 boolean hasPort = true;
2186 int from = 0;
2187 int next = original.indexOf('@');
2188 if (next != -1) {
2189
2190 _userinfo = (escaped) ? original.substring(0, next).toCharArray()
2191 : encode(original.substring(0, next), allowed_userinfo,
2192 charset);
2193 from = next + 1;
2194 }
2195 next = original.indexOf('[', from);
2196 if (next >= from) {
2197 next = original.indexOf(']', from);
2198 if (next == -1) {
2199 throw new URIException(URIException.PARSING, "IPv6reference");
2200 } else {
2201 next++;
2202 }
2203
2204 _host = (escaped) ? original.substring(from, next).toCharArray()
2205 : encode(original.substring(from, next), allowed_IPv6reference,
2206 charset);
2207
2208 _is_IPv6reference = true;
2209 } else {
2210 next = original.indexOf(':', from);
2211 if (next == -1) {
2212 next = original.length();
2213 hasPort = false;
2214 }
2215
2216 _host = original.substring(from, next).toCharArray();
2217 if (validate(_host, IPv4address)) {
2218
2219 _is_IPv4address = true;
2220 } else if (validate(_host, hostname)) {
2221
2222 _is_hostname = true;
2223 } else {
2224
2225 _is_reg_name = true;
2226 }
2227 }
2228 if (_is_reg_name) {
2229
2230 _is_server = _is_hostname = _is_IPv4address =
2231 _is_IPv6reference = false;
2232
2233 if (escaped) {
2234 _authority = original.toCharArray();
2235 if (!validate(_authority, reg_name)) {
2236 throw new URIException("Invalid authority");
2237 }
2238 } else {
2239 _authority = encode(original, allowed_reg_name, charset);
2240 }
2241 } else {
2242 if (original.length() - 1 > next && hasPort
2243 && original.charAt(next) == ':') {
2244 from = next + 1;
2245 try {
2246 _port = Integer.parseInt(original.substring(from));
2247 } catch (NumberFormatException error) {
2248 throw new URIException(URIException.PARSING,
2249 "invalid port number");
2250 }
2251 }
2252
2253 StringBuffer buf = new StringBuffer();
2254 if (_userinfo != null) {
2255 buf.append(_userinfo);
2256 buf.append('@');
2257 }
2258 if (_host != null) {
2259 buf.append(_host);
2260 if (_port != -1) {
2261 buf.append(':');
2262 buf.append(_port);
2263 }
2264 }
2265 _authority = buf.toString().toCharArray();
2266
2267 _is_server = true;
2268 }
2269 }
2270
2271
2272 /***
2273 * Once it's parsed successfully, set this URI.
2274 *
2275 * @see #getRawURI
2276 */
2277 protected void setURI() {
2278
2279 StringBuffer buf = new StringBuffer();
2280
2281 if (_scheme != null) {
2282 buf.append(_scheme);
2283 buf.append(':');
2284 }
2285 if (_is_net_path) {
2286 buf.append("//");
2287 if (_authority != null) {
2288 buf.append(_authority);
2289 }
2290 }
2291 if (_opaque != null && _is_opaque_part) {
2292 buf.append(_opaque);
2293 } else if (_path != null) {
2294
2295 if (_path.length != 0) {
2296 buf.append(_path);
2297 }
2298 }
2299 if (_query != null) {
2300 buf.append('?');
2301 buf.append(_query);
2302 }
2303
2304 _uri = buf.toString().toCharArray();
2305 hash = 0;
2306 }
2307
2308
2309
2310
2311 /***
2312 * Tell whether or not this URI is absolute.
2313 *
2314 * @return true iif this URI is absoluteURI
2315 */
2316 public boolean isAbsoluteURI() {
2317 return (_scheme != null);
2318 }
2319
2320
2321 /***
2322 * Tell whether or not this URI is relative.
2323 *
2324 * @return true iif this URI is relativeURI
2325 */
2326 public boolean isRelativeURI() {
2327 return (_scheme == null);
2328 }
2329
2330
2331 /***
2332 * Tell whether or not the absoluteURI of this URI is hier_part.
2333 *
2334 * @return true iif the absoluteURI is hier_part
2335 */
2336 public boolean isHierPart() {
2337 return _is_hier_part;
2338 }
2339
2340
2341 /***
2342 * Tell whether or not the absoluteURI of this URI is opaque_part.
2343 *
2344 * @return true iif the absoluteURI is opaque_part
2345 */
2346 public boolean isOpaquePart() {
2347 return _is_opaque_part;
2348 }
2349
2350
2351 /***
2352 * Tell whether or not the relativeURI or heir_part of this URI is net_path.
2353 * It's the same function as the has_authority() method.
2354 *
2355 * @return true iif the relativeURI or heir_part is net_path
2356 * @see #hasAuthority
2357 */
2358 public boolean isNetPath() {
2359 return _is_net_path || (_authority != null);
2360 }
2361
2362
2363 /***
2364 * Tell whether or not the relativeURI or hier_part of this URI is abs_path.
2365 *
2366 * @return true iif the relativeURI or hier_part is abs_path
2367 */
2368 public boolean isAbsPath() {
2369 return _is_abs_path;
2370 }
2371
2372
2373 /***
2374 * Tell whether or not the relativeURI of this URI is rel_path.
2375 *
2376 * @return true iif the relativeURI is rel_path
2377 */
2378 public boolean isRelPath() {
2379 return _is_rel_path;
2380 }
2381
2382
2383 /***
2384 * Tell whether or not this URI has authority.
2385 * It's the same function as the is_net_path() method.
2386 *
2387 * @return true iif this URI has authority
2388 * @see #isNetPath
2389 */
2390 public boolean hasAuthority() {
2391 return (_authority != null) || _is_net_path;
2392 }
2393
2394 /***
2395 * Tell whether or not the authority component of this URI is reg_name.
2396 *
2397 * @return true iif the authority component is reg_name
2398 */
2399 public boolean isRegName() {
2400 return _is_reg_name;
2401 }
2402
2403
2404 /***
2405 * Tell whether or not the authority component of this URI is server.
2406 *
2407 * @return true iif the authority component is server
2408 */
2409 public boolean isServer() {
2410 return _is_server;
2411 }
2412
2413
2414 /***
2415 * Tell whether or not this URI has userinfo.
2416 *
2417 * @return true iif this URI has userinfo
2418 */
2419 public boolean hasUserinfo() {
2420 return (_userinfo != null);
2421 }
2422
2423
2424 /***
2425 * Tell whether or not the host part of this URI is hostname.
2426 *
2427 * @return true iif the host part is hostname
2428 */
2429 public boolean isHostname() {
2430 return _is_hostname;
2431 }
2432
2433
2434 /***
2435 * Tell whether or not the host part of this URI is IPv4address.
2436 *
2437 * @return true iif the host part is IPv4address
2438 */
2439 public boolean isIPv4address() {
2440 return _is_IPv4address;
2441 }
2442
2443
2444 /***
2445 * Tell whether or not the host part of this URI is IPv6reference.
2446 *
2447 * @return true iif the host part is IPv6reference
2448 */
2449 public boolean isIPv6reference() {
2450 return _is_IPv6reference;
2451 }
2452
2453
2454 /***
2455 * Tell whether or not this URI has query.
2456 *
2457 * @return true iif this URI has query
2458 */
2459 public boolean hasQuery() {
2460 return (_query != null);
2461 }
2462
2463
2464 /***
2465 * Tell whether or not this URI has fragment.
2466 *
2467 * @return true iif this URI has fragment
2468 */
2469 public boolean hasFragment() {
2470 return (_fragment != null);
2471 }
2472
2473
2474
2475
2476
2477 /***
2478 * Set the default charset of the protocol.
2479 * <p>
2480 * The character set used to store files SHALL remain a local decision and
2481 * MAY depend on the capability of local operating systems. Prior to the
2482 * exchange of URIs they SHOULD be converted into a ISO/IEC 10646 format
2483 * and UTF-8 encoded. This approach, while allowing international exchange
2484 * of URIs, will still allow backward compatibility with older systems
2485 * because the code set positions for ASCII characters are identical to the
2486 * one byte sequence in UTF-8.
2487 * <p>
2488 * An individual URI scheme may require a single charset, define a default
2489 * charset, or provide a way to indicate the charset used.
2490 *
2491 * <p>
2492 * Always all the time, the setter method is always succeeded and throws
2493 * <code>DefaultCharsetChanged</code> exception.
2494 *
2495 * So API programmer must follow the following way:
2496 * <code><pre>
2497 * import org.apache.util.URI$DefaultCharsetChanged;
2498 * .
2499 * .
2500 * .
2501 * try {
2502 * URI.setDefaultProtocolCharset("UTF-8");
2503 * } catch (DefaultCharsetChanged cc) {
2504 * // CASE 1: the exception could be ignored, when it is set by user
2505 * if (cc.getReasonCode() == DefaultCharsetChanged.PROTOCOL_CHARSET) {
2506 * // CASE 2: let user know the default protocol charset changed
2507 * } else {
2508 * // CASE 2: let user know the default document charset changed
2509 * }
2510 * }
2511 * </pre></code>
2512 *
2513 * The API programmer is responsible to set the correct charset.
2514 * And each application should remember its own charset to support.
2515 *
2516 * @param charset the default charset for each protocol
2517 * @throws DefaultCharsetChanged default charset changed
2518 */
2519 public static void setDefaultProtocolCharset(String charset)
2520 throws DefaultCharsetChanged {
2521
2522 defaultProtocolCharset = charset;
2523 throw new DefaultCharsetChanged(DefaultCharsetChanged.PROTOCOL_CHARSET,
2524 "the default protocol charset changed");
2525 }
2526
2527
2528 /***
2529 * Get the default charset of the protocol.
2530 * <p>
2531 * An individual URI scheme may require a single charset, define a default
2532 * charset, or provide a way to indicate the charset used.
2533 * <p>
2534 * To work globally either requires support of a number of character sets
2535 * and to be able to convert between them, or the use of a single preferred
2536 * character set.
2537 * For support of global compatibility it is STRONGLY RECOMMENDED that
2538 * clients and servers use UTF-8 encoding when exchanging URIs.
2539 *
2540 * @return the default charset string
2541 */
2542 public static String getDefaultProtocolCharset() {
2543 return defaultProtocolCharset;
2544 }
2545
2546
2547 /***
2548 * Get the protocol charset used by this current URI instance.
2549 * It was set by the constructor for this instance. If it was not set by
2550 * contructor, it will return the default protocol charset.
2551 *
2552 * @return the protocol charset string
2553 * @see #getDefaultProtocolCharset
2554 */
2555 public String getProtocolCharset() {
2556 return (protocolCharset != null)
2557 ? protocolCharset
2558 : defaultProtocolCharset;
2559 }
2560
2561
2562 /***
2563 * Set the default charset of the document.
2564 * <p>
2565 * Notice that it will be possible to contain mixed characters (e.g.
2566 * ftp://host/KoreanNamespace/ChineseResource). To handle the Bi-directional
2567 * display of these character sets, the protocol charset could be simply
2568 * used again. Because it's not yet implemented that the insertion of BIDI
2569 * control characters at different points during composition is extracted.
2570 * <p>
2571 *
2572 * Always all the time, the setter method is always succeeded and throws
2573 * <code>DefaultCharsetChanged</code> exception.
2574 *
2575 * So API programmer must follow the following way:
2576 * <code><pre>
2577 * import org.apache.util.URI$DefaultCharsetChanged;
2578 * .
2579 * .
2580 * .
2581 * try {
2582 * URI.setDefaultDocumentCharset("EUC-KR");
2583 * } catch (DefaultCharsetChanged cc) {
2584 * // CASE 1: the exception could be ignored, when it is set by user
2585 * if (cc.getReasonCode() == DefaultCharsetChanged.DOCUMENT_CHARSET) {
2586 * // CASE 2: let user know the default document charset changed
2587 * } else {
2588 * // CASE 2: let user know the default protocol charset changed
2589 * }
2590 * }
2591 * </pre></code>
2592 *
2593 * The API programmer is responsible to set the correct charset.
2594 * And each application should remember its own charset to support.
2595 *
2596 * @param charset the default charset for the document
2597 * @throws DefaultCharsetChanged default charset changed
2598 */
2599 public static void setDefaultDocumentCharset(String charset)
2600 throws DefaultCharsetChanged {
2601
2602 defaultDocumentCharset = charset;
2603 throw new DefaultCharsetChanged(DefaultCharsetChanged.DOCUMENT_CHARSET,
2604 "the default document charset changed");
2605 }
2606
2607
2608 /***
2609 * Get the recommended default charset of the document.
2610 *
2611 * @return the default charset string
2612 */
2613 public static String getDefaultDocumentCharset() {
2614 return defaultDocumentCharset;
2615 }
2616
2617
2618 /***
2619 * Get the default charset of the document by locale.
2620 *
2621 * @return the default charset string by locale
2622 */
2623 public static String getDefaultDocumentCharsetByLocale() {
2624 return defaultDocumentCharsetByLocale;
2625 }
2626
2627
2628 /***
2629 * Get the default charset of the document by platform.
2630 *
2631 * @return the default charset string by platform
2632 */
2633 public static String getDefaultDocumentCharsetByPlatform() {
2634 return defaultDocumentCharsetByPlatform;
2635 }
2636
2637
2638
2639 /***
2640 * Get the scheme.
2641 *
2642 * @return the scheme
2643 */
2644 public char[] getRawScheme() {
2645 return _scheme;
2646 }
2647
2648
2649 /***
2650 * Get the scheme.
2651 *
2652 * @return the scheme
2653 * null if undefined scheme
2654 */
2655 public String getScheme() {
2656 return (_scheme == null) ? null : new String(_scheme);
2657 }
2658
2659
2660
2661 /***
2662 * Set the authority. It can be one type of server, hostport, hostname,
2663 * IPv4address, IPv6reference and reg_name.
2664 * <p><blockquote><pre>
2665 * authority = server | reg_name
2666 * </pre></blockquote><p>
2667 *
2668 * @param escapedAuthority the raw escaped authority
2669 * @throws URIException If {@link
2670 * #parseAuthority(java.lang.String,boolean)} fails
2671 * @throws NullPointerException null authority
2672 */
2673 public void setRawAuthority(char[] escapedAuthority)
2674 throws URIException, NullPointerException {
2675
2676 parseAuthority(new String(escapedAuthority), true);
2677 setURI();
2678 }
2679
2680
2681 /***
2682 * Set the authority. It can be one type of server, hostport, hostname,
2683 * IPv4address, IPv6reference and reg_name.
2684 * Note that there is no setAuthority method by the escape encoding reason.
2685 *
2686 * @param escapedAuthority the escaped authority string
2687 * @throws URIException If {@link
2688 * #parseAuthority(java.lang.String,boolean)} fails
2689 */
2690 public void setEscapedAuthority(String escapedAuthority)
2691 throws URIException {
2692
2693 parseAuthority(escapedAuthority, true);
2694 setURI();
2695 }
2696
2697
2698 /***
2699 * Get the raw-escaped authority.
2700 *
2701 * @return the raw-escaped authority
2702 */
2703 public char[] getRawAuthority() {
2704 return _authority;
2705 }
2706
2707
2708 /***
2709 * Get the escaped authority.
2710 *
2711 * @return the escaped authority
2712 */
2713 public String getEscapedAuthority() {
2714 return (_authority == null) ? null : new String(_authority);
2715 }
2716
2717
2718 /***
2719 * Get the authority.
2720 *
2721 * @return the authority
2722 * @throws URIException If {@link #decode} fails
2723 */
2724 public String getAuthority() throws URIException {
2725 return (_authority == null) ? null : decode(_authority,
2726 getProtocolCharset());
2727 }
2728
2729
2730
2731 /***
2732 * Get the raw-escaped userinfo.
2733 *
2734 * @return the raw-escaped userinfo
2735 * @see #getAuthority
2736 */
2737 public char[] getRawUserinfo() {
2738 return _userinfo;
2739 }
2740
2741
2742 /***
2743 * Get the escaped userinfo.
2744 *
2745 * @return the escaped userinfo
2746 * @see #getAuthority
2747 */
2748 public String getEscapedUserinfo() {
2749 return (_userinfo == null) ? null : new String(_userinfo);
2750 }
2751
2752
2753 /***
2754 * Get the userinfo.
2755 *
2756 * @return the userinfo
2757 * @throws URIException If {@link #decode} fails
2758 * @see #getAuthority
2759 */
2760 public String getUserinfo() throws URIException {
2761 return (_userinfo == null) ? null : decode(_userinfo,
2762 getProtocolCharset());
2763 }
2764
2765
2766
2767 /***
2768 * Get the host.
2769 * <p><blockquote><pre>
2770 * host = hostname | IPv4address | IPv6reference
2771 * </pre></blockquote><p>
2772 *
2773 * @return the host
2774 * @see #getAuthority
2775 */
2776 public char[] getRawHost() {
2777 return _host;
2778 }
2779
2780
2781 /***
2782 * Get the host.
2783 * <p><blockquote><pre>
2784 * host = hostname | IPv4address | IPv6reference
2785 * </pre></blockquote><p>
2786 *
2787 * @return the host
2788 * @throws URIException If {@link #decode} fails
2789 * @see #getAuthority
2790 */
2791 public String getHost() throws URIException {
2792 if (_host != null) {
2793 return decode(_host, getProtocolCharset());
2794 } else {
2795 return null;
2796 }
2797 }
2798
2799
2800
2801 /***
2802 * Get the port. In order to get the specfic default port, the specific
2803 * protocol-supported class extended from the URI class should be used.
2804 * It has the server-based naming authority.
2805 *
2806 * @return the port
2807 * if -1, it has the default port for the scheme or the server-based
2808 * naming authority is not supported in the specific URI.
2809 */
2810 public int getPort() {
2811 return _port;
2812 }
2813
2814
2815
2816 /***
2817 * Set the raw-escaped path.
2818 *
2819 * @param escapedPath the path character sequence
2820 * @throws URIException encoding error or not proper for initial instance
2821 * @see #encode
2822 */
2823 public void setRawPath(char[] escapedPath) throws URIException {
2824 if (escapedPath == null || escapedPath.length == 0) {
2825 _path = _opaque = escapedPath;
2826 setURI();
2827 return;
2828 }
2829
2830 escapedPath = removeFragmentIdentifier(escapedPath);
2831 if (_is_net_path || _is_abs_path) {
2832 if (escapedPath[0] != '/') {
2833 throw new URIException(URIException.PARSING,
2834 "not absolute path");
2835 }
2836 if (!validate(escapedPath, abs_path)) {
2837 throw new URIException(URIException.ESCAPING,
2838 "escaped absolute path not valid");
2839 }
2840 _path = escapedPath;
2841 } else if (_is_rel_path) {
2842 int at = indexFirstOf(escapedPath, '/');
2843 if (at == 0) {
2844 throw new URIException(URIException.PARSING, "incorrect path");
2845 }
2846 if (at > 0 && !validate(escapedPath, 0, at - 1, rel_segment)
2847 && !validate(escapedPath, at, -1, abs_path)
2848 || at < 0 && !validate(escapedPath, 0, -1, rel_segment)) {
2849
2850 throw new URIException(URIException.ESCAPING,
2851 "escaped relative path not valid");
2852 }
2853 _path = escapedPath;
2854 } else if (_is_opaque_part) {
2855 if (!uric_no_slash.get(escapedPath[0])
2856 && !validate(escapedPath, 1, -1, uric)) {
2857 throw new URIException(URIException.ESCAPING,
2858 "escaped opaque part not valid");
2859 }
2860 _opaque = escapedPath;
2861 } else {
2862 throw new URIException(URIException.PARSING, "incorrect path");
2863 }
2864 setURI();
2865 }
2866
2867
2868 /***
2869 * Set the escaped path.
2870 *
2871 * @param escapedPath the escaped path string
2872 * @throws URIException encoding error or not proper for initial instance
2873 * @see #encode
2874 */
2875 public void setEscapedPath(String escapedPath) throws URIException {
2876 if (escapedPath == null) {
2877 _path = _opaque = null;
2878 setURI();
2879 return;
2880 }
2881 setRawPath(escapedPath.toCharArray());
2882 }
2883
2884
2885 /***
2886 * Set the path.
2887 *
2888 * @param path the path string
2889 * @throws URIException set incorrectly or fragment only
2890 * @see #encode
2891 */
2892 public void setPath(String path) throws URIException {
2893
2894 if (path == null || path.length() == 0) {
2895 _path = _opaque = (path == null) ? null : path.toCharArray();
2896 setURI();
2897 return;
2898 }
2899
2900 String charset = getProtocolCharset();
2901
2902 if (_is_net_path || _is_abs_path) {
2903 _path = encode(path, allowed_abs_path, charset);
2904 } else if (_is_rel_path) {
2905 StringBuffer buff = new StringBuffer(path.length());
2906 int at = path.indexOf('/');
2907 if (at == 0) {
2908 throw new URIException(URIException.PARSING,
2909 "incorrect relative path");
2910 }
2911 if (at > 0) {
2912 buff.append(encode(path.substring(0, at), allowed_rel_path,
2913 charset));
2914 buff.append(encode(path.substring(at), allowed_abs_path,
2915 charset));
2916 } else {
2917 buff.append(encode(path, allowed_rel_path, charset));
2918 }
2919 _path = buff.toString().toCharArray();
2920 } else if (_is_opaque_part) {
2921 StringBuffer buf = new StringBuffer();
2922 buf.insert(0, encode(path.substring(0, 1), uric_no_slash, charset));
2923 buf.insert(1, encode(path.substring(1), uric, charset));
2924 _opaque = buf.toString().toCharArray();
2925 } else {
2926 throw new URIException(URIException.PARSING, "incorrect path");
2927 }
2928 setURI();
2929 }
2930
2931
2932 /***
2933 * Resolve the base and relative path.
2934 *
2935 * @param basePath a character array of the basePath
2936 * @param relPath a character array of the relPath
2937 * @return the resolved path
2938 * @throws URIException no more higher path level to be resolved
2939 */
2940 protected char[] resolvePath(char[] basePath, char[] relPath)
2941 throws URIException {
2942
2943
2944 String base = (basePath == null) ? "" : new String(basePath);
2945 int at = base.lastIndexOf('/');
2946 if (at != -1) {
2947 basePath = base.substring(0, at + 1).toCharArray();
2948 }
2949
2950 if (relPath == null || relPath.length == 0) {
2951 return normalize(basePath);
2952 } else if (relPath[0] == '/') {
2953 return normalize(relPath);
2954 } else {
2955 StringBuffer buff = new StringBuffer(base.length()
2956 + relPath.length);
2957 buff.append((at != -1) ? base.substring(0, at + 1) : "/");
2958 buff.append(relPath);
2959 return normalize(buff.toString().toCharArray());
2960 }
2961 }
2962
2963
2964 /***
2965 * Get the raw-escaped current hierarchy level in the given path.
2966 * If the last namespace is a collection, the slash mark ('/') should be
2967 * ended with at the last character of the path string.
2968 *
2969 * @param path the path
2970 * @return the current hierarchy level
2971 * @throws URIException no hierarchy level
2972 */
2973 protected char[] getRawCurrentHierPath(char[] path) throws URIException {
2974
2975 if (_is_opaque_part) {
2976 throw new URIException(URIException.PARSING, "no hierarchy level");
2977 }
2978 if (path == null) {
2979 throw new URIException(URIException.PARSING, "empty path");
2980 }
2981 String buff = new String(path);
2982 int first = buff.indexOf('/');
2983 int last = buff.lastIndexOf('/');
2984 if (last == 0) {
2985 return rootPath;
2986 } else if (first != last && last != -1) {
2987 return buff.substring(0, last).toCharArray();
2988 }
2989
2990 return path;
2991 }
2992
2993
2994 /***
2995 * Get the raw-escaped current hierarchy level.
2996 *
2997 * @return the raw-escaped current hierarchy level
2998 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
2999 */
3000 public char[] getRawCurrentHierPath() throws URIException {
3001 return (_path == null) ? null : getRawCurrentHierPath(_path);
3002 }
3003
3004
3005 /***
3006 * Get the escaped current hierarchy level.
3007 *
3008 * @return the escaped current hierarchy level
3009 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3010 */
3011 public String getEscapedCurrentHierPath() throws URIException {
3012 char[] path = getRawCurrentHierPath();
3013 return (path == null) ? null : new String(path);
3014 }
3015
3016
3017 /***
3018 * Get the current hierarchy level.
3019 *
3020 * @return the current hierarchy level
3021 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3022 * @see #decode
3023 */
3024 public String getCurrentHierPath() throws URIException {
3025 char[] path = getRawCurrentHierPath();
3026 return (path == null) ? null : decode(path, getProtocolCharset());
3027 }
3028
3029
3030 /***
3031 * Get the level above the this hierarchy level.
3032 *
3033 * @return the raw above hierarchy level
3034 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3035 */
3036 public char[] getRawAboveHierPath() throws URIException {
3037 char[] path = getRawCurrentHierPath();
3038 return (path == null) ? null : getRawCurrentHierPath(path);
3039 }
3040
3041
3042 /***
3043 * Get the level above the this hierarchy level.
3044 *
3045 * @return the raw above hierarchy level
3046 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3047 */
3048 public String getEscapedAboveHierPath() throws URIException {
3049 char[] path = getRawAboveHierPath();
3050 return (path == null) ? null : new String(path);
3051 }
3052
3053
3054 /***
3055 * Get the level above the this hierarchy level.
3056 *
3057 * @return the above hierarchy level
3058 * @throws URIException If {@link #getRawCurrentHierPath(char[])} fails.
3059 * @see #decode
3060 */
3061 public String getAboveHierPath() throws URIException {
3062 char[] path = getRawAboveHierPath();
3063 return (path == null) ? null : decode(path, getProtocolCharset());
3064 }
3065
3066
3067 /***
3068 * Get the raw-escaped path.
3069 * <p><blockquote><pre>
3070 * path = [ abs_path | opaque_part ]
3071 * </pre></blockquote><p>
3072 *
3073 * @return the raw-escaped path
3074 */
3075 public char[] getRawPath() {
3076 return _is_opaque_part ? _opaque : _path;
3077 }
3078
3079
3080 /***
3081 * Get the escaped path.
3082 * <p><blockquote><pre>
3083 * path = [ abs_path | opaque_part ]
3084 * abs_path = "/" path_segments
3085 * opaque_part = uric_no_slash *uric
3086 * </pre></blockquote><p>
3087 *
3088 * @return the escaped path string
3089 */
3090 public String getEscapedPath() {
3091 char[] path = getRawPath();
3092 return (path == null) ? null : new String(path);
3093 }
3094
3095
3096 /***
3097 * Get the path.
3098 * <p><blockquote><pre>
3099 * path = [ abs_path | opaque_part ]
3100 * </pre></blockquote><p>
3101 * @return the path string
3102 * @throws URIException If {@link #decode} fails.
3103 * @see #decode
3104 */
3105 public String getPath() throws URIException {
3106 char[] path = getRawPath();
3107 return (path == null) ? null : decode(path, getProtocolCharset());
3108 }
3109
3110
3111 /***
3112 * Get the raw-escaped basename of the path.
3113 *
3114 * @return the raw-escaped basename
3115 */
3116 public char[] getRawName() {
3117 if (_path == null) {
3118 return null;
3119 }
3120
3121 int at = 0;
3122 for (int i = _path.length - 1; i >= 0; i--) {
3123 if (_path[i] == '/') {
3124 at = i + 1;
3125 break;
3126 }
3127 }
3128 int len = _path.length - at;
3129 char[] basename = new char[len];
3130 System.arraycopy(_path, at, basename, 0, len);
3131 return basename;
3132 }
3133
3134
3135 /***
3136 * Get the escaped basename of the path.
3137 *
3138 * @return the escaped basename string
3139 */
3140 public String getEscapedName() {
3141 char[] basename = getRawName();
3142 return (basename == null) ? null : new String(basename);
3143 }
3144
3145
3146 /***
3147 * Get the basename of the path.
3148 *
3149 * @return the basename string
3150 * @throws URIException incomplete trailing escape pattern or unsupported
3151 * character encoding
3152 * @see #decode
3153 */
3154 public String getName() throws URIException {
3155 char[] basename = getRawName();
3156 return (basename == null) ? null : decode(getRawName(),
3157 getProtocolCharset());
3158 }
3159
3160
3161
3162 /***
3163 * Get the raw-escaped path and query.
3164 *
3165 * @return the raw-escaped path and query
3166 */
3167 public char[] getRawPathQuery() {
3168
3169 if (_path == null && _query == null) {
3170 return null;
3171 }
3172 StringBuffer buff = new StringBuffer();
3173 if (_path != null) {
3174 buff.append(_path);
3175 }
3176 if (_query != null) {
3177 buff.append('?');
3178 buff.append(_query);
3179 }
3180 return buff.toString().toCharArray();
3181 }
3182
3183
3184 /***
3185 * Get the escaped query.
3186 *
3187 * @return the escaped path and query string
3188 */
3189 public String getEscapedPathQuery() {
3190 char[] rawPathQuery = getRawPathQuery();
3191 return (rawPathQuery == null) ? null : new String(rawPathQuery);
3192 }
3193
3194
3195 /***
3196 * Get the path and query.
3197 *
3198 * @return the path and query string.
3199 * @throws URIException incomplete trailing escape pattern or unsupported
3200 * character encoding
3201 * @see #decode
3202 */
3203 public String getPathQuery() throws URIException {
3204 char[] rawPathQuery = getRawPathQuery();
3205 return (rawPathQuery == null) ? null : decode(rawPathQuery,
3206 getProtocolCharset());
3207 }
3208
3209
3210
3211 /***
3212 * Set the raw-escaped query.
3213 *
3214 * @param escapedQuery the raw-escaped query
3215 * @throws URIException escaped query not valid
3216 */
3217 public void setRawQuery(char[] escapedQuery) throws URIException {
3218 if (escapedQuery == null || escapedQuery.length == 0) {
3219 _query = escapedQuery;
3220 setURI();
3221 return;
3222 }
3223
3224 escapedQuery = removeFragmentIdentifier(escapedQuery);
3225 if (!validate(escapedQuery, query)) {
3226 throw new URIException(URIException.ESCAPING,
3227 "escaped query not valid");
3228 }
3229 _query = escapedQuery;
3230 setURI();
3231 }
3232
3233
3234 /***
3235 * Set the escaped query string.
3236 *
3237 * @param escapedQuery the escaped query string
3238 * @throws URIException escaped query not valid
3239 */
3240 public void setEscapedQuery(String escapedQuery) throws URIException {
3241 if (escapedQuery == null) {
3242 _query = null;
3243 setURI();
3244 return;
3245 }
3246 setRawQuery(escapedQuery.toCharArray());
3247 }
3248
3249
3250 /***
3251 * Set the query.
3252 * <p>
3253 * When a query string is not misunderstood the reserved special characters
3254 * ("&", "=", "+", ",", and "$") within a query component, it is
3255 * recommended to use in encoding the whole query with this method.
3256 * <p>
3257 * The additional APIs for the special purpose using by the reserved
3258 * special characters used in each protocol are implemented in each protocol
3259 * classes inherited from <code>URI</code>. So refer to the same-named APIs
3260 * implemented in each specific protocol instance.
3261 *
3262 * @param query the query string.
3263 * @throws URIException incomplete trailing escape pattern or unsupported
3264 * character encoding
3265 * @see #encode
3266 */
3267 public void setQuery(String query) throws URIException {
3268 if (query == null || query.length() == 0) {
3269 _query = (query == null) ? null : query.toCharArray();
3270 setURI();
3271 return;
3272 }
3273 setRawQuery(encode(query, allowed_query, getProtocolCharset()));
3274 }
3275
3276
3277 /***
3278 * Get the raw-escaped query.
3279 *
3280 * @return the raw-escaped query
3281 */
3282 public char[] getRawQuery() {
3283 return _query;
3284 }
3285
3286
3287 /***
3288 * Get the escaped query.
3289 *
3290 * @return the escaped query string
3291 */
3292 public String getEscapedQuery() {
3293 return (_query == null) ? null : new String(_query);
3294 }
3295
3296
3297 /***
3298 * Get the query.
3299 *
3300 * @return the query string.
3301 * @throws URIException incomplete trailing escape pattern or unsupported
3302 * character encoding
3303 * @see #decode
3304 */
3305 public String getQuery() throws URIException {
3306 return (_query == null) ? null : decode(_query, getProtocolCharset());
3307 }
3308
3309
3310
3311 /***
3312 * Set the raw-escaped fragment.
3313 *
3314 * @param escapedFragment the raw-escaped fragment
3315 * @throws URIException escaped fragment not valid
3316 */
3317 public void setRawFragment(char[] escapedFragment) throws URIException {
3318 if (escapedFragment == null || escapedFragment.length == 0) {
3319 _fragment = escapedFragment;
3320 hash = 0;
3321 return;
3322 }
3323 if (!validate(escapedFragment, fragment)) {
3324 throw new URIException(URIException.ESCAPING,
3325 "escaped fragment not valid");
3326 }
3327 _fragment = escapedFragment;
3328 hash = 0;
3329 }
3330
3331
3332 /***
3333 * Set the escaped fragment string.
3334 *
3335 * @param escapedFragment the escaped fragment string
3336 * @throws URIException escaped fragment not valid
3337 */
3338 public void setEscapedFragment(String escapedFragment) throws URIException {
3339 if (escapedFragment == null) {
3340 _fragment = null;
3341 hash = 0;
3342 return;
3343 }
3344 setRawFragment(escapedFragment.toCharArray());
3345 }
3346
3347
3348 /***
3349 * Set the fragment.
3350 *
3351 * @param fragment the fragment string.
3352 * @throws URIException If an error occurs.
3353 */
3354 public void setFragment(String fragment) throws URIException {
3355 if (fragment == null || fragment.length() == 0) {
3356 _fragment = (fragment == null) ? null : fragment.toCharArray();
3357 hash = 0;
3358 return;
3359 }
3360 _fragment = encode(fragment, allowed_fragment, getProtocolCharset());
3361 hash = 0;
3362 }
3363
3364
3365 /***
3366 * Get the raw-escaped fragment.
3367 * <p>
3368 * The optional fragment identifier is not part of a URI, but is often used
3369 * in conjunction with a URI.
3370 * <p>
3371 * The format and interpretation of fragment identifiers is dependent on
3372 * the media type [RFC2046] of the retrieval result.
3373 * <p>
3374 * A fragment identifier is only meaningful when a URI reference is
3375 * intended for retrieval and the result of that retrieval is a document
3376 * for which the identified fragment is consistently defined.
3377 *
3378 * @return the raw-escaped fragment
3379 */
3380 public char[] getRawFragment() {
3381 return _fragment;
3382 }
3383
3384
3385 /***
3386 * Get the escaped fragment.
3387 *
3388 * @return the escaped fragment string
3389 */
3390 public String getEscapedFragment() {
3391 return (_fragment == null) ? null : new String(_fragment);
3392 }
3393
3394
3395 /***
3396 * Get the fragment.
3397 *
3398 * @return the fragment string
3399 * @throws URIException incomplete trailing escape pattern or unsupported
3400 * character encoding
3401 * @see #decode
3402 */
3403 public String getFragment() throws URIException {
3404 return (_fragment == null) ? null : decode(_fragment,
3405 getProtocolCharset());
3406 }
3407
3408
3409
3410 /***
3411 * Remove the fragment identifier of the given component.
3412 *
3413 * @param component the component that a fragment may be included
3414 * @return the component that the fragment identifier is removed
3415 */
3416 protected char[] removeFragmentIdentifier(char[] component) {
3417 if (component == null) {
3418 return null;
3419 }
3420 int lastIndex = new String(component).indexOf('#');
3421 if (lastIndex != -1) {
3422 component = new String(component).substring(0,
3423 lastIndex).toCharArray();
3424 }
3425 return component;
3426 }
3427
3428
3429 /***
3430 * Normalize the given hier path part.
3431 *
3432 * <p>Algorithm taken from URI reference parser at
3433 * http://www.apache.org/~fielding/uri/rev-2002/issues.html.
3434 *
3435 * @param path the path to normalize
3436 * @return the normalized path
3437 * @throws URIException no more higher path level to be normalized
3438 */
3439 protected char[] normalize(char[] path) throws URIException {
3440
3441 if (path == null) {
3442 return null;
3443 }
3444
3445 String normalized = new String(path);
3446
3447
3448 if (normalized.startsWith("./")) {
3449 normalized = normalized.substring(1);
3450 } else if (normalized.startsWith("../")) {
3451 normalized = normalized.substring(2);
3452 } else if (normalized.startsWith("..")) {
3453 normalized = normalized.substring(2);
3454 }
3455
3456
3457 int index = -1;
3458 while ((index = normalized.indexOf("/./")) != -1) {
3459 normalized = normalized.substring(0, index) + normalized.substring(index + 2);
3460 }
3461
3462
3463 if (normalized.endsWith("/.")) {
3464 normalized = normalized.substring(0, normalized.length() - 1);
3465 }
3466
3467 int startIndex = 0;
3468
3469
3470
3471
3472
3473
3474 while ((index = normalized.indexOf("/../", startIndex)) != -1) {
3475 int slashIndex = normalized.lastIndexOf('/', index - 1);
3476 if (slashIndex >= 0) {
3477 normalized = normalized.substring(0, slashIndex) + normalized.substring(index + 3);
3478 } else {
3479 startIndex = index + 3;
3480 }
3481 }
3482 if (normalized.endsWith("/..")) {
3483 int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
3484 if (slashIndex >= 0) {
3485 normalized = normalized.substring(0, slashIndex + 1);
3486 }
3487 }
3488
3489
3490
3491
3492
3493
3494 while ((index = normalized.indexOf("/../")) != -1) {
3495 int slashIndex = normalized.lastIndexOf('/', index - 1);
3496 if (slashIndex >= 0) {
3497 break;
3498 } else {
3499 normalized = normalized.substring(index + 3);
3500 }
3501 }
3502 if (normalized.endsWith("/..")) {
3503 int slashIndex = normalized.lastIndexOf('/', normalized.length() - 4);
3504 if (slashIndex < 0) {
3505 normalized = "/";
3506 }
3507 }
3508
3509 return normalized.toCharArray();
3510 }
3511
3512
3513 /***
3514 * Normalizes the path part of this URI. Normalization is only meant to be performed on
3515 * URIs with an absolute path. Calling this method on a relative path URI will have no
3516 * effect.
3517 *
3518 * @throws URIException no more higher path level to be normalized
3519 *
3520 * @see #isAbsPath()
3521 */
3522 public void normalize() throws URIException {
3523 if (isAbsPath()) {
3524 _path = normalize(_path);
3525 setURI();
3526 }
3527 }
3528
3529
3530 /***
3531 * Test if the first array is equal to the second array.
3532 *
3533 * @param first the first character array
3534 * @param second the second character array
3535 * @return true if they're equal
3536 */
3537 protected boolean equals(char[] first, char[] second) {
3538
3539 if (first == null && second == null) {
3540 return true;
3541 }
3542 if (first == null || second == null) {
3543 return false;
3544 }
3545 if (first.length != second.length) {
3546 return false;
3547 }
3548 for (int i = 0; i < first.length; i++) {
3549 if (first[i] != second[i]) {
3550 return false;
3551 }
3552 }
3553 return true;
3554 }
3555
3556
3557 /***
3558 * Test an object if this URI is equal to another.
3559 *
3560 * @param obj an object to compare
3561 * @return true if two URI objects are equal
3562 */
3563 public boolean equals(Object obj) {
3564
3565
3566 if (obj == this) {
3567 return true;
3568 }
3569 if (!(obj instanceof URI)) {
3570 return false;
3571 }
3572 URI another = (URI) obj;
3573
3574 if (!equals(_scheme, another._scheme)) {
3575 return false;
3576 }
3577
3578 if (!equals(_opaque, another._opaque)) {
3579 return false;
3580 }
3581
3582
3583 if (!equals(_authority, another._authority)) {
3584 return false;
3585 }
3586
3587 if (!equals(_path, another._path)) {
3588 return false;
3589 }
3590
3591 if (!equals(_query, another._query)) {
3592 return false;
3593 }
3594
3595 if (!equals(_fragment, another._fragment)) {
3596 return false;
3597 }
3598 return true;
3599 }
3600
3601
3602
3603 /***
3604 * Write the content of this URI.
3605 *
3606 * @param oos the object-output stream
3607 * @throws IOException If an IO problem occurs.
3608 */
3609 private void writeObject(ObjectOutputStream oos)
3610 throws IOException {
3611
3612 oos.defaultWriteObject();
3613 }
3614
3615
3616 /***
3617 * Read a URI.
3618 *
3619 * @param ois the object-input stream
3620 * @throws ClassNotFoundException If one of the classes specified in the
3621 * input stream cannot be found.
3622 * @throws IOException If an IO problem occurs.
3623 */
3624 private void readObject(ObjectInputStream ois)
3625 throws ClassNotFoundException, IOException {
3626
3627 ois.defaultReadObject();
3628 }
3629
3630
3631
3632 /***
3633 * Return a hash code for this URI.
3634 *
3635 * @return a has code value for this URI
3636 */
3637 public int hashCode() {
3638 if (hash == 0) {
3639 char[] c = _uri;
3640 if (c != null) {
3641 for (int i = 0, len = c.length; i < len; i++) {
3642 hash = 31 * hash + c[i];
3643 }
3644 }
3645 c = _fragment;
3646 if (c != null) {
3647 for (int i = 0, len = c.length; i < len; i++) {
3648 hash = 31 * hash + c[i];
3649 }
3650 }
3651 }
3652 return hash;
3653 }
3654
3655
3656
3657 /***
3658 * Compare this URI to another object.
3659 *
3660 * @param obj the object to be compared.
3661 * @return 0, if it's same,
3662 * -1, if failed, first being compared with in the authority component
3663 * @throws ClassCastException not URI argument
3664 */
3665 public int compareTo(Object obj) throws ClassCastException {
3666
3667 URI another = (URI) obj;
3668 if (!equals(_authority, another.getRawAuthority())) {
3669 return -1;
3670 }
3671 return toString().compareTo(another.toString());
3672 }
3673
3674
3675
3676 /***
3677 * Create and return a copy of this object, the URI-reference containing
3678 * the userinfo component. Notice that the whole URI-reference including
3679 * the userinfo component counld not be gotten as a <code>String</code>.
3680 * <p>
3681 * To copy the identical <code>URI</code> object including the userinfo
3682 * component, it should be used.
3683 *
3684 * @return a clone of this instance
3685 */
3686 public synchronized Object clone() throws CloneNotSupportedException {
3687
3688 URI instance = (URI) super.clone();
3689
3690 instance._uri = _uri;
3691 instance._scheme = _scheme;
3692 instance._opaque = _opaque;
3693 instance._authority = _authority;
3694 instance._userinfo = _userinfo;
3695 instance._host = _host;
3696 instance._port = _port;
3697 instance._path = _path;
3698 instance._query = _query;
3699 instance._fragment = _fragment;
3700
3701 instance.protocolCharset = protocolCharset;
3702
3703 instance._is_hier_part = _is_hier_part;
3704 instance._is_opaque_part = _is_opaque_part;
3705 instance._is_net_path = _is_net_path;
3706 instance._is_abs_path = _is_abs_path;
3707 instance._is_rel_path = _is_rel_path;
3708 instance._is_reg_name = _is_reg_name;
3709 instance._is_server = _is_server;
3710 instance._is_hostname = _is_hostname;
3711 instance._is_IPv4address = _is_IPv4address;
3712 instance._is_IPv6reference = _is_IPv6reference;
3713
3714 return instance;
3715 }
3716
3717
3718
3719 /***
3720 * It can be gotten the URI character sequence. It's raw-escaped.
3721 * For the purpose of the protocol to be transported, it will be useful.
3722 * <p>
3723 * It is clearly unwise to use a URL that contains a password which is
3724 * intended to be secret. In particular, the use of a password within
3725 * the 'userinfo' component of a URL is strongly disrecommended except
3726 * in those rare cases where the 'password' parameter is intended to be
3727 * public.
3728 * <p>
3729 * When you want to get each part of the userinfo, you need to use the
3730 * specific methods in the specific URL. It depends on the specific URL.
3731 *
3732 * @return the URI character sequence
3733 */
3734 public char[] getRawURI() {
3735 return _uri;
3736 }
3737
3738
3739 /***
3740 * It can be gotten the URI character sequence. It's escaped.
3741 * For the purpose of the protocol to be transported, it will be useful.
3742 *
3743 * @return the escaped URI string
3744 */
3745 public String getEscapedURI() {
3746 return (_uri == null) ? null : new String(_uri);
3747 }
3748
3749
3750 /***
3751 * It can be gotten the URI character sequence.
3752 *
3753 * @return the original URI string
3754 * @throws URIException incomplete trailing escape pattern or unsupported
3755 * character encoding
3756 * @see #decode
3757 */
3758 public String getURI() throws URIException {
3759 return (_uri == null) ? null : decode(_uri, getProtocolCharset());
3760 }
3761
3762
3763 /***
3764 * Get the URI reference character sequence.
3765 *
3766 * @return the URI reference character sequence
3767 */
3768 public char[] getRawURIReference() {
3769 if (_fragment == null) {
3770 return _uri;
3771 }
3772 if (_uri == null) {
3773 return _fragment;
3774 }
3775
3776 String uriReference = new String(_uri) + "#" + new String(_fragment);
3777 return uriReference.toCharArray();
3778 }
3779
3780
3781 /***
3782 * Get the escaped URI reference string.
3783 *
3784 * @return the escaped URI reference string
3785 */
3786 public String getEscapedURIReference() {
3787 char[] uriReference = getRawURIReference();
3788 return (uriReference == null) ? null : new String(uriReference);
3789 }
3790
3791
3792 /***
3793 * Get the original URI reference string.
3794 *
3795 * @return the original URI reference string
3796 * @throws URIException If {@link #decode} fails.
3797 */
3798 public String getURIReference() throws URIException {
3799 char[] uriReference = getRawURIReference();
3800 return (uriReference == null) ? null : decode(uriReference,
3801 getProtocolCharset());
3802 }
3803
3804
3805 /***
3806 * Get the escaped URI string.
3807 * <p>
3808 * On the document, the URI-reference form is only used without the userinfo
3809 * component like http://jakarta.apache.org/ by the security reason.
3810 * But the URI-reference form with the userinfo component could be parsed.
3811 * <p>
3812 * In other words, this URI and any its subclasses must not expose the
3813 * URI-reference expression with the userinfo component like
3814 * http://user:password@hostport/restricted_zone.<br>
3815 * It means that the API client programmer should extract each user and
3816 * password to access manually. Probably it will be supported in the each
3817 * subclass, however, not a whole URI-reference expression.
3818 *
3819 * @return the escaped URI string
3820 * @see #clone()
3821 */
3822 public String toString() {
3823 return getEscapedURI();
3824 }
3825
3826
3827
3828
3829 /***
3830 * The charset-changed normal operation to represent to be required to
3831 * alert to user the fact the default charset is changed.
3832 */
3833 public static class DefaultCharsetChanged extends RuntimeException {
3834
3835
3836
3837 /***
3838 * The constructor with a reason string and its code arguments.
3839 *
3840 * @param reasonCode the reason code
3841 * @param reason the reason
3842 */
3843 public DefaultCharsetChanged(int reasonCode, String reason) {
3844 super(reason);
3845 this.reason = reason;
3846 this.reasonCode = reasonCode;
3847 }
3848
3849
3850
3851 /*** No specified reason code. */
3852 public static final int UNKNOWN = 0;
3853
3854 /*** Protocol charset changed. */
3855 public static final int PROTOCOL_CHARSET = 1;
3856
3857 /*** Document charset changed. */
3858 public static final int DOCUMENT_CHARSET = 2;
3859
3860
3861
3862 /*** The reason code. */
3863 private int reasonCode;
3864
3865 /*** The reason message. */
3866 private String reason;
3867
3868
3869
3870 /***
3871 * Get the reason code.
3872 *
3873 * @return the reason code
3874 */
3875 public int getReasonCode() {
3876 return reasonCode;
3877 }
3878
3879 /***
3880 * Get the reason message.
3881 *
3882 * @return the reason message
3883 */
3884 public String getReason() {
3885 return reason;
3886 }
3887
3888 }
3889
3890
3891 /***
3892 * A mapping to determine the (somewhat arbitrarily) preferred charset for a
3893 * given locale. Supports all locales recognized in JDK 1.1.
3894 * <p>
3895 * The distribution of this class is Servlets.com. It was originally
3896 * written by Jason Hunter [jhunter at acm.org] and used by with permission.
3897 */
3898 public static class LocaleToCharsetMap {
3899
3900 /*** A mapping of language code to charset */
3901 private static final Hashtable LOCALE_TO_CHARSET_MAP;
3902 static {
3903 LOCALE_TO_CHARSET_MAP = new Hashtable();
3904 LOCALE_TO_CHARSET_MAP.put("ar", "ISO-8859-6");
3905 LOCALE_TO_CHARSET_MAP.put("be", "ISO-8859-5");
3906 LOCALE_TO_CHARSET_MAP.put("bg", "ISO-8859-5");
3907 LOCALE_TO_CHARSET_MAP.put("ca", "ISO-8859-1");
3908 LOCALE_TO_CHARSET_MAP.put("cs", "ISO-8859-2");
3909 LOCALE_TO_CHARSET_MAP.put("da", "ISO-8859-1");
3910 LOCALE_TO_CHARSET_MAP.put("de", "ISO-8859-1");
3911 LOCALE_TO_CHARSET_MAP.put("el", "ISO-8859-7");
3912 LOCALE_TO_CHARSET_MAP.put("en", "ISO-8859-1");
3913 LOCALE_TO_CHARSET_MAP.put("es", "ISO-8859-1");
3914 LOCALE_TO_CHARSET_MAP.put("et", "ISO-8859-1");
3915 LOCALE_TO_CHARSET_MAP.put("fi", "ISO-8859-1");
3916 LOCALE_TO_CHARSET_MAP.put("fr", "ISO-8859-1");
3917 LOCALE_TO_CHARSET_MAP.put("hr", "ISO-8859-2");
3918 LOCALE_TO_CHARSET_MAP.put("hu", "ISO-8859-2");
3919 LOCALE_TO_CHARSET_MAP.put("is", "ISO-8859-1");
3920 LOCALE_TO_CHARSET_MAP.put("it", "ISO-8859-1");
3921 LOCALE_TO_CHARSET_MAP.put("iw", "ISO-8859-8");
3922 LOCALE_TO_CHARSET_MAP.put("ja", "Shift_JIS");
3923 LOCALE_TO_CHARSET_MAP.put("ko", "EUC-KR");
3924 LOCALE_TO_CHARSET_MAP.put("lt", "ISO-8859-2");
3925 LOCALE_TO_CHARSET_MAP.put("lv", "ISO-8859-2");
3926 LOCALE_TO_CHARSET_MAP.put("mk", "ISO-8859-5");
3927 LOCALE_TO_CHARSET_MAP.put("nl", "ISO-8859-1");
3928 LOCALE_TO_CHARSET_MAP.put("no", "ISO-8859-1");
3929 LOCALE_TO_CHARSET_MAP.put("pl", "ISO-8859-2");
3930 LOCALE_TO_CHARSET_MAP.put("pt", "ISO-8859-1");
3931 LOCALE_TO_CHARSET_MAP.put("ro", "ISO-8859-2");
3932 LOCALE_TO_CHARSET_MAP.put("ru", "ISO-8859-5");
3933 LOCALE_TO_CHARSET_MAP.put("sh", "ISO-8859-5");
3934 LOCALE_TO_CHARSET_MAP.put("sk", "ISO-8859-2");
3935 LOCALE_TO_CHARSET_MAP.put("sl", "ISO-8859-2");
3936 LOCALE_TO_CHARSET_MAP.put("sq", "ISO-8859-2");
3937 LOCALE_TO_CHARSET_MAP.put("sr", "ISO-8859-5");
3938 LOCALE_TO_CHARSET_MAP.put("sv", "ISO-8859-1");
3939 LOCALE_TO_CHARSET_MAP.put("tr", "ISO-8859-9");
3940 LOCALE_TO_CHARSET_MAP.put("uk", "ISO-8859-5");
3941 LOCALE_TO_CHARSET_MAP.put("zh", "GB2312");
3942 LOCALE_TO_CHARSET_MAP.put("zh_TW", "Big5");
3943 }
3944
3945 /***
3946 * Get the preferred charset for the given locale.
3947 *
3948 * @param locale the locale
3949 * @return the preferred charset or null if the locale is not
3950 * recognized.
3951 */
3952 public static String getCharset(Locale locale) {
3953
3954 String charset =
3955 (String) LOCALE_TO_CHARSET_MAP.get(locale.toString());
3956 if (charset != null) {
3957 return charset;
3958 }
3959
3960
3961 charset = (String) LOCALE_TO_CHARSET_MAP.get(locale.getLanguage());
3962 return charset;
3963 }
3964
3965 }
3966
3967 }
3968