GNU Classpath (0.20) | |
Frames | No Frames |
1: /* java.lang.Character -- Wrapper class for char, and Unicode subsets 2: Copyright (C) 1998, 1999, 2001, 2002, 2005 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. 
*/ 37: 38: 39: package java.lang; 40: 41: import gnu.java.lang.CharData; 42: 43: import java.io.Serializable; 44: 45: /** 46: * Wrapper class for the primitive char data type. In addition, this class 47: * allows one to retrieve property information and perform transformations 48: * on the 57,707 defined characters in the Unicode Standard, Version 3.0.0. 49: * java.lang.Character is designed to be very dynamic, and as such, it 50: * retrieves information on the Unicode character set from a separate 51: * database, gnu.java.lang.CharData, which can be easily upgraded. 52: * 53: * <p>For predicates, boundaries are used to describe 54: * the set of characters for which the method will return true. 55: * This syntax uses fairly normal regular expression notation. 56: * See 5.13 of the Unicode Standard, Version 3.0, for the 57: * boundary specification. 58: * 59: * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a> 60: * for more information on the Unicode Standard. 61: * 62: * @author Tom Tromey (tromey@cygnus.com) 63: * @author Paul N. Fisher 64: * @author Jochen Hoenicke 65: * @author Eric Blake (ebb9@email.byu.edu) 66: * @see CharData 67: * @since 1.0 68: * @status updated to 1.4 69: */ 70: public final class Character implements Serializable, Comparable 71: { 72: /** 73: * A subset of Unicode blocks. 74: * 75: * @author Paul N. Fisher 76: * @author Eric Blake (ebb9@email.byu.edu) 77: * @since 1.2 78: */ 79: public static class Subset 80: { 81: /** The name of the subset; never null, because the constructor rejects a null name. */ 82: private final String name; 83: 84: /** 85: * Construct a new subset of characters. 86: * 87: * @param name the name of the subset 88: * @throws NullPointerException if name is null 89: */ 90: protected Subset(String name) 91: { 92: // name.toString() returns name itself; the call is made only so that a null name fails here with a NullPointerException. 93: this.name = name.toString(); 94: } 95: 96: /** 97: * Compares two Subsets for equality. 
This is <code>final</code>, and 98: * restricts the comparison on the <code>==</code> operator, so it returns 99: * true only for the same object. 100: * 101: * @param o the object to compare 102: * @return true if o is this 103: */ 104: public final boolean equals(Object o) 105: { 106: return o == this; 107: } 108: 109: /** 110: * Makes the original hashCode of Object final, to be consistent with 111: * equals. 112: * 113: * @return the hash code for this object 114: */ 115: public final int hashCode() 116: { 117: return super.hashCode(); 118: } 119: 120: /** 121: * Returns the name of the subset. 122: * 123: * @return the name 124: */ 125: public final String toString() 126: { 127: return name; 128: } 129: } // class Subset 130: 131: /** 132: * A family of character subsets in the Unicode specification. A character 133: * is in at most one of these blocks. 134: * 135: * This nested class was generated automatically from 136: * <code>doc/unicode/Block-3.txt</code>, by some perl scripts. 137: * This Unicode definition file can be found on the 138: * <a href="http://www.unicode.org">http://www.unicode.org</a> website. 139: * JDK 1.4 uses Unicode version 3.0.0. 140: * 141: * @author scripts/unicode-blocks.pl (written by Eric Blake) 142: * @since 1.2 143: */ 144: public static final class UnicodeBlock extends Subset 145: { 146: /** The first character of the block (inclusive). */ 147: private final char start; 148: 149: /** The last character of the block (inclusive). */ 150: private final char end; 151: 152: /** 153: * Constructor for strictly defined blocks. 154: * 155: * @param start the first character of the range (inclusive) 156: * @param end the last character of the range (inclusive) 157: * @param name the block name 158: */ 159: private UnicodeBlock(char start, char end, String name) 160: { 161: super(name); 162: this.start = start; 163: this.end = end; 164: } 165: 166: /** 167: * Returns the Unicode character block which a character belongs to. 
168: * 169: * @param ch the character to look up 170: * @return the set it belongs to, or null if it is not in one 171: */ 172: public static UnicodeBlock of(char ch) 173: { 174: // Special case: SPECIALS covers two ranges but stores only one start/end pair, so U+FEFF is matched explicitly. 175: if (ch == '\uFEFF') 176: return SPECIALS; 177: // Binary search over sets, which is listed in ascending order of start; start and end are both inclusive bounds. 178: int low = 0; 179: int hi = sets.length - 1; 180: while (low <= hi) 181: { 182: int mid = (low + hi) >> 1; 183: UnicodeBlock b = sets[mid]; 184: if (ch < b.start) 185: hi = mid - 1; 186: else if (ch > b.end) 187: low = mid + 1; 188: else 189: return b; 190: } 191: return null; 192: } 193: 194: /** 195: * Basic Latin. 196: * '\u0000' - '\u007F'. 197: */ 198: public static final UnicodeBlock BASIC_LATIN 199: = new UnicodeBlock('\u0000', '\u007F', 200: "BASIC_LATIN"); 201: 202: /** 203: * Latin-1 Supplement. 204: * '\u0080' - '\u00FF'. 205: */ 206: public static final UnicodeBlock LATIN_1_SUPPLEMENT 207: = new UnicodeBlock('\u0080', '\u00FF', 208: "LATIN_1_SUPPLEMENT"); 209: 210: /** 211: * Latin Extended-A. 212: * '\u0100' - '\u017F'. 213: */ 214: public static final UnicodeBlock LATIN_EXTENDED_A 215: = new UnicodeBlock('\u0100', '\u017F', 216: "LATIN_EXTENDED_A"); 217: 218: /** 219: * Latin Extended-B. 220: * '\u0180' - '\u024F'. 221: */ 222: public static final UnicodeBlock LATIN_EXTENDED_B 223: = new UnicodeBlock('\u0180', '\u024F', 224: "LATIN_EXTENDED_B"); 225: 226: /** 227: * IPA Extensions. 228: * '\u0250' - '\u02AF'. 229: */ 230: public static final UnicodeBlock IPA_EXTENSIONS 231: = new UnicodeBlock('\u0250', '\u02AF', 232: "IPA_EXTENSIONS"); 233: 234: /** 235: * Spacing Modifier Letters. 236: * '\u02B0' - '\u02FF'. 237: */ 238: public static final UnicodeBlock SPACING_MODIFIER_LETTERS 239: = new UnicodeBlock('\u02B0', '\u02FF', 240: "SPACING_MODIFIER_LETTERS"); 241: 242: /** 243: * Combining Diacritical Marks. 244: * '\u0300' - '\u036F'. 
245: */ 246: public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS 247: = new UnicodeBlock('\u0300', '\u036F', 248: "COMBINING_DIACRITICAL_MARKS"); 249: 250: /** 251: * Greek. 252: * '\u0370' - '\u03FF'. 253: */ 254: public static final UnicodeBlock GREEK 255: = new UnicodeBlock('\u0370', '\u03FF', 256: "GREEK"); 257: 258: /** 259: * Cyrillic. 260: * '\u0400' - '\u04FF'. 261: */ 262: public static final UnicodeBlock CYRILLIC 263: = new UnicodeBlock('\u0400', '\u04FF', 264: "CYRILLIC"); 265: 266: /** 267: * Armenian. 268: * '\u0530' - '\u058F'. 269: */ 270: public static final UnicodeBlock ARMENIAN 271: = new UnicodeBlock('\u0530', '\u058F', 272: "ARMENIAN"); 273: 274: /** 275: * Hebrew. 276: * '\u0590' - '\u05FF'. 277: */ 278: public static final UnicodeBlock HEBREW 279: = new UnicodeBlock('\u0590', '\u05FF', 280: "HEBREW"); 281: 282: /** 283: * Arabic. 284: * '\u0600' - '\u06FF'. 285: */ 286: public static final UnicodeBlock ARABIC 287: = new UnicodeBlock('\u0600', '\u06FF', 288: "ARABIC"); 289: 290: /** 291: * Syriac. 292: * '\u0700' - '\u074F'. 293: * @since 1.4 294: */ 295: public static final UnicodeBlock SYRIAC 296: = new UnicodeBlock('\u0700', '\u074F', 297: "SYRIAC"); 298: 299: /** 300: * Thaana. 301: * '\u0780' - '\u07BF'. 302: * @since 1.4 303: */ 304: public static final UnicodeBlock THAANA 305: = new UnicodeBlock('\u0780', '\u07BF', 306: "THAANA"); 307: 308: /** 309: * Devanagari. 310: * '\u0900' - '\u097F'. 311: */ 312: public static final UnicodeBlock DEVANAGARI 313: = new UnicodeBlock('\u0900', '\u097F', 314: "DEVANAGARI"); 315: 316: /** 317: * Bengali. 318: * '\u0980' - '\u09FF'. 319: */ 320: public static final UnicodeBlock BENGALI 321: = new UnicodeBlock('\u0980', '\u09FF', 322: "BENGALI"); 323: 324: /** 325: * Gurmukhi. 326: * '\u0A00' - '\u0A7F'. 327: */ 328: public static final UnicodeBlock GURMUKHI 329: = new UnicodeBlock('\u0A00', '\u0A7F', 330: "GURMUKHI"); 331: 332: /** 333: * Gujarati. 334: * '\u0A80' - '\u0AFF'. 
335: */ 336: public static final UnicodeBlock GUJARATI 337: = new UnicodeBlock('\u0A80', '\u0AFF', 338: "GUJARATI"); 339: 340: /** 341: * Oriya. 342: * '\u0B00' - '\u0B7F'. 343: */ 344: public static final UnicodeBlock ORIYA 345: = new UnicodeBlock('\u0B00', '\u0B7F', 346: "ORIYA"); 347: 348: /** 349: * Tamil. 350: * '\u0B80' - '\u0BFF'. 351: */ 352: public static final UnicodeBlock TAMIL 353: = new UnicodeBlock('\u0B80', '\u0BFF', 354: "TAMIL"); 355: 356: /** 357: * Telugu. 358: * '\u0C00' - '\u0C7F'. 359: */ 360: public static final UnicodeBlock TELUGU 361: = new UnicodeBlock('\u0C00', '\u0C7F', 362: "TELUGU"); 363: 364: /** 365: * Kannada. 366: * '\u0C80' - '\u0CFF'. 367: */ 368: public static final UnicodeBlock KANNADA 369: = new UnicodeBlock('\u0C80', '\u0CFF', 370: "KANNADA"); 371: 372: /** 373: * Malayalam. 374: * '\u0D00' - '\u0D7F'. 375: */ 376: public static final UnicodeBlock MALAYALAM 377: = new UnicodeBlock('\u0D00', '\u0D7F', 378: "MALAYALAM"); 379: 380: /** 381: * Sinhala. 382: * '\u0D80' - '\u0DFF'. 383: * @since 1.4 384: */ 385: public static final UnicodeBlock SINHALA 386: = new UnicodeBlock('\u0D80', '\u0DFF', 387: "SINHALA"); 388: 389: /** 390: * Thai. 391: * '\u0E00' - '\u0E7F'. 392: */ 393: public static final UnicodeBlock THAI 394: = new UnicodeBlock('\u0E00', '\u0E7F', 395: "THAI"); 396: 397: /** 398: * Lao. 399: * '\u0E80' - '\u0EFF'. 400: */ 401: public static final UnicodeBlock LAO 402: = new UnicodeBlock('\u0E80', '\u0EFF', 403: "LAO"); 404: 405: /** 406: * Tibetan. 407: * '\u0F00' - '\u0FFF'. 408: */ 409: public static final UnicodeBlock TIBETAN 410: = new UnicodeBlock('\u0F00', '\u0FFF', 411: "TIBETAN"); 412: 413: /** 414: * Myanmar. 415: * '\u1000' - '\u109F'. 416: * @since 1.4 417: */ 418: public static final UnicodeBlock MYANMAR 419: = new UnicodeBlock('\u1000', '\u109F', 420: "MYANMAR"); 421: 422: /** 423: * Georgian. 424: * '\u10A0' - '\u10FF'. 
425: */ 426: public static final UnicodeBlock GEORGIAN 427: = new UnicodeBlock('\u10A0', '\u10FF', 428: "GEORGIAN"); 429: 430: /** 431: * Hangul Jamo. 432: * '\u1100' - '\u11FF'. 433: */ 434: public static final UnicodeBlock HANGUL_JAMO 435: = new UnicodeBlock('\u1100', '\u11FF', 436: "HANGUL_JAMO"); 437: 438: /** 439: * Ethiopic. 440: * '\u1200' - '\u137F'. 441: * @since 1.4 442: */ 443: public static final UnicodeBlock ETHIOPIC 444: = new UnicodeBlock('\u1200', '\u137F', 445: "ETHIOPIC"); 446: 447: /** 448: * Cherokee. 449: * '\u13A0' - '\u13FF'. 450: * @since 1.4 451: */ 452: public static final UnicodeBlock CHEROKEE 453: = new UnicodeBlock('\u13A0', '\u13FF', 454: "CHEROKEE"); 455: 456: /** 457: * Unified Canadian Aboriginal Syllabics. 458: * '\u1400' - '\u167F'. 459: * @since 1.4 460: */ 461: public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 462: = new UnicodeBlock('\u1400', '\u167F', 463: "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS"); 464: 465: /** 466: * Ogham. 467: * '\u1680' - '\u169F'. 468: * @since 1.4 469: */ 470: public static final UnicodeBlock OGHAM 471: = new UnicodeBlock('\u1680', '\u169F', 472: "OGHAM"); 473: 474: /** 475: * Runic. 476: * '\u16A0' - '\u16FF'. 477: * @since 1.4 478: */ 479: public static final UnicodeBlock RUNIC 480: = new UnicodeBlock('\u16A0', '\u16FF', 481: "RUNIC"); 482: 483: /** 484: * Khmer. 485: * '\u1780' - '\u17FF'. 486: * @since 1.4 487: */ 488: public static final UnicodeBlock KHMER 489: = new UnicodeBlock('\u1780', '\u17FF', 490: "KHMER"); 491: 492: /** 493: * Mongolian. 494: * '\u1800' - '\u18AF'. 495: * @since 1.4 496: */ 497: public static final UnicodeBlock MONGOLIAN 498: = new UnicodeBlock('\u1800', '\u18AF', 499: "MONGOLIAN"); 500: 501: /** 502: * Latin Extended Additional. 503: * '\u1E00' - '\u1EFF'. 504: */ 505: public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL 506: = new UnicodeBlock('\u1E00', '\u1EFF', 507: "LATIN_EXTENDED_ADDITIONAL"); 508: 509: /** 510: * Greek Extended. 
511: * '\u1F00' - '\u1FFF'. 512: */ 513: public static final UnicodeBlock GREEK_EXTENDED 514: = new UnicodeBlock('\u1F00', '\u1FFF', 515: "GREEK_EXTENDED"); 516: 517: /** 518: * General Punctuation. 519: * '\u2000' - '\u206F'. 520: */ 521: public static final UnicodeBlock GENERAL_PUNCTUATION 522: = new UnicodeBlock('\u2000', '\u206F', 523: "GENERAL_PUNCTUATION"); 524: 525: /** 526: * Superscripts and Subscripts. 527: * '\u2070' - '\u209F'. 528: */ 529: public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS 530: = new UnicodeBlock('\u2070', '\u209F', 531: "SUPERSCRIPTS_AND_SUBSCRIPTS"); 532: 533: /** 534: * Currency Symbols. 535: * '\u20A0' - '\u20CF'. 536: */ 537: public static final UnicodeBlock CURRENCY_SYMBOLS 538: = new UnicodeBlock('\u20A0', '\u20CF', 539: "CURRENCY_SYMBOLS"); 540: 541: /** 542: * Combining Marks for Symbols. 543: * '\u20D0' - '\u20FF'. 544: */ 545: public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS 546: = new UnicodeBlock('\u20D0', '\u20FF', 547: "COMBINING_MARKS_FOR_SYMBOLS"); 548: 549: /** 550: * Letterlike Symbols. 551: * '\u2100' - '\u214F'. 552: */ 553: public static final UnicodeBlock LETTERLIKE_SYMBOLS 554: = new UnicodeBlock('\u2100', '\u214F', 555: "LETTERLIKE_SYMBOLS"); 556: 557: /** 558: * Number Forms. 559: * '\u2150' - '\u218F'. 560: */ 561: public static final UnicodeBlock NUMBER_FORMS 562: = new UnicodeBlock('\u2150', '\u218F', 563: "NUMBER_FORMS"); 564: 565: /** 566: * Arrows. 567: * '\u2190' - '\u21FF'. 568: */ 569: public static final UnicodeBlock ARROWS 570: = new UnicodeBlock('\u2190', '\u21FF', 571: "ARROWS"); 572: 573: /** 574: * Mathematical Operators. 575: * '\u2200' - '\u22FF'. 576: */ 577: public static final UnicodeBlock MATHEMATICAL_OPERATORS 578: = new UnicodeBlock('\u2200', '\u22FF', 579: "MATHEMATICAL_OPERATORS"); 580: 581: /** 582: * Miscellaneous Technical. 583: * '\u2300' - '\u23FF'. 
584: */ 585: public static final UnicodeBlock MISCELLANEOUS_TECHNICAL 586: = new UnicodeBlock('\u2300', '\u23FF', 587: "MISCELLANEOUS_TECHNICAL"); 588: 589: /** 590: * Control Pictures. 591: * '\u2400' - '\u243F'. 592: */ 593: public static final UnicodeBlock CONTROL_PICTURES 594: = new UnicodeBlock('\u2400', '\u243F', 595: "CONTROL_PICTURES"); 596: 597: /** 598: * Optical Character Recognition. 599: * '\u2440' - '\u245F'. 600: */ 601: public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION 602: = new UnicodeBlock('\u2440', '\u245F', 603: "OPTICAL_CHARACTER_RECOGNITION"); 604: 605: /** 606: * Enclosed Alphanumerics. 607: * '\u2460' - '\u24FF'. 608: */ 609: public static final UnicodeBlock ENCLOSED_ALPHANUMERICS 610: = new UnicodeBlock('\u2460', '\u24FF', 611: "ENCLOSED_ALPHANUMERICS"); 612: 613: /** 614: * Box Drawing. 615: * '\u2500' - '\u257F'. 616: */ 617: public static final UnicodeBlock BOX_DRAWING 618: = new UnicodeBlock('\u2500', '\u257F', 619: "BOX_DRAWING"); 620: 621: /** 622: * Block Elements. 623: * '\u2580' - '\u259F'. 624: */ 625: public static final UnicodeBlock BLOCK_ELEMENTS 626: = new UnicodeBlock('\u2580', '\u259F', 627: "BLOCK_ELEMENTS"); 628: 629: /** 630: * Geometric Shapes. 631: * '\u25A0' - '\u25FF'. 632: */ 633: public static final UnicodeBlock GEOMETRIC_SHAPES 634: = new UnicodeBlock('\u25A0', '\u25FF', 635: "GEOMETRIC_SHAPES"); 636: 637: /** 638: * Miscellaneous Symbols. 639: * '\u2600' - '\u26FF'. 640: */ 641: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS 642: = new UnicodeBlock('\u2600', '\u26FF', 643: "MISCELLANEOUS_SYMBOLS"); 644: 645: /** 646: * Dingbats. 647: * '\u2700' - '\u27BF'. 648: */ 649: public static final UnicodeBlock DINGBATS 650: = new UnicodeBlock('\u2700', '\u27BF', 651: "DINGBATS"); 652: 653: /** 654: * Braille Patterns. 655: * '\u2800' - '\u28FF'. 
656: * @since 1.4 657: */ 658: public static final UnicodeBlock BRAILLE_PATTERNS 659: = new UnicodeBlock('\u2800', '\u28FF', 660: "BRAILLE_PATTERNS"); 661: 662: /** 663: * CJK Radicals Supplement. 664: * '\u2E80' - '\u2EFF'. 665: * @since 1.4 666: */ 667: public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT 668: = new UnicodeBlock('\u2E80', '\u2EFF', 669: "CJK_RADICALS_SUPPLEMENT"); 670: 671: /** 672: * Kangxi Radicals. 673: * '\u2F00' - '\u2FDF'. 674: * @since 1.4 675: */ 676: public static final UnicodeBlock KANGXI_RADICALS 677: = new UnicodeBlock('\u2F00', '\u2FDF', 678: "KANGXI_RADICALS"); 679: 680: /** 681: * Ideographic Description Characters. 682: * '\u2FF0' - '\u2FFF'. 683: * @since 1.4 684: */ 685: public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS 686: = new UnicodeBlock('\u2FF0', '\u2FFF', 687: "IDEOGRAPHIC_DESCRIPTION_CHARACTERS"); 688: 689: /** 690: * CJK Symbols and Punctuation. 691: * '\u3000' - '\u303F'. 692: */ 693: public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION 694: = new UnicodeBlock('\u3000', '\u303F', 695: "CJK_SYMBOLS_AND_PUNCTUATION"); 696: 697: /** 698: * Hiragana. 699: * '\u3040' - '\u309F'. 700: */ 701: public static final UnicodeBlock HIRAGANA 702: = new UnicodeBlock('\u3040', '\u309F', 703: "HIRAGANA"); 704: 705: /** 706: * Katakana. 707: * '\u30A0' - '\u30FF'. 708: */ 709: public static final UnicodeBlock KATAKANA 710: = new UnicodeBlock('\u30A0', '\u30FF', 711: "KATAKANA"); 712: 713: /** 714: * Bopomofo. 715: * '\u3100' - '\u312F'. 716: */ 717: public static final UnicodeBlock BOPOMOFO 718: = new UnicodeBlock('\u3100', '\u312F', 719: "BOPOMOFO"); 720: 721: /** 722: * Hangul Compatibility Jamo. 723: * '\u3130' - '\u318F'. 724: */ 725: public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO 726: = new UnicodeBlock('\u3130', '\u318F', 727: "HANGUL_COMPATIBILITY_JAMO"); 728: 729: /** 730: * Kanbun. 731: * '\u3190' - '\u319F'. 
732: */ 733: public static final UnicodeBlock KANBUN 734: = new UnicodeBlock('\u3190', '\u319F', 735: "KANBUN"); 736: 737: /** 738: * Bopomofo Extended. 739: * '\u31A0' - '\u31BF'. 740: * @since 1.4 741: */ 742: public static final UnicodeBlock BOPOMOFO_EXTENDED 743: = new UnicodeBlock('\u31A0', '\u31BF', 744: "BOPOMOFO_EXTENDED"); 745: 746: /** 747: * Enclosed CJK Letters and Months. 748: * '\u3200' - '\u32FF'. 749: */ 750: public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS 751: = new UnicodeBlock('\u3200', '\u32FF', 752: "ENCLOSED_CJK_LETTERS_AND_MONTHS"); 753: 754: /** 755: * CJK Compatibility. 756: * '\u3300' - '\u33FF'. 757: */ 758: public static final UnicodeBlock CJK_COMPATIBILITY 759: = new UnicodeBlock('\u3300', '\u33FF', 760: "CJK_COMPATIBILITY"); 761: 762: /** 763: * CJK Unified Ideographs Extension A. 764: * '\u3400' - '\u4DB5'. 765: * @since 1.4 766: */ 767: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 768: = new UnicodeBlock('\u3400', '\u4DB5', 769: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A"); 770: 771: /** 772: * CJK Unified Ideographs. 773: * '\u4E00' - '\u9FFF'. 774: */ 775: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS 776: = new UnicodeBlock('\u4E00', '\u9FFF', 777: "CJK_UNIFIED_IDEOGRAPHS"); 778: 779: /** 780: * Yi Syllables. 781: * '\uA000' - '\uA48F'. 782: * @since 1.4 783: */ 784: public static final UnicodeBlock YI_SYLLABLES 785: = new UnicodeBlock('\uA000', '\uA48F', 786: "YI_SYLLABLES"); 787: 788: /** 789: * Yi Radicals. 790: * '\uA490' - '\uA4CF'. 791: * @since 1.4 792: */ 793: public static final UnicodeBlock YI_RADICALS 794: = new UnicodeBlock('\uA490', '\uA4CF', 795: "YI_RADICALS"); 796: 797: /** 798: * Hangul Syllables. 799: * '\uAC00' - '\uD7A3'. 800: */ 801: public static final UnicodeBlock HANGUL_SYLLABLES 802: = new UnicodeBlock('\uAC00', '\uD7A3', 803: "HANGUL_SYLLABLES"); 804: 805: /** 806: * Surrogates Area. 807: * '\uD800' - '\uDFFF'. 
808: */ 809: public static final UnicodeBlock SURROGATES_AREA 810: = new UnicodeBlock('\uD800', '\uDFFF', 811: "SURROGATES_AREA"); 812: 813: /** 814: * Private Use Area. 815: * '\uE000' - '\uF8FF'. 816: */ 817: public static final UnicodeBlock PRIVATE_USE_AREA 818: = new UnicodeBlock('\uE000', '\uF8FF', 819: "PRIVATE_USE_AREA"); 820: 821: /** 822: * CJK Compatibility Ideographs. 823: * '\uF900' - '\uFAFF'. 824: */ 825: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS 826: = new UnicodeBlock('\uF900', '\uFAFF', 827: "CJK_COMPATIBILITY_IDEOGRAPHS"); 828: 829: /** 830: * Alphabetic Presentation Forms. 831: * '\uFB00' - '\uFB4F'. 832: */ 833: public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS 834: = new UnicodeBlock('\uFB00', '\uFB4F', 835: "ALPHABETIC_PRESENTATION_FORMS"); 836: 837: /** 838: * Arabic Presentation Forms-A. 839: * '\uFB50' - '\uFDFF'. 840: */ 841: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A 842: = new UnicodeBlock('\uFB50', '\uFDFF', 843: "ARABIC_PRESENTATION_FORMS_A"); 844: 845: /** 846: * Combining Half Marks. 847: * '\uFE20' - '\uFE2F'. 848: */ 849: public static final UnicodeBlock COMBINING_HALF_MARKS 850: = new UnicodeBlock('\uFE20', '\uFE2F', 851: "COMBINING_HALF_MARKS"); 852: 853: /** 854: * CJK Compatibility Forms. 855: * '\uFE30' - '\uFE4F'. 856: */ 857: public static final UnicodeBlock CJK_COMPATIBILITY_FORMS 858: = new UnicodeBlock('\uFE30', '\uFE4F', 859: "CJK_COMPATIBILITY_FORMS"); 860: 861: /** 862: * Small Form Variants. 863: * '\uFE50' - '\uFE6F'. 864: */ 865: public static final UnicodeBlock SMALL_FORM_VARIANTS 866: = new UnicodeBlock('\uFE50', '\uFE6F', 867: "SMALL_FORM_VARIANTS"); 868: 869: /** 870: * Arabic Presentation Forms-B. 871: * '\uFE70' - '\uFEFE'. 872: */ 873: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B 874: = new UnicodeBlock('\uFE70', '\uFEFE', 875: "ARABIC_PRESENTATION_FORMS_B"); 876: 877: /** 878: * Halfwidth and Fullwidth Forms. 879: * '\uFF00' - '\uFFEF'. 
880: */ 881: public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS 882: = new UnicodeBlock('\uFF00', '\uFFEF', 883: "HALFWIDTH_AND_FULLWIDTH_FORMS"); 884: 885: /** 886: * Specials. 887: * '\uFEFF', '\uFFF0' - '\uFFFD'. Only the '\uFFF0' - '\uFFFD' range is passed to the constructor; '\uFEFF' is matched by an explicit check in {@link #of(char)}. 888: */ 889: public static final UnicodeBlock SPECIALS 890: = new UnicodeBlock('\uFFF0', '\uFFFD', 891: "SPECIALS"); 892: 893: /** 894: * The defined subsets, listed in ascending order of start character. {@link #of(char)} binary-searches this array, so new entries must keep that order. 895: */ 896: private static final UnicodeBlock sets[] = { 897: BASIC_LATIN, 898: LATIN_1_SUPPLEMENT, 899: LATIN_EXTENDED_A, 900: LATIN_EXTENDED_B, 901: IPA_EXTENSIONS, 902: SPACING_MODIFIER_LETTERS, 903: COMBINING_DIACRITICAL_MARKS, 904: GREEK, 905: CYRILLIC, 906: ARMENIAN, 907: HEBREW, 908: ARABIC, 909: SYRIAC, 910: THAANA, 911: DEVANAGARI, 912: BENGALI, 913: GURMUKHI, 914: GUJARATI, 915: ORIYA, 916: TAMIL, 917: TELUGU, 918: KANNADA, 919: MALAYALAM, 920: SINHALA, 921: THAI, 922: LAO, 923: TIBETAN, 924: MYANMAR, 925: GEORGIAN, 926: HANGUL_JAMO, 927: ETHIOPIC, 928: CHEROKEE, 929: UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 930: OGHAM, 931: RUNIC, 932: KHMER, 933: MONGOLIAN, 934: LATIN_EXTENDED_ADDITIONAL, 935: GREEK_EXTENDED, 936: GENERAL_PUNCTUATION, 937: SUPERSCRIPTS_AND_SUBSCRIPTS, 938: CURRENCY_SYMBOLS, 939: COMBINING_MARKS_FOR_SYMBOLS, 940: LETTERLIKE_SYMBOLS, 941: NUMBER_FORMS, 942: ARROWS, 943: MATHEMATICAL_OPERATORS, 944: MISCELLANEOUS_TECHNICAL, 945: CONTROL_PICTURES, 946: OPTICAL_CHARACTER_RECOGNITION, 947: ENCLOSED_ALPHANUMERICS, 948: BOX_DRAWING, 949: BLOCK_ELEMENTS, 950: GEOMETRIC_SHAPES, 951: MISCELLANEOUS_SYMBOLS, 952: DINGBATS, 953: BRAILLE_PATTERNS, 954: CJK_RADICALS_SUPPLEMENT, 955: KANGXI_RADICALS, 956: IDEOGRAPHIC_DESCRIPTION_CHARACTERS, 957: CJK_SYMBOLS_AND_PUNCTUATION, 958: HIRAGANA, 959: KATAKANA, 960: BOPOMOFO, 961: HANGUL_COMPATIBILITY_JAMO, 962: KANBUN, 963: BOPOMOFO_EXTENDED, 964: ENCLOSED_CJK_LETTERS_AND_MONTHS, 965: CJK_COMPATIBILITY, 966: CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, 967: CJK_UNIFIED_IDEOGRAPHS, 968: YI_SYLLABLES, 969: YI_RADICALS, 970: HANGUL_SYLLABLES, 
971: SURROGATES_AREA, 972: PRIVATE_USE_AREA, 973: CJK_COMPATIBILITY_IDEOGRAPHS, 974: ALPHABETIC_PRESENTATION_FORMS, 975: ARABIC_PRESENTATION_FORMS_A, 976: COMBINING_HALF_MARKS, 977: CJK_COMPATIBILITY_FORMS, 978: SMALL_FORM_VARIANTS, 979: ARABIC_PRESENTATION_FORMS_B, 980: HALFWIDTH_AND_FULLWIDTH_FORMS, 981: SPECIALS, 982: }; 983: } // class UnicodeBlock 984: 985: /** 986: * The immutable value of this Character. 987: * 988: * @serial the value of this Character 989: */ 990: private final char value; 991: 992: /** 993: * Compatible with JDK 1.0+. 994: */ 995: private static final long serialVersionUID = 3786198910865385080L; 996: 997: /** 998: * Smallest value allowed for radix arguments in Java. This value is 2. 999: * 1000: * @see #digit(char, int) 1001: * @see #forDigit(int, int) 1002: * @see Integer#toString(int, int) 1003: * @see Integer#valueOf(String) 1004: */ 1005: public static final int MIN_RADIX = 2; 1006: 1007: /** 1008: * Largest value allowed for radix arguments in Java. This value is 36. 1009: * 1010: * @see #digit(char, int) 1011: * @see #forDigit(int, int) 1012: * @see Integer#toString(int, int) 1013: * @see Integer#valueOf(String) 1014: */ 1015: public static final int MAX_RADIX = 36; 1016: 1017: /** 1018: * The minimum value the char data type can hold. 1019: * This value is <code>'\\u0000'</code>. 1020: */ 1021: public static final char MIN_VALUE = '\u0000'; 1022: 1023: /** 1024: * The maximum value the char data type can hold. 1025: * This value is <code>'\\uFFFF'</code>. 1026: */ 1027: public static final char MAX_VALUE = '\uFFFF'; 1028: 1029: /** 1030: * Class object representing the primitive char data type. 1031: * 1032: * @since 1.1 1033: */ 1034: public static final Class TYPE = VMClassLoader.getPrimitiveClass('C'); 1035: 1036: /** 1037: * The number of bits needed to represent a <code>char</code>. 
1038: * @since 1.5 1039: */ 1040: public static final int SIZE = 16; 1041: 1042: // This caches some Character values, and is used by boxing 1043: // conversions via valueOf(). We must cache at least 0..127; 1044: // this constant controls how much we actually cache. 1045: private static final int MAX_CACHE = 127; 1046: private static Character[] charCache = new Character[MAX_CACHE + 1]; 1047: 1048: /** 1049: * Lu = Letter, Uppercase (Informative). 1050: * 1051: * @since 1.1 1052: */ 1053: public static final byte UPPERCASE_LETTER = 1; 1054: 1055: /** 1056: * Ll = Letter, Lowercase (Informative). 1057: * 1058: * @since 1.1 1059: */ 1060: public static final byte LOWERCASE_LETTER = 2; 1061: 1062: /** 1063: * Lt = Letter, Titlecase (Informative). 1064: * 1065: * @since 1.1 1066: */ 1067: public static final byte TITLECASE_LETTER = 3; 1068: 1069: /** 1070: * Mn = Mark, Non-Spacing (Normative). 1071: * 1072: * @since 1.1 1073: */ 1074: public static final byte NON_SPACING_MARK = 6; 1075: 1076: /** 1077: * Mc = Mark, Spacing Combining (Normative). 1078: * 1079: * @since 1.1 1080: */ 1081: public static final byte COMBINING_SPACING_MARK = 8; 1082: 1083: /** 1084: * Me = Mark, Enclosing (Normative). 1085: * 1086: * @since 1.1 1087: */ 1088: public static final byte ENCLOSING_MARK = 7; 1089: 1090: /** 1091: * Nd = Number, Decimal Digit (Normative). 1092: * 1093: * @since 1.1 1094: */ 1095: public static final byte DECIMAL_DIGIT_NUMBER = 9; 1096: 1097: /** 1098: * Nl = Number, Letter (Normative). 1099: * 1100: * @since 1.1 1101: */ 1102: public static final byte LETTER_NUMBER = 10; 1103: 1104: /** 1105: * No = Number, Other (Normative). 1106: * 1107: * @since 1.1 1108: */ 1109: public static final byte OTHER_NUMBER = 11; 1110: 1111: /** 1112: * Zs = Separator, Space (Normative). 1113: * 1114: * @since 1.1 1115: */ 1116: public static final byte SPACE_SEPARATOR = 12; 1117: 1118: /** 1119: * Zl = Separator, Line (Normative). 
1120: * 1121: * @since 1.1 1122: */ 1123: public static final byte LINE_SEPARATOR = 13; 1124: 1125: /** 1126: * Zp = Separator, Paragraph (Normative). 1127: * 1128: * @since 1.1 1129: */ 1130: public static final byte PARAGRAPH_SEPARATOR = 14; 1131: 1132: /** 1133: * Cc = Other, Control (Normative). 1134: * 1135: * @since 1.1 1136: */ 1137: public static final byte CONTROL = 15; 1138: 1139: /** 1140: * Cf = Other, Format (Normative). 1141: * 1142: * @since 1.1 1143: */ 1144: public static final byte FORMAT = 16; 1145: 1146: /** 1147: * Cs = Other, Surrogate (Normative). 1148: * 1149: * @since 1.1 1150: */ 1151: public static final byte SURROGATE = 19; 1152: 1153: /** 1154: * Co = Other, Private Use (Normative). 1155: * 1156: * @since 1.1 1157: */ 1158: public static final byte PRIVATE_USE = 18; 1159: 1160: /** 1161: * Cn = Other, Not Assigned (Normative). 1162: * 1163: * @since 1.1 1164: */ 1165: public static final byte UNASSIGNED = 0; 1166: 1167: /** 1168: * Lm = Letter, Modifier (Informative). 1169: * 1170: * @since 1.1 1171: */ 1172: public static final byte MODIFIER_LETTER = 4; 1173: 1174: /** 1175: * Lo = Letter, Other (Informative). 1176: * 1177: * @since 1.1 1178: */ 1179: public static final byte OTHER_LETTER = 5; 1180: 1181: /** 1182: * Pc = Punctuation, Connector (Informative). 1183: * 1184: * @since 1.1 1185: */ 1186: public static final byte CONNECTOR_PUNCTUATION = 23; 1187: 1188: /** 1189: * Pd = Punctuation, Dash (Informative). 1190: * 1191: * @since 1.1 1192: */ 1193: public static final byte DASH_PUNCTUATION = 20; 1194: 1195: /** 1196: * Ps = Punctuation, Open (Informative). 1197: * 1198: * @since 1.1 1199: */ 1200: public static final byte START_PUNCTUATION = 21; 1201: 1202: /** 1203: * Pe = Punctuation, Close (Informative). 1204: * 1205: * @since 1.1 1206: */ 1207: public static final byte END_PUNCTUATION = 22; 1208: 1209: /** 1210: * Pi = Punctuation, Initial Quote (Informative). 
1211: * 1212: * @since 1.4 1213: */ 1214: public static final byte INITIAL_QUOTE_PUNCTUATION = 29; 1215: 1216: /** 1217: * Pf = Punctuation, Final Quote (Informative). 1218: * 1219: * @since 1.4 1220: */ 1221: public static final byte FINAL_QUOTE_PUNCTUATION = 30; 1222: 1223: /** 1224: * Po = Punctuation, Other (Informative). 1225: * 1226: * @since 1.1 1227: */ 1228: public static final byte OTHER_PUNCTUATION = 24; 1229: 1230: /** 1231: * Sm = Symbol, Math (Informative). 1232: * 1233: * @since 1.1 1234: */ 1235: public static final byte MATH_SYMBOL = 25; 1236: 1237: /** 1238: * Sc = Symbol, Currency (Informative). 1239: * 1240: * @since 1.1 1241: */ 1242: public static final byte CURRENCY_SYMBOL = 26; 1243: 1244: /** 1245: * Sk = Symbol, Modifier (Informative). 1246: * 1247: * @since 1.1 1248: */ 1249: public static final byte MODIFIER_SYMBOL = 27; 1250: 1251: /** 1252: * So = Symbol, Other (Informative). 1253: * 1254: * @since 1.1 1255: */ 1256: public static final byte OTHER_SYMBOL = 28; 1257: 1258: /** 1259: * Undefined bidirectional character type. Undefined char values have 1260: * undefined directionality in the Unicode specification. 1261: * 1262: * @since 1.4 1263: */ 1264: public static final byte DIRECTIONALITY_UNDEFINED = -1; 1265: 1266: /** 1267: * Strong bidirectional character type "L". 1268: * 1269: * @since 1.4 1270: */ 1271: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0; 1272: 1273: /** 1274: * Strong bidirectional character type "R". 1275: * 1276: * @since 1.4 1277: */ 1278: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1; 1279: 1280: /** 1281: * Strong bidirectional character type "AL". 1282: * 1283: * @since 1.4 1284: */ 1285: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2; 1286: 1287: /** 1288: * Weak bidirectional character type "EN". 1289: * 1290: * @since 1.4 1291: */ 1292: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3; 1293: 1294: /** 1295: * Weak bidirectional character type "ES". 
1296: * 1297: * @since 1.4 1298: */ 1299: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4; 1300: 1301: /** 1302: * Weak bidirectional character type "ET". 1303: * 1304: * @since 1.4 1305: */ 1306: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5; 1307: 1308: /** 1309: * Weak bidirectional character type "AN". 1310: * 1311: * @since 1.4 1312: */ 1313: public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6; 1314: 1315: /** 1316: * Weak bidirectional character type "CS". 1317: * 1318: * @since 1.4 1319: */ 1320: public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7; 1321: 1322: /** 1323: * Weak bidirectional character type "NSM". 1324: * 1325: * @since 1.4 1326: */ 1327: public static final byte DIRECTIONALITY_NONSPACING_MARK = 8; 1328: 1329: /** 1330: * Weak bidirectional character type "BN". 1331: * 1332: * @since 1.4 1333: */ 1334: public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9; 1335: 1336: /** 1337: * Neutral bidirectional character type "B". 1338: * 1339: * @since 1.4 1340: */ 1341: public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10; 1342: 1343: /** 1344: * Neutral bidirectional character type "S". 1345: * 1346: * @since 1.4 1347: */ 1348: public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11; 1349: 1350: /** 1351: * Neutral bidirectional character type "WS". 1352: * 1353: * @since 1.4 1354: */ 1355: public static final byte DIRECTIONALITY_WHITESPACE = 12; 1356: 1357: /** 1358: * Neutral bidirectional character type "ON". 1359: * 1360: * @since 1.4 1361: */ 1362: public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13; 1363: 1364: /** 1365: * Strong bidirectional character type "LRE". 1366: * 1367: * @since 1.4 1368: */ 1369: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14; 1370: 1371: /** 1372: * Strong bidirectional character type "LRO".
1373: * 1374: * @since 1.4 1375: */ 1376: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15; 1377: 1378: /** 1379: * Strong bidirectional character type "RLE". 1380: * 1381: * @since 1.4 1382: */ 1383: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16; 1384: 1385: /** 1386: * Strong bidirectional character type "RLO". 1387: * 1388: * @since 1.4 1389: */ 1390: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17; 1391: 1392: /** 1393: * Weak bidirectional character type "PDF". 1394: * 1395: * @since 1.4 1396: */ 1397: public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; 1398: 1399: /** 1400: * Stores unicode block offset lookup table. Exploit package visibility of 1401: * String.value to avoid copying the array. 1402: * @see #readChar(char) 1403: * @see CharData#BLOCKS 1404: */ 1405: private static final char[] blocks = String.zeroBasedStringValue(CharData.BLOCKS); 1406: 1407: /** 1408: * Stores unicode attribute offset lookup table. Exploit package visibility 1409: * of String.value to avoid copying the array. 1410: * @see CharData#DATA 1411: */ 1412: private static final char[] data = String.zeroBasedStringValue(CharData.DATA); 1413: 1414: /** 1415: * Stores unicode numeric value attribute table. Exploit package visibility 1416: * of String.value to avoid copying the array. 1417: * @see CharData#NUM_VALUE 1418: */ 1419: private static final char[] numValue 1420: = String.zeroBasedStringValue(CharData.NUM_VALUE); 1421: 1422: /** 1423: * Stores unicode uppercase attribute table. Exploit package visibility 1424: * of String.value to avoid copying the array. 1425: * @see CharData#UPPER 1426: */ 1427: private static final char[] upper = String.zeroBasedStringValue(CharData.UPPER); 1428: 1429: /** 1430: * Stores unicode lowercase attribute table. Exploit package visibility 1431: * of String.value to avoid copying the array. 
1432: * @see CharData#LOWER 1433: */ 1434: private static final char[] lower = String.zeroBasedStringValue(CharData.LOWER); 1435: 1436: /** 1437: * Stores unicode direction attribute table. Exploit package visibility 1438: * of String.value to avoid copying the array. 1439: * @see CharData#DIRECTION 1440: */ 1441: // Package visible for use by String. 1442: static final char[] direction = String.zeroBasedStringValue(CharData.DIRECTION); 1443: 1444: /** 1445: * Stores unicode titlecase table. Exploit package visibility of 1446: * String.value to avoid copying the array. 1447: * @see CharData#TITLE 1448: */ 1449: private static final char[] title = String.zeroBasedStringValue(CharData.TITLE); 1450: 1451: /** 1452: * Mask for grabbing the type out of the contents of data. 1453: * @see CharData#DATA 1454: */ 1455: private static final int TYPE_MASK = 0x1F; 1456: 1457: /** 1458: * Mask for grabbing the non-breaking space flag out of the contents of 1459: * data. 1460: * @see CharData#DATA 1461: */ 1462: private static final int NO_BREAK_MASK = 0x20; 1463: 1464: /** 1465: * Mask for grabbing the mirrored directionality flag out of the contents 1466: * of data. 1467: * @see CharData#DATA 1468: */ 1469: private static final int MIRROR_MASK = 0x40; 1470: 1471: /** 1472: * Min value for supplementary code point. 1473: * 1474: * @since 1.5 1475: */ 1476: public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 1477: 1478: /** 1479: * Min value for code point. 1480: * 1481: * @since 1.5 1482: */ 1483: public static final int MIN_CODE_POINT = 0; 1484: 1485: 1486: /** 1487: * Max value for code point. 1488: * 1489: * @since 1.5 1490: */ 1491: public static final int MAX_CODE_POINT = 0x010ffff; 1492: 1493: 1494: /** 1495: * Minimum high surrogate code in UTF-16 encoding. 1496: * 1497: * @since 1.5 1498: */ 1499: public static final char MIN_HIGH_SURROGATE = '\ud800'; 1500: 1501: /** 1502: * Maximum high surrogate code in UTF-16 encoding. 
1503: * 1504: * @since 1.5 1505: */ 1506: public static final char MAX_HIGH_SURROGATE = '\udbff'; 1507: 1508: /** 1509: * Minimum low surrogate code in UTF-16 encoding. 1510: * 1511: * @since 1.5 1512: */ 1513: public static final char MIN_LOW_SURROGATE = '\udc00'; 1514: 1515: /** 1516: * Maximum low surrogate code in UTF-16 encoding. 1517: * 1518: * @since 1.5 1519: */ 1520: public static final char MAX_LOW_SURROGATE = '\udfff'; 1521: 1522: /** 1523: * Minimum surrogate code in UTF-16 encoding. 1524: * 1525: * @since 1.5 1526: */ 1527: public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE; 1528: 1529: /** 1530: * Maximum surrogate code in UTF-16 encoding. 1531: * 1532: * @since 1.5 1533: */ 1534: public static final char MAX_SURROGATE = MAX_LOW_SURROGATE; 1535: 1536: /** 1537: * Grabs an attribute offset from the Unicode attribute database. The lower 1538: * 5 bits are the character type, the next 2 bits are flags, and the top 1539: * 9 bits are the offset into the attribute tables. 1540: * 1541: * @param ch the character to look up 1542: * @return the character's attribute offset and type 1543: * @see #TYPE_MASK 1544: * @see #NO_BREAK_MASK 1545: * @see #MIRROR_MASK 1546: * @see CharData#DATA 1547: * @see CharData#SHIFT 1548: */ 1549: // Package visible for use in String. 1550: static char readChar(char ch) 1551: { 1552: // Perform 16-bit addition to find the correct entry in data. 1553: return data[(char) (blocks[ch >> CharData.SHIFT] + ch)]; 1554: } 1555: 1556: /** 1557: * Wraps up a character. 1558: * 1559: * @param value the character to wrap 1560: */ 1561: public Character(char value) 1562: { 1563: this.value = value; 1564: } 1565: 1566: /** 1567: * Returns the character which has been wrapped by this class. 1568: * 1569: * @return the character wrapped 1570: */ 1571: public char charValue() 1572: { 1573: return value; 1574: } 1575: 1576: /** 1577: * Returns the numerical value (unsigned) of the wrapped character.
1578: * Range of returned values: 0x0000-0xFFFF. 1579: * 1580: * @return the value of the wrapped character 1581: */ 1582: public int hashCode() 1583: { 1584: return value; 1585: } 1586: 1587: /** 1588: * Determines if an object is equal to this object. This is only true for 1589: * another Character object wrapping the same value. 1590: * 1591: * @param o object to compare 1592: * @return true if o is a Character with the same value 1593: */ 1594: public boolean equals(Object o) 1595: { 1596: return o instanceof Character && value == ((Character) o).value; 1597: } 1598: 1599: /** 1600: * Converts the wrapped character into a String. 1601: * 1602: * @return a String containing one character -- the wrapped character 1603: * of this instance 1604: */ 1605: public String toString() 1606: { 1607: // Package constructor avoids an array copy. 1608: return new String(new char[] { value }, 0, 1, true); 1609: } 1610: 1611: /** 1612: * Returns a String of length 1 representing the specified character. 1613: * 1614: * @param ch the character to convert 1615: * @return a String containing the character 1616: * @since 1.4 1617: */ 1618: public static String toString(char ch) 1619: { 1620: // Package constructor avoids an array copy. 1621: return new String(new char[] { ch }, 0, 1, true); 1622: } 1623: 1624: /** 1625: * Determines if a character is a Unicode lowercase letter. For example, 1626: * <code>'a'</code> is lowercase. 1627: * <br> 1628: * lowercase = [Ll] 1629: * 1630: * @param ch character to test 1631: * @return true if ch is a Unicode lowercase letter, else false 1632: * @see #isUpperCase(char) 1633: * @see #isTitleCase(char) 1634: * @see #toLowerCase(char) 1635: * @see #getType(char) 1636: */ 1637: public static boolean isLowerCase(char ch) 1638: { 1639: return getType(ch) == LOWERCASE_LETTER; 1640: } 1641: 1642: /** 1643: * Determines if a character is a Unicode uppercase letter. For example, 1644: * <code>'A'</code> is uppercase. 
1645: * <br> 1646: * uppercase = [Lu] 1647: * 1648: * @param ch character to test 1649: * @return true if ch is a Unicode uppercase letter, else false 1650: * @see #isLowerCase(char) 1651: * @see #isTitleCase(char) 1652: * @see #toUpperCase(char) 1653: * @see #getType(char) 1654: */ 1655: public static boolean isUpperCase(char ch) 1656: { 1657: return getType(ch) == UPPERCASE_LETTER; 1658: } 1659: 1660: /** 1661: * Determines if a character is a Unicode titlecase letter. For example, 1662: * the character "Lj" (Latin capital L with small letter j) is titlecase. 1663: * <br> 1664: * titlecase = [Lt] 1665: * 1666: * @param ch character to test 1667: * @return true if ch is a Unicode titlecase letter, else false 1668: * @see #isLowerCase(char) 1669: * @see #isUpperCase(char) 1670: * @see #toTitleCase(char) 1671: * @see #getType(char) 1672: */ 1673: public static boolean isTitleCase(char ch) 1674: { 1675: return getType(ch) == TITLECASE_LETTER; 1676: } 1677: 1678: /** 1679: * Determines if a character is a Unicode decimal digit. For example, 1680: * <code>'0'</code> is a digit. 1681: * <br> 1682: * Unicode decimal digit = [Nd] 1683: * 1684: * @param ch character to test 1685: * @return true if ch is a Unicode decimal digit, else false 1686: * @see #digit(char, int) 1687: * @see #forDigit(int, int) 1688: * @see #getType(char) 1689: */ 1690: public static boolean isDigit(char ch) 1691: { 1692: return getType(ch) == DECIMAL_DIGIT_NUMBER; 1693: } 1694: 1695: /** 1696: * Determines if a character is part of the Unicode Standard. This is an 1697: * evolving standard, but covers every character in the data file. 
1698: * <br> 1699: * defined = not [Cn] 1700: * 1701: * @param ch character to test 1702: * @return true if ch is a Unicode character, else false 1703: * @see #isDigit(char) 1704: * @see #isLetter(char) 1705: * @see #isLetterOrDigit(char) 1706: * @see #isLowerCase(char) 1707: * @see #isTitleCase(char) 1708: * @see #isUpperCase(char) 1709: */ 1710: public static boolean isDefined(char ch) 1711: { 1712: return getType(ch) != UNASSIGNED; 1713: } 1714: 1715: /** 1716: * Determines if a character is a Unicode letter. Not all letters have case, 1717: * so this may return true when isLowerCase and isUpperCase return false. 1718: * <br> 1719: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 1720: * 1721: * @param ch character to test 1722: * @return true if ch is a Unicode letter, else false 1723: * @see #isDigit(char) 1724: * @see #isJavaIdentifierStart(char) 1725: * @see #isJavaLetter(char) 1726: * @see #isJavaLetterOrDigit(char) 1727: * @see #isLetterOrDigit(char) 1728: * @see #isLowerCase(char) 1729: * @see #isTitleCase(char) 1730: * @see #isUnicodeIdentifierStart(char) 1731: * @see #isUpperCase(char) 1732: */ 1733: public static boolean isLetter(char ch) 1734: { 1735: return ((1 << getType(ch)) 1736: & ((1 << UPPERCASE_LETTER) 1737: | (1 << LOWERCASE_LETTER) 1738: | (1 << TITLECASE_LETTER) 1739: | (1 << MODIFIER_LETTER) 1740: | (1 << OTHER_LETTER))) != 0; 1741: } 1742: 1743: /** 1744: * Determines if a character is a Unicode letter or a Unicode digit. This 1745: * is the combination of isLetter and isDigit. 
1746: * <br> 1747: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 1748: * 1749: * @param ch character to test 1750: * @return true if ch is a Unicode letter or a Unicode digit, else false 1751: * @see #isDigit(char) 1752: * @see #isJavaIdentifierPart(char) 1753: * @see #isJavaLetter(char) 1754: * @see #isJavaLetterOrDigit(char) 1755: * @see #isLetter(char) 1756: * @see #isUnicodeIdentifierPart(char) 1757: */ 1758: public static boolean isLetterOrDigit(char ch) 1759: { 1760: return ((1 << getType(ch)) 1761: & ((1 << UPPERCASE_LETTER) 1762: | (1 << LOWERCASE_LETTER) 1763: | (1 << TITLECASE_LETTER) 1764: | (1 << MODIFIER_LETTER) 1765: | (1 << OTHER_LETTER) 1766: | (1 << DECIMAL_DIGIT_NUMBER))) != 0; 1767: } 1768: 1769: /** 1770: * Determines if a character can start a Java identifier. This is the 1771: * combination of isLetter, any character where getType returns 1772: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 1773: * (like '_'). 1774: * 1775: * @param ch character to test 1776: * @return true if ch can start a Java identifier, else false 1777: * @deprecated Replaced by {@link #isJavaIdentifierStart(char)} 1778: * @see #isJavaLetterOrDigit(char) 1779: * @see #isJavaIdentifierStart(char) 1780: * @see #isJavaIdentifierPart(char) 1781: * @see #isLetter(char) 1782: * @see #isLetterOrDigit(char) 1783: * @see #isUnicodeIdentifierStart(char) 1784: */ 1785: public static boolean isJavaLetter(char ch) 1786: { 1787: return isJavaIdentifierStart(ch); 1788: } 1789: 1790: /** 1791: * Determines if a character can follow the first letter in 1792: * a Java identifier. This is the combination of isJavaLetter (isLetter, 1793: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 1794: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 1795: * or isIdentifierIgnorable. 
1796: * 1797: * @param ch character to test 1798: * @return true if ch can follow the first letter in a Java identifier 1799: * @deprecated Replaced by {@link #isJavaIdentifierPart(char)} 1800: * @see #isJavaLetter(char) 1801: * @see #isJavaIdentifierStart(char) 1802: * @see #isJavaIdentifierPart(char) 1803: * @see #isLetter(char) 1804: * @see #isLetterOrDigit(char) 1805: * @see #isUnicodeIdentifierPart(char) 1806: * @see #isIdentifierIgnorable(char) 1807: */ 1808: public static boolean isJavaLetterOrDigit(char ch) 1809: { 1810: return isJavaIdentifierPart(ch); 1811: } 1812: 1813: /** 1814: * Determines if a character can start a Java identifier. This is the 1815: * combination of isLetter, any character where getType returns 1816: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 1817: * (like '_'). 1818: * <br> 1819: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 1820: * 1821: * @param ch character to test 1822: * @return true if ch can start a Java identifier, else false 1823: * @see #isJavaIdentifierPart(char) 1824: * @see #isLetter(char) 1825: * @see #isUnicodeIdentifierStart(char) 1826: * @since 1.1 1827: */ 1828: public static boolean isJavaIdentifierStart(char ch) 1829: { 1830: return ((1 << getType(ch)) 1831: & ((1 << UPPERCASE_LETTER) 1832: | (1 << LOWERCASE_LETTER) 1833: | (1 << TITLECASE_LETTER) 1834: | (1 << MODIFIER_LETTER) 1835: | (1 << OTHER_LETTER) 1836: | (1 << LETTER_NUMBER) 1837: | (1 << CURRENCY_SYMBOL) 1838: | (1 << CONNECTOR_PUNCTUATION))) != 0; 1839: } 1840: 1841: /** 1842: * Determines if a character can follow the first letter in 1843: * a Java identifier. This is the combination of isJavaLetter (isLetter, 1844: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 1845: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 1846: * or isIdentifierIgnorable. 
1847: * <br> 1848: * Java identifier extender = 1849: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 1850: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 1851: * 1852: * @param ch character to test 1853: * @return true if ch can follow the first letter in a Java identifier 1854: * @see #isIdentifierIgnorable(char) 1855: * @see #isJavaIdentifierStart(char) 1856: * @see #isLetterOrDigit(char) 1857: * @see #isUnicodeIdentifierPart(char) 1858: * @since 1.1 1859: */ 1860: public static boolean isJavaIdentifierPart(char ch) 1861: { 1862: int category = getType(ch); 1863: return ((1 << category) 1864: & ((1 << UPPERCASE_LETTER) 1865: | (1 << LOWERCASE_LETTER) 1866: | (1 << TITLECASE_LETTER) 1867: | (1 << MODIFIER_LETTER) 1868: | (1 << OTHER_LETTER) 1869: | (1 << NON_SPACING_MARK) 1870: | (1 << COMBINING_SPACING_MARK) 1871: | (1 << DECIMAL_DIGIT_NUMBER) 1872: | (1 << LETTER_NUMBER) 1873: | (1 << CURRENCY_SYMBOL) 1874: | (1 << CONNECTOR_PUNCTUATION) 1875: | (1 << FORMAT))) != 0 1876: || (category == CONTROL && isIdentifierIgnorable(ch)); 1877: } 1878: 1879: /** 1880: * Determines if a character can start a Unicode identifier. Only 1881: * letters can start a Unicode identifier, but this includes characters 1882: * in LETTER_NUMBER. 
1883: * <br> 1884: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 1885: * 1886: * @param ch character to test 1887: * @return true if ch can start a Unicode identifier, else false 1888: * @see #isJavaIdentifierStart(char) 1889: * @see #isLetter(char) 1890: * @see #isUnicodeIdentifierPart(char) 1891: * @since 1.1 1892: */ 1893: public static boolean isUnicodeIdentifierStart(char ch) 1894: { 1895: return ((1 << getType(ch)) 1896: & ((1 << UPPERCASE_LETTER) 1897: | (1 << LOWERCASE_LETTER) 1898: | (1 << TITLECASE_LETTER) 1899: | (1 << MODIFIER_LETTER) 1900: | (1 << OTHER_LETTER) 1901: | (1 << LETTER_NUMBER))) != 0; 1902: } 1903: 1904: /** 1905: * Determines if a character can follow the first letter in 1906: * a Unicode identifier. This includes letters, connecting punctuation, 1907: * digits, numeric letters, combining marks, non-spacing marks, and 1908: * isIdentifierIgnorable. 1909: * <br> 1910: * Unicode identifier extender = 1911: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf] 1912: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 1913: * 1914: * @param ch character to test 1915: * @return true if ch can follow the first letter in a Unicode identifier 1916: * @see #isIdentifierIgnorable(char) 1917: * @see #isJavaIdentifierPart(char) 1918: * @see #isLetterOrDigit(char) 1919: * @see #isUnicodeIdentifierStart(char) 1920: * @since 1.1 1921: */ 1922: public static boolean isUnicodeIdentifierPart(char ch) 1923: { 1924: int category = getType(ch); 1925: return ((1 << category) 1926: & ((1 << UPPERCASE_LETTER) 1927: | (1 << LOWERCASE_LETTER) 1928: | (1 << TITLECASE_LETTER) 1929: | (1 << MODIFIER_LETTER) 1930: | (1 << OTHER_LETTER) 1931: | (1 << NON_SPACING_MARK) 1932: | (1 << COMBINING_SPACING_MARK) 1933: | (1 << DECIMAL_DIGIT_NUMBER) 1934: | (1 << LETTER_NUMBER) 1935: | (1 << CONNECTOR_PUNCTUATION) 1936: | (1 << FORMAT))) != 0 1937: || (category == CONTROL && isIdentifierIgnorable(ch)); 1938: } 1939: 1940: /** 1941: * Determines if a character is 
ignorable in a Unicode identifier. This 1942: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 1943: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 1944: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 1945: * <code>'\u009F'</code>), and FORMAT characters. 1946: * <br> 1947: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 1948: * |U+007F-U+009F 1949: * 1950: * @param ch character to test 1951: * @return true if ch is ignorable in a Unicode or Java identifier 1952: * @see #isJavaIdentifierPart(char) 1953: * @see #isUnicodeIdentifierPart(char) 1954: * @since 1.1 1955: */ 1956: public static boolean isIdentifierIgnorable(char ch) 1957: { 1958: return (ch <= '\u009F' && (ch < '\t' || ch >= '\u007F' 1959: || (ch <= '\u001B' && ch >= '\u000E'))) 1960: || getType(ch) == FORMAT; 1961: } 1962: 1963: /** 1964: * Converts a Unicode character into its lowercase equivalent mapping. 1965: * If a mapping does not exist, then the character passed is returned. 1966: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 1967: * 1968: * @param ch character to convert to lowercase 1969: * @return lowercase mapping of ch, or ch if lowercase mapping does 1970: * not exist 1971: * @see #isLowerCase(char) 1972: * @see #isUpperCase(char) 1973: * @see #toTitleCase(char) 1974: * @see #toUpperCase(char) 1975: */ 1976: public static char toLowerCase(char ch) 1977: { 1978: // Signedness doesn't matter, as result is cast back to char. 1979: return (char) (ch + lower[readChar(ch) >> 7]); 1980: } 1981: 1982: /** 1983: * Converts a Unicode character into its uppercase equivalent mapping. 1984: * If a mapping does not exist, then the character passed is returned. 1985: * Note that isUpperCase(toUpperCase(ch)) does not always return true. 
1986: * 1987: * @param ch character to convert to uppercase 1988: * @return uppercase mapping of ch, or ch if uppercase mapping does 1989: * not exist 1990: * @see #isLowerCase(char) 1991: * @see #isUpperCase(char) 1992: * @see #toLowerCase(char) 1993: * @see #toTitleCase(char) 1994: */ 1995: public static char toUpperCase(char ch) 1996: { 1997: // Signedness doesn't matter, as result is cast back to char. 1998: return (char) (ch + upper[readChar(ch) >> 7]); 1999: } 2000: 2001: /** 2002: * Converts a Unicode character into its titlecase equivalent mapping. 2003: * If a mapping does not exist, then the character passed is returned. 2004: * Note that isTitleCase(toTitleCase(ch)) does not always return true. 2005: * 2006: * @param ch character to convert to titlecase 2007: * @return titlecase mapping of ch, or ch if titlecase mapping does 2008: * not exist 2009: * @see #isTitleCase(char) 2010: * @see #toLowerCase(char) 2011: * @see #toUpperCase(char) 2012: */ 2013: public static char toTitleCase(char ch) 2014: { 2015: // As title is short, it doesn't hurt to exhaustively iterate over it. 2016: for (int i = title.length - 2; i >= 0; i -= 2) 2017: if (title[i] == ch) 2018: return title[i + 1]; 2019: return toUpperCase(ch); 2020: } 2021: 2022: /** 2023: * Converts a character into a digit of the specified radix. If the radix 2024: * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 2025: * exceeds the radix, or if ch is not a decimal digit or in the case 2026: * insensitive set of 'a'-'z', the result is -1. 
2027: * <br> 2028: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 2029: * |U+FF21-U+FF3A|U+FF41-U+FF5A 2030: * 2031: * @param ch character to convert into a digit 2032: * @param radix radix in which ch is a digit 2033: * @return digit which ch represents in radix, or -1 if ch is not a valid digit in radix 2034: * @see #MIN_RADIX 2035: * @see #MAX_RADIX 2036: * @see #forDigit(int, int) 2037: * @see #isDigit(char) 2038: * @see #getNumericValue(char) 2039: */ 2040: public static int digit(char ch, int radix) 2041: { 2042: if (radix < MIN_RADIX || radix > MAX_RADIX) 2043: return -1; 2044: char attr = readChar(ch); 2045: if (((1 << (attr & TYPE_MASK)) 2046: & ((1 << UPPERCASE_LETTER) 2047: | (1 << LOWERCASE_LETTER) 2048: | (1 << DECIMAL_DIGIT_NUMBER))) != 0) 2049: { 2050: // Signedness doesn't matter; 0xffff vs. -1 are both rejected. 2051: int digit = numValue[attr >> 7]; 2052: return (digit < radix) ? digit : -1; 2053: } 2054: return -1; 2055: } 2056: 2057: /** 2058: * Returns the Unicode numeric value property of a character. For example, 2059: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 2060: * 2061: * <p>This method also returns values for the letters A through Z, (not 2062: * specified by Unicode), in these ranges: <code>'\u0041'</code> 2063: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 2064: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 2065: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 2066: * <code>'\uFF5A'</code> (full width variants). 2067: * 2068: * <p>If the character lacks a numeric value property, -1 is returned. 2069: * If the character has a numeric value property which is not representable 2070: * as a nonnegative integer, such as a fraction, -2 is returned.
2071: * 2072: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 2073: * |U+FF21-U+FF3A|U+FF41-U+FF5A 2074: * 2075: * @param ch character from which the numeric value property will 2076: * be retrieved 2077: * @return the numeric value property of ch, or -1 if it does not exist, or 2078: * -2 if it is not representable as a nonnegative integer 2079: * @see #forDigit(int, int) 2080: * @see #digit(char, int) 2081: * @see #isDigit(char) 2082: * @since 1.1 2083: */ 2084: public static int getNumericValue(char ch) 2085: { 2086: // Treat numValue as signed. 2087: return (short) numValue[readChar(ch) >> 7]; 2088: } 2089: 2090: /** 2091: * Determines if a character is an ISO-LATIN-1 space. This is only the five 2092: * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>, 2093: * <code>'\r'</code>, and <code>' '</code>. 2094: * <br> 2095: * Java space = U+0020|U+0009|U+000A|U+000C|U+000D 2096: * 2097: * @param ch character to test 2098: * @return true if ch is a space, else false 2099: * @deprecated Replaced by {@link #isWhitespace(char)} 2100: * @see #isSpaceChar(char) 2101: * @see #isWhitespace(char) 2102: */ 2103: public static boolean isSpace(char ch) 2104: { 2105: // Use a 64-bit mask: ' ' is bit 32, which an int shift would wrap onto bit 0 (NUL). 2106: return ch <= ' ' && ((1L << ch) 2107: & ((1L << ' ') 2108: | (1L << '\t') 2109: | (1L << '\n') 2110: | (1L << '\r') 2111: | (1L << '\f'))) != 0; 2112: } 2113: 2114: /** 2115: * Determines if a character is a Unicode space character. This includes 2116: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR.
2117: * <br> 2118: * Unicode space = [Zs]|[Zp]|[Zl] 2119: * 2120: * @param ch character to test 2121: * @return true if ch is a Unicode space, else false 2122: * @see #isWhitespace(char) 2123: * @since 1.1 2124: */ 2125: public static boolean isSpaceChar(char ch) 2126: { 2127: return ((1 << getType(ch)) 2128: & ((1 << SPACE_SEPARATOR) 2129: | (1 << LINE_SEPARATOR) 2130: | (1 << PARAGRAPH_SEPARATOR))) != 0; 2131: } 2132: 2133: /** 2134: * Determines if a character is Java whitespace. This includes Unicode 2135: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 2136: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 2137: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 2138: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 2139: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 2140: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 2141: * and <code>'\u001F'</code>. 2142: * <br> 2143: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 2144: * 2145: * @param ch character to test 2146: * @return true if ch is Java whitespace, else false 2147: * @see #isSpaceChar(char) 2148: * @since 1.1 2149: */ 2150: public static boolean isWhitespace(char ch) 2151: { 2152: int attr = readChar(ch); 2153: return ((((1 << (attr & TYPE_MASK)) 2154: & ((1 << SPACE_SEPARATOR) 2155: | (1 << LINE_SEPARATOR) 2156: | (1 << PARAGRAPH_SEPARATOR))) != 0) 2157: && (attr & NO_BREAK_MASK) == 0) 2158: || (ch <= '\u001F' && ((1 << ch) 2159: & ((1 << '\t') 2160: | (1 << '\n') 2161: | (1 << '\u000B') 2162: | (1 << '\u000C') 2163: | (1 << '\r') 2164: | (1 << '\u001C') 2165: | (1 << '\u001D') 2166: | (1 << '\u001E') 2167: | (1 << '\u001F'))) != 0); 2168: } 2169: 2170: /** 2171: * Determines if a character has the ISO Control property. 
2172: * <br> 2173: * ISO Control = [Cc] 2174: * 2175: * @param ch character to test 2176: * @return true if ch is an ISO Control character, else false 2177: * @see #isSpaceChar(char) 2178: * @see #isWhitespace(char) 2179: * @since 1.1 2180: */ 2181: public static boolean isISOControl(char ch) 2182: { 2183: return getType(ch) == CONTROL; 2184: } 2185: 2186: /** 2187: * Returns the Unicode general category property of a character. 2188: * 2189: * @param ch character from which the general category property will 2190: * be retrieved 2191: * @return the character category property of ch as an integer 2192: * @see #UNASSIGNED 2193: * @see #UPPERCASE_LETTER 2194: * @see #LOWERCASE_LETTER 2195: * @see #TITLECASE_LETTER 2196: * @see #MODIFIER_LETTER 2197: * @see #OTHER_LETTER 2198: * @see #NON_SPACING_MARK 2199: * @see #ENCLOSING_MARK 2200: * @see #COMBINING_SPACING_MARK 2201: * @see #DECIMAL_DIGIT_NUMBER 2202: * @see #LETTER_NUMBER 2203: * @see #OTHER_NUMBER 2204: * @see #SPACE_SEPARATOR 2205: * @see #LINE_SEPARATOR 2206: * @see #PARAGRAPH_SEPARATOR 2207: * @see #CONTROL 2208: * @see #FORMAT 2209: * @see #PRIVATE_USE 2210: * @see #SURROGATE 2211: * @see #DASH_PUNCTUATION 2212: * @see #START_PUNCTUATION 2213: * @see #END_PUNCTUATION 2214: * @see #CONNECTOR_PUNCTUATION 2215: * @see #OTHER_PUNCTUATION 2216: * @see #MATH_SYMBOL 2217: * @see #CURRENCY_SYMBOL 2218: * @see #MODIFIER_SYMBOL 2219: * @see #INITIAL_QUOTE_PUNCTUATION 2220: * @see #FINAL_QUOTE_PUNCTUATION 2221: * @since 1.1 2222: */ 2223: public static int getType(char ch) 2224: { 2225: return readChar(ch) & TYPE_MASK; 2226: } 2227: 2228: /** 2229: * Converts a digit into a character which represents that digit 2230: * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX, 2231: * or the digit exceeds the radix, then the null character <code>'\0'</code> 2232: * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'. 
2233: * <br> 2234: * return value boundary = U+0030-U+0039|U+0061-U+007A 2235: * 2236: * @param digit digit to be converted into a character 2237: * @param radix radix of digit 2238: * @return character representing digit in radix, or '\0' 2239: * @see #MIN_RADIX 2240: * @see #MAX_RADIX 2241: * @see #digit(char, int) 2242: */ 2243: public static char forDigit(int digit, int radix) 2244: { 2245: if (radix < MIN_RADIX || radix > MAX_RADIX 2246: || digit < 0 || digit >= radix) 2247: return '\0'; 2248: return Number.digits[digit]; 2249: } 2250: 2251: /** 2252: * Returns the Unicode directionality property of the character. This 2253: * is used in the visual ordering of text. 2254: * 2255: * @param ch the character to look up 2256: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 2257: * @see #DIRECTIONALITY_UNDEFINED 2258: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 2259: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 2260: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 2261: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 2262: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 2263: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 2264: * @see #DIRECTIONALITY_ARABIC_NUMBER 2265: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 2266: * @see #DIRECTIONALITY_NONSPACING_MARK 2267: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 2268: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 2269: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 2270: * @see #DIRECTIONALITY_WHITESPACE 2271: * @see #DIRECTIONALITY_OTHER_NEUTRALS 2272: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 2273: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 2274: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 2275: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 2276: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 2277: * @since 1.4 2278: */ 2279: public static byte getDirectionality(char ch) 2280: { 2281: // The result will correctly be signed. 
2282: return (byte) (direction[readChar(ch) >> 7] >> 2); 2283: } 2284: 2285: /** 2286: * Determines whether the character is mirrored according to Unicode. For 2287: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 2288: * left-to-right text, but ')' in right-to-left text. 2289: * 2290: * @param ch the character to look up 2291: * @return true if the character is mirrored 2292: * @since 1.4 2293: */ 2294: public static boolean isMirrored(char ch) 2295: { 2296: return (readChar(ch) & MIRROR_MASK) != 0; 2297: } 2298: 2299: /** 2300: * Compares another Character to this Character, numerically. 2301: * 2302: * @param anotherCharacter Character to compare with this Character 2303: * @return a negative integer if this Character is less than 2304: * anotherCharacter, zero if this Character is equal, and 2305: * a positive integer if this Character is greater 2306: * @throws NullPointerException if anotherCharacter is null 2307: * @since 1.2 2308: */ 2309: public int compareTo(Character anotherCharacter) 2310: { 2311: return value - anotherCharacter.value; 2312: } 2313: 2314: /** 2315: * Compares an object to this Character. Assuming the object is a 2316: * Character object, this method performs the same comparison as 2317: * compareTo(Character). 2318: * 2319: * @param o object to compare 2320: * @return the comparison value 2321: * @throws ClassCastException if o is not a Character object 2322: * @throws NullPointerException if o is null 2323: * @see #compareTo(Character) 2324: * @since 1.2 2325: */ 2326: public int compareTo(Object o) 2327: { 2328: return compareTo((Character) o); 2329: } 2330: 2331: /** 2332: * Returns an <code>Character</code> object wrapping the value. 2333: * In contrast to the <code>Character</code> constructor, this method 2334: * will cache some values. It is used by boxing conversion. 
2335: * 2336: * @param val the value to wrap 2337: * @return the <code>Character</code> 2338: * 2339: * @since 1.5 2340: */ 2341: public static Character valueOf(char val) 2342: { 2343: if (val > MAX_CACHE) 2344: return new Character(val); 2345: synchronized (charCache) 2346: { 2347: if (charCache[val - MIN_VALUE] == null) 2348: charCache[val - MIN_VALUE] = new Character(val); 2349: return charCache[val - MIN_VALUE]; 2350: } 2351: } 2352: 2353: /** 2354: * Reverse the bytes in val. 2355: * @since 1.5 2356: */ 2357: public static char reverseBytes(char val) 2358: { 2359: return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00)); 2360: } 2361: 2362: /** 2363: * Converts a unicode code point to a UTF-16 representation of that 2364: * code point. 2365: * 2366: * @param codePoint the unicode code point 2367: * 2368: * @return the UTF-16 representation of that code point 2369: * 2370: * @throws IllegalArgumentException if the code point is not a valid 2371: * unicode code point 2372: * 2373: * @since 1.5 2374: */ 2375: public static char[] toChars(int codePoint) 2376: { 2377: char[] result = new char[charCount(codePoint)]; 2378: int ignore = toChars(codePoint, result, 0); 2379: return result; 2380: } 2381: 2382: /** 2383: * Converts a unicode code point to its UTF-16 representation. 
2384: * 2385: * @param codePoint the unicode code point 2386: * @param dst the target char array 2387: * @param dstIndex the start index for the target 2388: * 2389: * @return number of characters written to <code>dst</code> 2390: * 2391: * @throws IllegalArgumentException if <code>codePoint</code> is not a 2392: * valid unicode code point 2393: * @throws NullPointerException if <code>dst</code> is <code>null</code> 2394: * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid 2395: * in <code>dst</code> or if the UTF-16 representation does not 2396: * fit into <code>dst</code> 2397: * 2398: * @since 1.5 2399: */ 2400: public static int toChars(int codePoint, char[] dst, int dstIndex) 2401: { 2402: if (!isValidCodePoint(codePoint)) 2403: { 2404: throw new IllegalArgumentException("not a valid code point: " 2405: + codePoint); 2406: } 2407: 2408: int result; 2409: if (isSupplementaryCodePoint(codePoint)) 2410: { 2411: // Write second char first to cause IndexOutOfBoundsException 2412: // immediately. 2413: final int cp2 = codePoint - 0x10000; 2414: dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE); 2415: dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE); 2416: result = 2; 2417: } 2418: else 2419: { 2420: dst[dstIndex] = (char) codePoint; 2421: result = 1; 2422: } 2423: return result; 2424: } 2425: 2426: /** 2427: * Return number of 16-bit characters required to represent the given 2428: * code point. 2429: * 2430: * @param codePoint a unicode code point 2431: * 2432: * @return 2 if codePoint >= 0x10000, 1 otherwise. 2433: * 2434: * @since 1.5 2435: */ 2436: public static int charCount(int codePoint) 2437: { 2438: return 2439: (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) 2440: ? 2 2441: : 1; 2442: } 2443: 2444: /** 2445: * Determines whether the specified code point is 2446: * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode 2447: * supplementary character range. 
2448: * 2449: * @param codePoint a Unicode code point 2450: * 2451: * @return <code>true</code> if code point is in supplementary range 2452: * 2453: * @since 1.5 2454: */ 2455: public static boolean isSupplementaryCodePoint(int codePoint) 2456: { 2457: return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT 2458: && codePoint <= MAX_CODE_POINT; 2459: } 2460: 2461: /** 2462: * Determines whether the specified code point is 2463: * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point. 2464: * 2465: * @param codePoint a Unicode code point 2466: * 2467: * @return <code>true</code> if code point is valid 2468: * 2469: * @since 1.5 2470: */ 2471: public static boolean isValidCodePoint(int codePoint) 2472: { 2473: return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT; 2474: } 2475: 2476: /** 2477: * Return true if the given character is a high surrogate. 2478: * @param ch the character 2479: * @return true if the character is a high surrogate character 2480: * 2481: * @since 1.5 2482: */ 2483: public static boolean isHighSurrogate(char ch) 2484: { 2485: return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE; 2486: } 2487: 2488: /** 2489: * Return true if the given character is a low surrogate. 2490: * @param ch the character 2491: * @return true if the character is a low surrogate character 2492: * 2493: * @since 1.5 2494: */ 2495: public static boolean isLowSurrogate(char ch) 2496: { 2497: return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE; 2498: } 2499: 2500: /** 2501: * Return true if the given characters compose a surrogate pair. 2502: * This is true if the first character is a high surrogate and the 2503: * second character is a low surrogate. 
2504: * @param ch1 the first character 2505: * @param ch2 the first character 2506: * @return true if the characters compose a surrogate pair 2507: * 2508: * @since 1.5 2509: */ 2510: public static boolean isSurrogatePair(char ch1, char ch2) 2511: { 2512: return isHighSurrogate(ch1) && isLowSurrogate(ch2); 2513: } 2514: 2515: /** 2516: * Given a valid surrogate pair, this returns the corresponding 2517: * code point. 2518: * @param high the high character of the pair 2519: * @param low the low character of the pair 2520: * @return the corresponding code point 2521: * 2522: * @since 1.5 2523: */ 2524: public static int toCodePoint(char high, char low) 2525: { 2526: return ((high - MIN_HIGH_SURROGATE) * 0x400) + 2527: (low - MIN_LOW_SURROGATE) + 0x10000; 2528: } 2529: 2530: /** 2531: * Get the code point at the specified index in the CharSequence. 2532: * This is like CharSequence#charAt(int), but if the character is 2533: * the start of a surrogate pair, and there is a following 2534: * character, and this character completes the pair, then the 2535: * corresponding supplementary code point is returned. Otherwise, 2536: * the character at the index is returned. 2537: * 2538: * @param sequence the CharSequence 2539: * @param index the index of the codepoint to get, starting at 0 2540: * @return the codepoint at the specified index 2541: * @throws IndexOutOfBoundsException if index is negative or >= length() 2542: * @since 1.5 2543: */ 2544: public static int codePointAt(CharSequence sequence, int index) 2545: { 2546: int len = sequence.length(); 2547: if (index < 0 || index >= len) 2548: throw new IndexOutOfBoundsException(); 2549: char high = sequence.charAt(index); 2550: if (! isHighSurrogate(high) || ++index >= len) 2551: return high; 2552: char low = sequence.charAt(index); 2553: if (! isLowSurrogate(low)) 2554: return high; 2555: return toCodePoint(high, low); 2556: } 2557: 2558: /** 2559: * Get the code point at the specified index in the CharSequence. 
2560: * If the character is the start of a surrogate pair, and there is a 2561: * following character, and this character completes the pair, then 2562: * the corresponding supplementary code point is returned. 2563: * Otherwise, the character at the index is returned. 2564: * 2565: * @param chars the character array in which to look 2566: * @param index the index of the codepoint to get, starting at 0 2567: * @return the codepoint at the specified index 2568: * @throws IndexOutOfBoundsException if index is negative or >= length() 2569: * @since 1.5 2570: */ 2571: public static int codePointAt(char[] chars, int index) 2572: { 2573: return codePointAt(chars, index, chars.length); 2574: } 2575: 2576: /** 2577: * Get the code point at the specified index in the CharSequence. 2578: * If the character is the start of a surrogate pair, and there is a 2579: * following character within the specified range, and this 2580: * character completes the pair, then the corresponding 2581: * supplementary code point is returned. Otherwise, the character 2582: * at the index is returned. 2583: * 2584: * @param chars the character array in which to look 2585: * @param index the index of the codepoint to get, starting at 0 2586: * @param limit the limit past which characters should not be examined 2587: * @return the codepoint at the specified index 2588: * @throws IndexOutOfBoundsException if index is negative or >= 2589: * limit, or if limit is negative or >= the length of the array 2590: * @since 1.5 2591: */ 2592: public static int codePointAt(char[] chars, int index, int limit) 2593: { 2594: if (index < 0 || index >= limit || limit < 0 || limit >= chars.length) 2595: throw new IndexOutOfBoundsException(); 2596: char high = chars[index]; 2597: if (! isHighSurrogate(high) || ++index >= limit) 2598: return high; 2599: char low = chars[index]; 2600: if (! 
isLowSurrogate(low)) 2601: return high; 2602: return toCodePoint(high, low); 2603: } 2604: 2605: /** 2606: * Get the code point before the specified index. This is like 2607: * #codePointAt(char[], int), but checks the characters at 2608: * <code>index-1</code> and <code>index-2</code> to see if they form 2609: * a supplementary code point. If they do not, the character at 2610: * <code>index-1</code> is returned. 2611: * 2612: * @param chars the character array 2613: * @param index the index just past the codepoint to get, starting at 0 2614: * @return the codepoint at the specified index 2615: * @throws IndexOutOfBoundsException if index is negative or >= length() 2616: * @since 1.5 2617: */ 2618: public static int codePointBefore(char[] chars, int index) 2619: { 2620: return codePointBefore(chars, index, 1); 2621: } 2622: 2623: /** 2624: * Get the code point before the specified index. This is like 2625: * #codePointAt(char[], int), but checks the characters at 2626: * <code>index-1</code> and <code>index-2</code> to see if they form 2627: * a supplementary code point. If they do not, the character at 2628: * <code>index-1</code> is returned. The start parameter is used to 2629: * limit the range of the array which may be examined. 2630: * 2631: * @param chars the character array 2632: * @param index the index just past the codepoint to get, starting at 0 2633: * @param start the index before which characters should not be examined 2634: * @return the codepoint at the specified index 2635: * @throws IndexOutOfBoundsException if index is > start or > 2636: * the length of the array, or if limit is negative or >= the 2637: * length of the array 2638: * @since 1.5 2639: */ 2640: public static int codePointBefore(char[] chars, int index, int start) 2641: { 2642: if (index < start || index > chars.length 2643: || start < 0 || start >= chars.length) 2644: throw new IndexOutOfBoundsException(); 2645: --index; 2646: char low = chars[index]; 2647: if (! 
isLowSurrogate(low) || --index < start) 2648: return low; 2649: char high = chars[index]; 2650: if (! isHighSurrogate(high)) 2651: return low; 2652: return toCodePoint(high, low); 2653: } 2654: 2655: /** 2656: * Get the code point before the specified index. This is like 2657: * #codePointAt(CharSequence, int), but checks the characters at 2658: * <code>index-1</code> and <code>index-2</code> to see if they form 2659: * a supplementary code point. If they do not, the character at 2660: * <code>index-1</code> is returned. 2661: * 2662: * @param sequence the CharSequence 2663: * @param index the index just past the codepoint to get, starting at 0 2664: * @return the codepoint at the specified index 2665: * @throws IndexOutOfBoundsException if index is negative or >= length() 2666: * @since 1.5 2667: */ 2668: public static int codePointBefore(CharSequence sequence, int index) 2669: { 2670: int len = sequence.length(); 2671: if (index < 1 || index > len) 2672: throw new IndexOutOfBoundsException(); 2673: --index; 2674: char low = sequence.charAt(index); 2675: if (! isLowSurrogate(low) || --index < 0) 2676: return low; 2677: char high = sequence.charAt(index); 2678: if (! isHighSurrogate(high)) 2679: return low; 2680: return toCodePoint(high, low); 2681: } 2682: } // class Character
GNU Classpath (0.20) |