001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 package org.apache.commons.lang; 018 019 import java.io.Serializable; 020 import java.util.HashMap; 021 import java.util.HashSet; 022 import java.util.Iterator; 023 import java.util.Map; 024 import java.util.Set; 025 026 /** 027 * <p>A set of characters.</p> 028 * 029 * <p>Instances are immutable, but instances of subclasses may not be.</p> 030 * 031 * @author Stephen Colebourne 032 * @author Phil Steitz 033 * @author Pete Gieser 034 * @author Gary Gregory 035 * @since 1.0 036 * @version $Id: CharSet.java 618884 2008-02-06 04:37:17Z bayard $ 037 */ 038 public class CharSet implements Serializable { 039 040 /** 041 * Required for serialization support. Lang version 2.0. 042 * 043 * @see java.io.Serializable 044 */ 045 private static final long serialVersionUID = 5947847346149275958L; 046 047 /** 048 * A CharSet defining no characters. 049 * @since 2.0 050 */ 051 public static final CharSet EMPTY = new CharSet((String) null); 052 053 /** 054 * A CharSet defining ASCII alphabetic characters "a-zA-Z". 055 * @since 2.0 056 */ 057 public static final CharSet ASCII_ALPHA = new CharSet("a-zA-Z"); 058 059 /** 060 * A CharSet defining ASCII alphabetic characters "a-z". 061 * @since 2.0 062 */ 063 public static final CharSet ASCII_ALPHA_LOWER = new CharSet("a-z"); 064 065 /** 066 * A CharSet defining ASCII alphabetic characters "A-Z". 067 * @since 2.0 068 */ 069 public static final CharSet ASCII_ALPHA_UPPER = new CharSet("A-Z"); 070 071 /** 072 * A CharSet defining ASCII alphabetic characters "0-9". 073 * @since 2.0 074 */ 075 public static final CharSet ASCII_NUMERIC = new CharSet("0-9"); 076 077 /** 078 * A Map of the common cases used in the factory. 079 * Subclasses can add more common patterns if desired. 080 * @since 2.0 081 */ 082 protected static final Map COMMON = new HashMap(); 083 084 static { 085 COMMON.put(null, EMPTY); 086 COMMON.put("", EMPTY); 087 COMMON.put("a-zA-Z", ASCII_ALPHA); 088 COMMON.put("A-Za-z", ASCII_ALPHA); 089 COMMON.put("a-z", ASCII_ALPHA_LOWER); 090 COMMON.put("A-Z", ASCII_ALPHA_UPPER); 091 COMMON.put("0-9", ASCII_NUMERIC); 092 } 093 094 /** The set of CharRange objects. */ 095 private Set set = new HashSet(); 096 097 //----------------------------------------------------------------------- 098 /** 099 * <p>Factory method to create a new CharSet using a special syntax.</p> 100 * 101 * <ul> 102 * <li><code>null</code> or empty string ("") 103 * - set containing no characters</li> 104 * <li>Single character, such as "a" 105 * - set containing just that character</li> 106 * <li>Multi character, such as "a-e" 107 * - set containing characters from one character to the other</li> 108 * <li>Negated, such as "^a" or "^a-e" 109 * - set containing all characters except those defined</li> 110 * <li>Combinations, such as "abe-g" 111 * - set containing all the characters from the individual sets</li> 112 * </ul> 113 * 114 * <p>The matching order is:</p> 115 * <ol> 116 * <li>Negated multi character range, such as "^a-e" 117 * <li>Ordinary multi character range, such as "a-e" 118 * <li>Negated single character, such as "^a" 119 * <li>Ordinary single character, such as "a" 120 * </ol> 121 * <p>Matching works left to right. Once a match is found the 122 * search starts again from the next character.</p> 123 * 124 * <p>If the same range is defined twice using the same syntax, only 125 * one range will be kept. 126 * Thus, "a-ca-c" creates only one range of "a-c".</p> 127 * 128 * <p>If the start and end of a range are in the wrong order, 129 * they are reversed. Thus "a-e" is the same as "e-a". 130 * As a result, "a-ee-a" would create only one range, 131 * as the "a-e" and "e-a" are the same.</p> 132 * 133 * <p>The set of characters represented is the union of the specified ranges.</p> 134 * 135 * <p>All CharSet objects returned by this method will be immutable.</p> 136 * 137 * @param setStr the String describing the set, may be null 138 * @return a CharSet instance 139 * @since 2.0 140 */ 141 public static CharSet getInstance(String setStr) { 142 Object set = COMMON.get(setStr); 143 if (set != null) { 144 return (CharSet) set; 145 } 146 return new CharSet(setStr); 147 } 148 149 /** 150 * <p>Constructs a new CharSet using the set syntax. 151 * Each string is merged in with the set.</p> 152 * 153 * @param setStrs Strings to merge into the initial set, may be null 154 * @return a CharSet instance 155 * @since 2.4 156 */ 157 public static CharSet getInstance(String[] setStrs) { 158 if (setStrs == null) { 159 return null; 160 } 161 return new CharSet(setStrs); 162 } 163 164 //----------------------------------------------------------------------- 165 /** 166 * <p>Constructs a new CharSet using the set syntax.</p> 167 * 168 * @param setStr the String describing the set, may be null 169 * @since 2.0 170 */ 171 protected CharSet(String setStr) { 172 super(); 173 add(setStr); 174 } 175 176 /** 177 * <p>Constructs a new CharSet using the set syntax. 178 * Each string is merged in with the set.</p> 179 * 180 * @param set Strings to merge into the initial set 181 * @throws NullPointerException if set is <code>null</code> 182 */ 183 protected CharSet(String[] set) { 184 super(); 185 int sz = set.length; 186 for (int i = 0; i < sz; i++) { 187 add(set[i]); 188 } 189 } 190 191 //----------------------------------------------------------------------- 192 /** 193 * <p>Add a set definition string to the <code>CharSet</code>.</p> 194 * 195 * @param str set definition string 196 */ 197 protected void add(String str) { 198 if (str == null) { 199 return; 200 } 201 202 int len = str.length(); 203 int pos = 0; 204 while (pos < len) { 205 int remainder = (len - pos); 206 if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') { 207 // negated range 208 set.add(new CharRange(str.charAt(pos + 1), str.charAt(pos + 3), true)); 209 pos += 4; 210 } else if (remainder >= 3 && str.charAt(pos + 1) == '-') { 211 // range 212 set.add(new CharRange(str.charAt(pos), str.charAt(pos + 2))); 213 pos += 3; 214 } else if (remainder >= 2 && str.charAt(pos) == '^') { 215 // negated char 216 set.add(new CharRange(str.charAt(pos + 1), true)); 217 pos += 2; 218 } else { 219 // char 220 set.add(new CharRange(str.charAt(pos))); 221 pos += 1; 222 } 223 } 224 } 225 226 //----------------------------------------------------------------------- 227 /** 228 * <p>Gets the internal set as an array of CharRange objects.</p> 229 * 230 * @return an array of immutable CharRange objects 231 * @since 2.0 232 */ 233 public CharRange[] getCharRanges() { 234 return (CharRange[]) set.toArray(new CharRange[set.size()]); 235 } 236 237 //----------------------------------------------------------------------- 238 /** 239 * <p>Does the <code>CharSet</code> contain the specified 240 * character <code>ch</code>.</p> 241 * 242 * @param ch the character to check for 243 * @return <code>true</code> if the set contains the characters 244 */ 245 public boolean contains(char ch) { 246 for (Iterator it = set.iterator(); it.hasNext();) { 247 CharRange range = (CharRange) it.next(); 248 if (range.contains(ch)) { 249 return true; 250 } 251 } 252 return false; 253 } 254 255 // Basics 256 //----------------------------------------------------------------------- 257 /** 258 * <p>Compares two CharSet objects, returning true if they represent 259 * exactly the same set of characters defined in the same way.</p> 260 * 261 * <p>The two sets <code>abc</code> and <code>a-c</code> are <i>not</i> 262 * equal according to this method.</p> 263 * 264 * @param obj the object to compare to 265 * @return true if equal 266 * @since 2.0 267 */ 268 public boolean equals(Object obj) { 269 if (obj == this) { 270 return true; 271 } 272 if (obj instanceof CharSet == false) { 273 return false; 274 } 275 CharSet other = (CharSet) obj; 276 return set.equals(other.set); 277 } 278 279 /** 280 * <p>Gets a hashCode compatible with the equals method.</p> 281 * 282 * @return a suitable hashCode 283 * @since 2.0 284 */ 285 public int hashCode() { 286 return 89 + set.hashCode(); 287 } 288 289 /** 290 * <p>Gets a string representation of the set.</p> 291 * 292 * @return string representation of the set 293 */ 294 public String toString() { 295 return set.toString(); 296 } 297 298 }