001 /* 002 * CDDL HEADER START 003 * 004 * The contents of this file are subject to the terms of the 005 * Common Development and Distribution License, Version 1.0 only 006 * (the "License"). You may not use this file except in compliance 007 * with the License. 008 * 009 * You can obtain a copy of the license at 010 * trunk/opends/resource/legal-notices/OpenDS.LICENSE 011 * or https://OpenDS.dev.java.net/OpenDS.LICENSE. 012 * See the License for the specific language governing permissions 013 * and limitations under the License. 014 * 015 * When distributing Covered Code, include this CDDL HEADER in each 016 * file and include the License file at 017 * trunk/opends/resource/legal-notices/OpenDS.LICENSE. If applicable, 018 * add the following below this CDDL HEADER, with the fields enclosed 019 * by brackets "[]" replaced with your own identifying information: 020 * Portions Copyright [yyyy] [name of copyright owner] 021 * 022 * CDDL HEADER END 023 * 024 * 025 * Copyright 2006-2008 Sun Microsystems, Inc. 026 */ 027 package org.opends.server.schema; 028 029 030 031 import java.util.Arrays; 032 033 import org.opends.server.admin.std.server.ApproximateMatchingRuleCfg; 034 import org.opends.server.api.ApproximateMatchingRule; 035 import org.opends.server.config.ConfigException; 036 import org.opends.server.protocols.asn1.ASN1OctetString; 037 import org.opends.server.types.ByteString; 038 import org.opends.server.types.DirectoryException; 039 import org.opends.server.types.InitializationException; 040 import org.opends.server.types.DebugLogLevel; 041 042 import static org.opends.server.loggers.debug.DebugLogger.*; 043 import org.opends.server.loggers.debug.DebugTracer; 044 import static org.opends.server.schema.SchemaConstants.*; 045 046 047 048 /** 049 * This class defines an approximate matching rule based on the Double Metaphone 050 * algorithm. The Metaphone and Double Metaphone algorithms were originally 051 * devised by Lawrence Philips (published in the December 1990 issue of 052 * <I>Computer Language</I> and the 053 * <A HREF="http://www.cuj.com/documents/s=8038/cuj0006philips/">June 2000 issue 054 * of <I>C/C++ Users Journal</I></A>, respectively), and this version of the 055 * algorithm is based on a version modified by Kevin Atkinson to include 056 * bugfixes and additional functionality (source is available 057 * <A HREF="http://aspell.net/metaphone/dmetaph.cpp">here</A> and additional 058 * Metaphone and Double Metaphone information is available at 059 * <A HREF="http://aspell.net/metaphone/">http://aspell.net/metaphone/</A>). 060 * This implementation is largely the same as the one provided by Kevin 061 * Atkinson, but it has been re-written for better readability, for more 062 * efficiency, to get rid of checks for conditions that can't possibly happen, 063 * and to get rid of redundant checks that aren't needed. It has also been 064 * updated to always only generate a single value rather than one or possibly 065 * two values. 066 */ 067 public class DoubleMetaphoneApproximateMatchingRule 068 extends ApproximateMatchingRule 069 { 070 /** 071 * The tracer object for the debug logger. 072 */ 073 private static final DebugTracer TRACER = getTracer(); 074 075 076 077 /** 078 * Creates a new instance of this double metaphone approximate matching rule. 079 */ 080 public DoubleMetaphoneApproximateMatchingRule() 081 { 082 super(); 083 } 084 085 086 087 /** 088 * {@inheritDoc} 089 */ 090 public void initializeMatchingRule(ApproximateMatchingRuleCfg configuration) 091 throws ConfigException, InitializationException 092 { 093 // No initialization is required. 094 } 095 096 097 098 /** 099 * Retrieves the common name for this matching rule. 100 * 101 * @return The common name for this matching rule, or <CODE>null</CODE> if 102 * it does not have a name. 103 */ 104 public String getName() 105 { 106 return AMR_DOUBLE_METAPHONE_NAME; 107 } 108 109 110 111 /** 112 * Retrieves the OID for this matching rule. 113 * 114 * @return The OID for this matching rule. 115 */ 116 public String getOID() 117 { 118 return AMR_DOUBLE_METAPHONE_OID; 119 } 120 121 122 123 /** 124 * Retrieves the description for this matching rule. 125 * 126 * @return The description for this matching rule, or <CODE>null</CODE> if 127 * there is none. 128 */ 129 public String getDescription() 130 { 131 // There is no standard description for this matching rule. 132 return AMR_DOUBLE_METAPHONE_DESCRIPTION; 133 } 134 135 136 137 /** 138 * Retrieves the OID of the syntax with which this matching rule is 139 * associated. 140 * 141 * @return The OID of the syntax with which this matching rule is associated. 142 */ 143 public String getSyntaxOID() 144 { 145 // Approximate matching is really only appropriate for DirectoryString 146 // values. 147 return SYNTAX_DIRECTORY_STRING_OID; 148 } 149 150 151 152 /** 153 * Retrieves the normalized form of the provided value, which is best suited 154 * for efficiently performing matching operations on that value. 155 * 156 * @param value The value to be normalized. 157 * 158 * @return The normalized version of the provided value. 159 * 160 * @throws DirectoryException If the provided value is invalid according to 161 * the associated attribute syntax. 162 */ 163 public ByteString normalizeValue(ByteString value) 164 throws DirectoryException 165 { 166 String valueString = value.stringValue(); 167 int length = valueString.length(); 168 if (length == 0) 169 { 170 // The value is empty, so it is already normalized. 171 return new ASN1OctetString(); 172 } 173 174 int last = length - 1; 175 176 177 // Pad the value to allow for checks to go past the end of the value. 178 valueString = valueString.toUpperCase() + " "; 179 180 181 // The metaphone value that is being constructed. 182 StringBuilder metaphone = new StringBuilder(4); 183 184 185 // Skip over GN, KN, PN, WR, and PS at the beginning of a word. 186 int pos = 0; 187 String substring = valueString.substring(0, 2); 188 if (substring.equals("GN") || substring.equals("KN") || 189 substring.equals("PN") || substring.equals("WR") || 190 substring.equals("PS")) 191 { 192 pos++; 193 } 194 195 196 // 'X' at the beginning of a word will sound like Z, but Z will always be 197 // mapped to S. 198 else if (valueString.charAt(0) == 'X') 199 { 200 metaphone.append("S"); 201 pos++; 202 } 203 204 205 // Loop until we have at least four metaphone characters or have reached the 206 // end of the string. 207 while ((metaphone.length() < 4) && (pos < length)) 208 { 209 // Check the character at the current position against various targets. 210 char posMinusFour; 211 char posMinusThree; 212 char posMinusTwo; 213 char posMinusOne; 214 char posPlusOne; 215 char posPlusTwo; 216 switch (valueString.charAt(pos)) 217 { 218 case 'A': 219 case 'E': 220 case 'I': 221 case 'O': 222 case 'U': 223 case 'Y': 224 // All initial vowels map to 'A'. All others will be ignored. 225 if (pos == 0) 226 { 227 metaphone.append("A"); 228 } 229 230 pos++; 231 break; 232 233 234 case 'B': 235 // B and BB will be mapped to P, with the exception of "MB" as in 236 // "crumb", but that will be handled elsewhere. 237 metaphone.append("P"); 238 239 if (valueString.charAt(++pos) == 'B') 240 { 241 pos++; 242 } 243 244 break; 245 246 247 case 'C': 248 // Check for various Germanic sequences, which will be mapped to 'K'. 249 // This basically includes all occurrences of "ACH" where the 250 // preceding character is not a vowel and the following character is 251 // neither an 'E' nor an 'I' except in "BACHER" and "MACHER". 252 if ((pos > 1) && 253 (! isVowel(posMinusTwo = valueString.charAt(pos-2))) && 254 hasSubstring(valueString, pos-1, "ACH") && 255 ((posPlusTwo = valueString.charAt(pos+2)) != 'I') && 256 ((posPlusTwo != 'E') || 257 ((valueString.charAt(pos+3) == 'R') && 258 ((posMinusTwo == 'B') || (posMinusTwo == 'M'))))) 259 { 260 metaphone.append("K"); 261 pos += 2; 262 break; 263 } 264 265 266 // Check for a special case of "caesar", which will be maped to 'S'. 267 if ((pos == 0) && hasSubstring(valueString, pos+1, "AESAR")) 268 { 269 metaphone.append("S"); 270 pos += 2; 271 break; 272 } 273 274 275 // CH can be treated in lots of different ways. 276 if ((posPlusOne = valueString.charAt(pos+1)) == 'H') 277 { 278 // Check for "chia" as in "chianti" and map to 'K'. 279 if (hasSubstring(valueString, pos+2, "IA")) 280 { 281 metaphone.append("K"); 282 pos += 2; 283 break; 284 } 285 286 // Check for "chae" as in "michael" and map to 'K'. 287 if (hasSubstring(valueString, pos+2, "AE")) 288 { 289 metaphone.append("K"); 290 pos += 2; 291 break; 292 } 293 294 // Check for a Greek root at the beginning of the value like 295 // chemistry or chorus and map to 'K'. 296 if ((pos == 0) && (! hasSubstring(valueString, 2, "ORE")) && 297 (hasSubstring(valueString, 2, "ARAC") || 298 hasSubstring(valueString, 2, "ARIS") || 299 hasSubstring(valueString, 2, "OR") || 300 hasSubstring(valueString, 2, "YM") || 301 hasSubstring(valueString, 2, "IA") || 302 hasSubstring(valueString, 2, "EM"))) 303 { 304 metaphone.append("K"); 305 pos += 2; 306 break; 307 } 308 309 310 // Check for "CH" values that produce a "KH" sound that will be 311 // mapped to 'K'. 312 if (isGermanic(valueString) || 313 hasSubstring(valueString, pos-2, "ORCHES") || 314 hasSubstring(valueString, pos-2, "ARCHIT") || 315 hasSubstring(valueString, pos-2, "ORCHID") || 316 ((posPlusTwo = valueString.charAt(pos+2)) == 'T') || 317 (posPlusTwo == 'S') || 318 (((pos == 0) || 319 (((posMinusOne = valueString.charAt(pos-1)) == 'A') || 320 (posMinusOne == 'O') || (posMinusOne == 'U') || 321 (posMinusOne == 'E'))) && 322 ((posPlusTwo == 'L') || (posPlusTwo == 'R') || 323 (posPlusTwo == 'N')|| (posPlusTwo == 'M') || 324 (posPlusTwo == 'B')|| (posPlusTwo == 'H') || 325 (posPlusTwo == 'F')|| (posPlusTwo == 'V') || 326 (posPlusTwo == 'W')))) 327 { 328 metaphone.append("K"); 329 pos += 2; 330 break; 331 } 332 333 334 // All other "CH" values. 335 if (pos > 0) 336 { 337 if (hasSubstring(valueString, 0, "MC")) 338 { 339 metaphone.append("K"); 340 } 341 else 342 { 343 metaphone.append("X"); 344 } 345 } 346 else 347 { 348 metaphone.append("X"); 349 } 350 351 pos += 2; 352 break; 353 } 354 355 356 // Check for "CZ" as in "czerny" but not "wicz" and map to 'S'. 357 if ((posPlusOne == 'Z') && 358 (! hasSubstring(valueString, pos-2, "WI"))) 359 { 360 metaphone.append("S"); 361 pos += 2; 362 break; 363 } 364 365 366 // Check for "CIA" as in "focaccia" and map to 'X'. 367 if ((posPlusOne == 'I') && (valueString.charAt(pos+2) == 'A')) 368 { 369 metaphone.append("X"); 370 pos += 3; 371 break; 372 } 373 374 375 // Check for a double C but not in values that start with "McC" 376 if ((posPlusOne == 'C') && 377 (! ((pos == 1) && valueString.charAt(0) == 'M'))) 378 { 379 if ((((posPlusTwo = valueString.charAt(pos+2)) == 'I') || 380 (posPlusTwo == 'E') || (posPlusTwo == 'H')) && 381 (! ((posPlusTwo == 'H') && valueString.charAt(pos+3) == 'U'))) 382 { 383 if (((pos == 1) && (valueString.charAt(pos-1) == 'A')) || 384 hasSubstring(valueString, pos-1, "UCCEE") || 385 hasSubstring(valueString, pos-1, "UCCES")) 386 { 387 // Values like "accident", "accede", and "succeed". 388 metaphone.append("K"); 389 pos += 2; 390 break; 391 } 392 else 393 { 394 // Values like "bacci" or "bertucci". 395 metaphone.append("X"); 396 pos += 3; 397 break; 398 } 399 } 400 else 401 { 402 // This is Pierce's Rule, whatever that means. 403 metaphone.append("K"); 404 pos += 2; 405 break; 406 } 407 } 408 409 410 // Check for CK, CG, or CQ and map to 'K'. Check for CI, CE, and CY 411 // and map to "S". 412 if (((posPlusOne = valueString.charAt(pos+1)) == 'K') || 413 (posPlusOne == 'G') || (posPlusOne == 'Q')) 414 { 415 metaphone.append("K"); 416 pos += 2; 417 break; 418 } 419 420 421 // Check for CI, CE, or CY and map to 'S'. 422 if ((posPlusOne == 'I') || (posPlusOne == 'E') || (posPlusOne == 'Y')) 423 { 424 metaphone.append("S"); 425 pos += 2; 426 break; 427 } 428 429 430 // All other cases of "C" will be mapped to 'K'. However, the number 431 // of positions that we skip ahead may vary. If there is a value that 432 // consists of two words like "mac caffrey", then skip ahead three. 433 // For the character combinations of "CK" and "CQ", then skip ahead 434 // two. For the character combinations of "CC" except "CCE" and 435 // "CCI", then skip ahead two. For all other cases, skip ahead one. 436 metaphone.append("K"); 437 switch (valueString.charAt(pos+1)) 438 { 439 case ' ': 440 switch (valueString.charAt(pos+2)) 441 { 442 case 'C': 443 case 'Q': 444 case 'G': 445 pos += 3; 446 break; 447 default: 448 pos++; 449 break; 450 } 451 break; 452 453 case 'K': 454 case 'Q': 455 pos += 2; 456 break; 457 458 case 'C': 459 switch (valueString.charAt(pos+2)) 460 { 461 case 'E': 462 case 'I': 463 pos++; 464 break; 465 default: 466 pos += 2; 467 break; 468 } 469 break; 470 default: 471 pos++; 472 } 473 break; 474 475 476 case 'D': 477 // DG will be mapped to either 'J' (in cases like edge) or 'TK' (in 478 // cases like Edgar). 479 if ((posPlusOne = valueString.charAt(pos+1)) == 'G') 480 { 481 if (((posPlusTwo = valueString.charAt(pos+2)) == 'I') || 482 (posPlusTwo == 'E') || (posPlusTwo == 'Y')) 483 { 484 metaphone.append("J"); 485 pos += 3; 486 break; 487 } 488 else 489 { 490 metaphone.append("TK"); 491 pos += 2; 492 break; 493 } 494 } 495 496 497 // DT and DD will be mapped to 'T'. 498 if ((posPlusOne == 'T') || (posPlusOne == 'D')) 499 { 500 metaphone.append("T"); 501 pos += 2; 502 break; 503 } 504 505 506 // All other cases will be mapped to 'T'. 507 metaphone.append("T"); 508 pos++; 509 break; 510 511 512 case 'F': 513 // F always maps to F. If there is a double F, then skip the second 514 // one. 515 metaphone.append("F"); 516 pos++; 517 if (valueString.charAt(pos) == 'F') 518 { 519 pos++; 520 } 521 break; 522 523 524 case 'G': 525 if ((posPlusOne = valueString.charAt(pos+1)) == 'H') 526 { 527 // A "GH" that is not preceded by a vowel will be mapped to 'K'. 528 if ((pos > 0) && (! isVowel(valueString.charAt(pos-1)))) 529 { 530 metaphone.append("K"); 531 pos += 2; 532 break; 533 } 534 535 if (pos == 0) 536 { 537 if (valueString.charAt(pos+2) == 'I') 538 { 539 // Words like ghislane or ghiradelli 540 metaphone.append("J"); 541 } 542 else 543 { 544 metaphone.append("K"); 545 } 546 547 pos += 2; 548 break; 549 } 550 551 // A refined version of Parker's Rule. 552 if (((pos > 1) && 553 (((posMinusTwo = valueString.charAt(pos-2)) == 'B') || 554 (posMinusTwo == 'H') || (posMinusTwo == 'D'))) || 555 ((pos > 2) && 556 (((posMinusThree = valueString.charAt(pos-3)) == 'B') || 557 (posMinusThree == 'H') || (posMinusThree == 'D'))) || 558 ((pos > 3) && 559 (((posMinusFour = valueString.charAt(pos-4)) == 'B') || 560 (posMinusFour == 'H')))) 561 { 562 pos += 2; 563 break; 564 } 565 else 566 { 567 if ((pos > 2) && (valueString.charAt(pos-1) == 'U') && 568 (((posMinusThree = valueString.charAt(pos-3)) == 'C') || 569 (posMinusThree == 'G') || (posMinusThree == 'L') || 570 (posMinusThree == 'R') || (posMinusThree == 'T'))) 571 { 572 // Words like laugh, McLaughlin, cough, rough are mapped to 'F'. 573 metaphone.append("F"); 574 } 575 else if ((pos > 0) && (valueString.charAt(pos-1) != 'I')) 576 { 577 metaphone.append("K"); 578 } 579 580 pos += 2; 581 break; 582 } 583 } 584 585 586 if (posPlusOne == 'N') 587 { 588 if ((pos == 1) && isVowel(valueString.charAt(0)) && 589 (! isSlavoGermanic(valueString))) 590 { 591 metaphone.append("KN"); 592 pos += 2; 593 break; 594 } 595 else 596 { 597 if ((! hasSubstring(valueString, pos+2, "EY")) && 598 (! isSlavoGermanic(valueString))) 599 { 600 metaphone.append("N"); 601 } 602 else 603 { 604 metaphone.append("KN"); 605 } 606 607 pos += 2; 608 break; 609 } 610 } 611 612 613 // GLI as in tagliaro will be mapped to "KL". 614 if ((posPlusOne == 'L') && (valueString.charAt(pos+2) == 'I')) 615 { 616 metaphone.append("KL"); 617 pos += 2; 618 break; 619 } 620 621 622 // Forms of GY, GE, and GI at the beginning of a word will map to 'K'. 623 if ((pos == 0) && 624 ((posPlusOne == 'Y') || 625 (substring = valueString.substring(pos+1,pos+3)).equals("ES") || 626 substring.equals("EP") || substring.equals("EB") || 627 substring.equals("EL") || substring.equals("EY") || 628 substring.equals("IB") || substring.equals("IL") || 629 substring.equals("IN") || substring.equals("IE") || 630 substring.equals("EI") || substring.equals("ER"))) 631 { 632 metaphone.append("K"); 633 pos += 2; 634 break; 635 } 636 637 638 // Some occurrences of GER and GY in a word will be mapped to 'K'. 639 posPlusTwo = valueString.charAt(pos+2); 640 if ((((posPlusOne == 'E') && (posPlusTwo == 'R')) || 641 (posPlusOne == 'Y')) && 642 ((posMinusOne = valueString.charAt(pos-1)) != 'E') && 643 (posMinusOne != 'I') && 644 (! hasSubstring(valueString, 0, "DANGER")) && 645 (! hasSubstring(valueString, 0, "RANGER")) && 646 (! hasSubstring(valueString, 0, "MANGER")) && 647 (! hasSubstring(valueString, pos-1, "RGY")) && 648 (! hasSubstring(valueString, pos-1, "OGY"))) 649 { 650 metaphone.append("K"); 651 pos += 2; 652 break; 653 } 654 655 656 // Check for Italian uses like 'biaggi" and map to 'J'. 657 if ((posPlusOne == 'E') || (posPlusOne == 'I') || 658 (posPlusOne == 'Y') || 659 hasSubstring(valueString, pos-1, "AGGI") || 660 hasSubstring(valueString, pos-1, "OGGI")) 661 { 662 // Germanic uses will be mapped to 'K'. 663 if (isGermanic(valueString) || 664 hasSubstring(valueString, pos+1, "ET")) 665 { 666 metaphone.append("K"); 667 } 668 else 669 { 670 metaphone.append("J"); 671 } 672 673 pos += 2; 674 break; 675 } 676 677 678 // All other cases will be mapped to 'K'. If there is a double G, 679 // then skip two. Otherwise, just skip one. 680 metaphone.append("K"); 681 pos++; 682 683 if (posPlusOne == 'G') 684 { 685 pos++; 686 } 687 688 break; 689 690 691 case 'H': 692 // The letter 'H' will only be processed if it is immediately followed 693 // by a vowel and is either the start of the word or preceded by a 694 // vowel. 695 if (isVowel(valueString.charAt(pos+1))) 696 { 697 if ((pos == 0) || isVowel(valueString.charAt(pos-1))) 698 { 699 metaphone.append("H"); 700 pos++; 701 } 702 } 703 704 pos++; 705 break; 706 707 708 case 'J': 709 // Take care of obvious Spanish uses that should map to 'H'. 710 if (hasSubstring(valueString, 0, "SAN ")) 711 { 712 metaphone.append("H"); 713 pos++; 714 break; 715 } 716 717 if (hasSubstring(valueString, pos, "JOSE")) 718 { 719 if ((pos == 0) && (valueString.charAt(pos+4) == ' ')) 720 { 721 metaphone.append("H"); 722 } 723 else 724 { 725 metaphone.append("J"); 726 } 727 728 pos++; 729 break; 730 } 731 732 733 // All other cases will be mapped to 'J'. 734 metaphone.append("J"); 735 736 if (valueString.charAt(pos+1) == 'J') 737 { 738 pos++; 739 } 740 741 pos++; 742 break; 743 744 745 case 'K': 746 // 'K' will always be mapped to 'K'. KK will be treated like K. 747 metaphone.append("K"); 748 749 if (valueString.charAt(pos+1) == 'K') 750 { 751 pos++; 752 } 753 754 pos++; 755 break; 756 757 758 case 'L': 759 // 'L' will always be mapped to 'L'. LL will be treated like L, even 760 // for potential Spanish uses. 761 metaphone.append("L"); 762 763 if (valueString.charAt(pos+1) == 'L') 764 { 765 pos++; 766 } 767 768 pos++; 769 break; 770 771 772 case 'M': 773 // 'M' will always be mapped to 'M'. MM will be treated like M. 774 // UMB in cases like "dumb" and "thumb" will be treated like M. 775 metaphone.append("M"); 776 777 if (valueString.charAt(pos+1) == 'M') 778 { 779 pos++; 780 } 781 else if (hasSubstring(valueString, pos-1, "UMB")) 782 { 783 if (((pos+1) == last) || 784 hasSubstring(valueString, pos+2, "ER")) 785 { 786 pos++; 787 } 788 } 789 790 pos++; 791 break; 792 793 794 case 'N': 795 // 'N' will always be mapped to 'N'. NN will be treated like N. 796 metaphone.append("N"); 797 798 if (valueString.charAt(pos+1) == 'N') 799 { 800 pos++; 801 } 802 803 pos++; 804 break; 805 806 807 case 'P': 808 // PH will be mapped to 'F'. 809 if ((posPlusOne = valueString.charAt(pos+1)) == 'H') 810 { 811 metaphone.append("F"); 812 pos += 2; 813 break; 814 } 815 816 817 // All other cases will be mapped to 'P', with PP and PB being treated 818 // like P. 819 metaphone.append("P"); 820 821 if ((posPlusOne == 'P') || (posPlusOne == 'B')) 822 { 823 pos++; 824 } 825 826 pos++; 827 break; 828 829 830 case 'Q': 831 // 'Q' will always be mapped to 'K'. QQ will be treated like Q. 832 metaphone.append("K"); 833 834 if (valueString.charAt(pos+1) == 'Q') 835 { 836 pos++; 837 } 838 839 pos++; 840 break; 841 842 843 case 'R': 844 // Ignore R at the end of French words. 845 if ((pos == last) && (! isSlavoGermanic(valueString)) && 846 hasSubstring(valueString, pos-2, "IE") && 847 (! hasSubstring(valueString, pos-4, "ME")) && 848 (! hasSubstring(valueString, pos-4, "MA"))) 849 { 850 pos++; 851 break; 852 } 853 854 855 // All other cases will be mapped to 'R', with RR treated like R. 856 metaphone.append("R"); 857 858 if (valueString.charAt(pos+1) == 'R') 859 { 860 pos++; 861 } 862 863 pos++; 864 break; 865 866 867 case 'S': 868 // Special cases like isle and carlysle will be silent. 869 if (hasSubstring(valueString, pos-1, "ISL") || 870 hasSubstring(valueString, pos-1, "YSL")) 871 { 872 pos++; 873 break; 874 } 875 876 877 // Special case of sugar mapped to 'X'. 878 if (hasSubstring(valueString, pos+1, "UGAR")) 879 { 880 metaphone.append("X"); 881 pos++; 882 break; 883 } 884 885 886 // SH is generally mapped to 'X', but not in Germanic cases. 887 if ((posPlusOne = valueString.charAt(pos+1)) == 'H') 888 { 889 if (hasSubstring(valueString, pos+1, "HEIM") || 890 hasSubstring(valueString, pos+1, "HOEK") || 891 hasSubstring(valueString, pos+1, "HOLM") || 892 hasSubstring(valueString, pos+1, "HOLZ")) 893 { 894 metaphone.append("S"); 895 } 896 else 897 { 898 metaphone.append("X"); 899 } 900 901 pos += 2; 902 break; 903 } 904 905 906 // Italian and Armenian cases will map to "S". 907 if (hasSubstring(valueString, pos+1, "IO") || 908 hasSubstring(valueString, pos+1, "IA")) 909 { 910 metaphone.append("S"); 911 pos += 3; 912 break; 913 } 914 915 916 // SZ should be mapped to 'S'. 917 if (posPlusOne == 'Z') 918 { 919 metaphone.append("S"); 920 pos += 2; 921 break; 922 } 923 924 925 // Various combinations at the beginning of words will be mapped to 926 // 'S'. 927 if ((pos == 0) && 928 ((posPlusOne == 'M') || (posPlusOne == 'N') || 929 (posPlusOne == 'L') || (posPlusOne == 'W'))) 930 { 931 metaphone.append("S"); 932 pos++; 933 break; 934 } 935 936 937 // SC should be mapped to either SK, X, or S. 938 if (posPlusOne == 'C') 939 { 940 if ((posPlusTwo = valueString.charAt(pos+2)) == 'H') 941 { 942 if (hasSubstring(valueString, pos+3, "OO") || 943 hasSubstring(valueString, pos+3, "UY") || 944 hasSubstring(valueString, pos+3, "ED") || 945 hasSubstring(valueString, pos+3, "EM")) 946 { 947 metaphone.append("SK"); 948 } 949 else 950 { 951 metaphone.append("X"); 952 } 953 954 pos += 3; 955 break; 956 } 957 958 if ((posPlusTwo == 'I') || (posPlusTwo == 'E') || 959 (posPlusTwo == 'Y')) 960 { 961 metaphone.append("S"); 962 pos += 3; 963 break; 964 } 965 966 metaphone.append("SK"); 967 pos += 3; 968 break; 969 } 970 971 972 // Ignore a trailing S in French words. All others will be mapped to 973 // 'S'. 974 if (! ((pos == last) && 975 (hasSubstring(valueString, pos-2, "AI") || 976 hasSubstring(valueString, pos-2, "OI")))) 977 { 978 metaphone.append("S"); 979 } 980 981 if ((posPlusOne == 'S') || (posPlusOne == 'Z')) 982 { 983 pos++; 984 } 985 986 pos++; 987 break; 988 989 990 case 'T': 991 // "TION", "TIA", and "TCH" will be mapped to 'X'. 992 if (hasSubstring(valueString, pos, "TION") || 993 hasSubstring(valueString, pos, "TIA") || 994 hasSubstring(valueString, pos, "TCH")) 995 { 996 metaphone.append("X"); 997 pos += 3; 998 break; 999 } 1000 1001 1002 // TH or TTH will be mapped to either T (for Germanic cases) or 1003 // 0 (zero) for the rest. 1004 if (((posPlusOne = valueString.charAt(pos+1)) == 'H') || 1005 ((posPlusOne == 'T') && (valueString.charAt(pos+2) == 'H'))) 1006 { 1007 if (isGermanic(valueString) || 1008 hasSubstring(valueString, pos+2, "OM") || 1009 hasSubstring(valueString, pos+2, "AM")) 1010 { 1011 metaphone.append("T"); 1012 } 1013 else 1014 { 1015 metaphone.append("0"); 1016 } 1017 1018 pos += 2; 1019 break; 1020 } 1021 1022 1023 // All other cases will map to T, with TT and TD being treated like T. 1024 metaphone.append("T"); 1025 1026 if ((posPlusOne == 'T') || (posPlusOne == 'D')) 1027 { 1028 pos++; 1029 } 1030 1031 pos++; 1032 break; 1033 1034 1035 case 'V': 1036 // 'V' will always be mapped to 'F', with VV treated like V. 1037 metaphone.append("F"); 1038 1039 if (valueString.charAt(pos+1) == 'V') 1040 { 1041 pos++; 1042 } 1043 1044 pos++; 1045 break; 1046 1047 1048 case 'W': 1049 // WR should always map to R. 1050 if ((posPlusOne = valueString.charAt(pos+1)) == 'R') 1051 { 1052 metaphone.append("R"); 1053 pos += 2; 1054 break; 1055 } 1056 1057 1058 // W[AEIOUYH] at the beginning of the word should be mapped to A. 1059 if ((pos == 0) && (isVowel(posPlusOne) || (posPlusOne == 'H'))) 1060 { 1061 metaphone.append("A"); 1062 1063 // FIXME -- This isn't in the algorithm as written. Should it be? 1064 pos += 2; 1065 break; 1066 } 1067 1068 1069 // A Polish value like WICZ or WITZ should be mapped to TS. 1070 if (hasSubstring(valueString, pos+1, "WICZ") || 1071 hasSubstring(valueString, pos+1, "WITZ")) 1072 { 1073 metaphone.append("TS"); 1074 pos += 4; 1075 break; 1076 } 1077 1078 1079 // Otherwise, we'll just skip it. 1080 pos++; 1081 break; 1082 1083 1084 case 'X': 1085 // X maps to KS except at the end of French words. 1086 if (! ((pos == last) && 1087 (hasSubstring(valueString, pos-3, "IAU") || 1088 hasSubstring(valueString, pos-3, "EAU") || 1089 hasSubstring(valueString, pos-2, "AU") || 1090 hasSubstring(valueString, pos-2, "OU")))) 1091 { 1092 metaphone.append("KS"); 1093 } 1094 1095 if (((posPlusOne = valueString.charAt(pos+1)) == 'C') || 1096 (posPlusOne == 'X')) 1097 { 1098 pos++; 1099 } 1100 1101 pos++; 1102 break; 1103 1104 1105 case 'Z': 1106 // Chinese usages like zhao will map to J. 1107 if ((posPlusOne = valueString.charAt(pos+1)) == 'H') 1108 { 1109 metaphone.append("J"); 1110 pos += 2; 1111 break; 1112 } 1113 1114 1115 // All other cases map to "S". ZZ will be treated like Z. 1116 metaphone.append("S"); 1117 1118 if (posPlusOne == 'Z') 1119 { 1120 pos++; 1121 } 1122 1123 pos++; 1124 break; 1125 1126 1127 case '\u00C7': // C with a cedilla 1128 // This will always be mapped to 'S'. 1129 metaphone.append("S"); 1130 pos++; 1131 break; 1132 1133 1134 case '\u00D1': // N with a tilde 1135 // This will always be mapped to 'N'. 1136 metaphone.append("N"); 1137 pos++; 1138 break; 1139 1140 1141 default: 1142 // We don't have any special treatment for this character, so skip it. 1143 pos++; 1144 break; 1145 } 1146 } 1147 1148 1149 return new ASN1OctetString(metaphone.toString()); 1150 } 1151 1152 1153 1154 /** 1155 * Indicates whether the two provided normalized values are approximately 1156 * equal to each other. 1157 * 1158 * @param value1 The normalized form of the first value to compare. 1159 * @param value2 The normalized form of the second value to compare. 1160 * 1161 * @return <CODE>true</CODE> if the provided values are approximately equal, 1162 * or <CODE>false</CODE> if not. 1163 */ 1164 public boolean approximatelyMatch(ByteString value1, ByteString value2) 1165 { 1166 // If the values have been normalized, then we just need to compare their 1167 // byte arrays. 1168 return Arrays.equals(value1.value(), value2.value()); 1169 } 1170 1171 1172 1173 /** 1174 * Indicates whether the provided value has the given substring at the 1175 * specified position. 1176 * 1177 * @param value The value containing the range for which to make the 1178 * determination. 1179 * @param start The position in the value at which to start the 1180 * comparison. 1181 * @param substring The substring to compare against the specified value 1182 * range. 1183 * 1184 * @return <CODE>true</CODE> if the specified portion of the value matches 1185 * the given substring, or <CODE>false</CODE> if it does not. 1186 */ 1187 private boolean hasSubstring(String value, int start, 1188 String substring) 1189 { 1190 try 1191 { 1192 // This can happen since a lot of the rules "look behind" and 1193 // rightfully don't check if it's the first character 1194 if (start < 0) { 1195 return false; 1196 } 1197 1198 int end = start + substring.length(); 1199 1200 // value isn't big enough to do the comparison 1201 if (end > value.length()) 1202 { 1203 return false; 1204 } 1205 1206 for (int i=0,pos=start; pos < end; i++,pos++) 1207 { 1208 if (value.charAt(pos) != substring.charAt(i)) 1209 { 1210 return false; 1211 } 1212 } 1213 1214 return true; 1215 } 1216 catch (Exception e) 1217 { 1218 if (debugEnabled()) 1219 { 1220 TRACER.debugCaught(DebugLogLevel.ERROR, e); 1221 } 1222 1223 return false; 1224 } 1225 } 1226 1227 1228 1229 /** 1230 * Indicates whether the provided character is a vowel (including "Y"). 1231 * 1232 * @param c The character for which to make the determination. 1233 * 1234 * @return <CODE>true</CODE> if the provided character is a vowel, or 1235 * <CODE>false</CODE> if not. 1236 */ 1237 private boolean isVowel(char c) 1238 { 1239 switch (c) 1240 { 1241 case 'A': 1242 case 'E': 1243 case 'I': 1244 case 'O': 1245 case 'U': 1246 case 'Y': 1247 return true; 1248 1249 default: 1250 return false; 1251 } 1252 } 1253 1254 1255 1256 /** 1257 * Indicates whether the provided string appears to be Slavo-Germanic. 1258 * 1259 * @param s The string for which to make the determination. 1260 * 1261 * @return <CODE>true</CODE> if the provided string appears to be 1262 * Slavo-Germanic, or <CODE>false</CODE> if not. 1263 */ 1264 private boolean isSlavoGermanic(String s) 1265 { 1266 return (s.contains("W") || s.contains("K") || s.contains("CZ") || 1267 s.contains("WITZ")); 1268 } 1269 1270 1271 1272 /** 1273 * Indicates whether the provided string appears Germanic (starts with "VAN ", 1274 * "VON ", or "SCH"). 1275 * 1276 * @param s The string for which to make the determination. 1277 * 1278 * @return <CODE>true</CODE> if the provided string appears Germanic, or 1279 * <CODE>false</CODE> if not. 1280 */ 1281 private boolean isGermanic(String s) 1282 { 1283 return (s.startsWith("VAN ") || s.startsWith("VON ") || 1284 s.startsWith("SCH")); 1285 } 1286 } 1287