001    /*
002     * CDDL HEADER START
003     *
004     * The contents of this file are subject to the terms of the
005     * Common Development and Distribution License, Version 1.0 only
006     * (the "License").  You may not use this file except in compliance
007     * with the License.
008     *
009     * You can obtain a copy of the license at
010     * trunk/opends/resource/legal-notices/OpenDS.LICENSE
011     * or https://OpenDS.dev.java.net/OpenDS.LICENSE.
012     * See the License for the specific language governing permissions
013     * and limitations under the License.
014     *
015     * When distributing Covered Code, include this CDDL HEADER in each
016     * file and include the License file at
017     * trunk/opends/resource/legal-notices/OpenDS.LICENSE.  If applicable,
018     * add the following below this CDDL HEADER, with the fields enclosed
019     * by brackets "[]" replaced with your own identifying information:
020     *      Portions Copyright [yyyy] [name of copyright owner]
021     *
022     * CDDL HEADER END
023     *
024     *
025     *      Copyright 2006-2008 Sun Microsystems, Inc.
026     */
027    package org.opends.server.schema;
028    
029    
030    
031    import java.util.Arrays;
032    
033    import org.opends.server.admin.std.server.ApproximateMatchingRuleCfg;
034    import org.opends.server.api.ApproximateMatchingRule;
035    import org.opends.server.config.ConfigException;
036    import org.opends.server.protocols.asn1.ASN1OctetString;
037    import org.opends.server.types.ByteString;
038    import org.opends.server.types.DirectoryException;
039    import org.opends.server.types.InitializationException;
040    import org.opends.server.types.DebugLogLevel;
041    
042    import static org.opends.server.loggers.debug.DebugLogger.*;
043    import org.opends.server.loggers.debug.DebugTracer;
044    import static org.opends.server.schema.SchemaConstants.*;
045    
046    
047    
048    /**
049     * This class defines an approximate matching rule based on the Double Metaphone
050     * algorithm.  The Metaphone and Double Metaphone algorithms were originally
051     * devised by Lawrence Philips (published in the December 1990 issue of
052     * <I>Computer Language</I> and the
053     * <A HREF="http://www.cuj.com/documents/s=8038/cuj0006philips/">June 2000 issue
054     * of <I>C/C++ Users Journal</I></A>, respectively), and this version of the
055     * algorithm is based on a version modified by Kevin Atkinson to include
056     * bugfixes and additional functionality (source is available
057     * <A HREF="http://aspell.net/metaphone/dmetaph.cpp">here</A> and additional
058     * Metaphone and Double Metaphone information is available at
059     * <A HREF="http://aspell.net/metaphone/">http://aspell.net/metaphone/</A>).
060     * This implementation is largely the same as the one provided by Kevin
061     * Atkinson, but it has been re-written for better readability, for more
062     * efficiency, to get rid of checks for conditions that can't possibly happen,
063     * and to get rid of redundant checks that aren't needed.  It has also been
064     * updated to always only generate a single value rather than one or possibly
065     * two values.
066     */
067    public class DoubleMetaphoneApproximateMatchingRule
068           extends ApproximateMatchingRule
069    {
070      /**
071       * The tracer object for the debug logger.
072       */
073      private static final DebugTracer TRACER = getTracer();
074    
075    
076    
077      /**
078       * Creates a new instance of this double metaphone approximate matching rule.
079       */
080      public DoubleMetaphoneApproximateMatchingRule()
081      {
082        super();
083      }
084    
085    
086    
087      /**
088       * {@inheritDoc}
089       */
090      public void initializeMatchingRule(ApproximateMatchingRuleCfg configuration)
091             throws ConfigException, InitializationException
092      {
093        // No initialization is required.
094      }
095    
096    
097    
098      /**
099       * Retrieves the common name for this matching rule.
100       *
101       * @return  The common name for this matching rule, or <CODE>null</CODE> if
102       * it does not have a name.
103       */
104      public String getName()
105      {
106        return AMR_DOUBLE_METAPHONE_NAME;
107      }
108    
109    
110    
111      /**
112       * Retrieves the OID for this matching rule.
113       *
114       * @return  The OID for this matching rule.
115       */
116      public String getOID()
117      {
118        return AMR_DOUBLE_METAPHONE_OID;
119      }
120    
121    
122    
123      /**
124       * Retrieves the description for this matching rule.
125       *
126       * @return  The description for this matching rule, or <CODE>null</CODE> if
127       *          there is none.
128       */
129      public String getDescription()
130      {
131        // There is no standard description for this matching rule.
132        return AMR_DOUBLE_METAPHONE_DESCRIPTION;
133      }
134    
135    
136    
137      /**
138       * Retrieves the OID of the syntax with which this matching rule is
139       * associated.
140       *
141       * @return  The OID of the syntax with which this matching rule is associated.
142       */
143      public String getSyntaxOID()
144      {
145        // Approximate matching is really only appropriate for DirectoryString
146        // values.
147        return SYNTAX_DIRECTORY_STRING_OID;
148      }
149    
150    
151    
152      /**
153       * Retrieves the normalized form of the provided value, which is best suited
154       * for efficiently performing matching operations on that value.
155       *
156       * @param  value  The value to be normalized.
157       *
158       * @return  The normalized version of the provided value.
159       *
160       * @throws  DirectoryException  If the provided value is invalid according to
161       *                              the associated attribute syntax.
162       */
163      public ByteString normalizeValue(ByteString value)
164             throws DirectoryException
165      {
166        String valueString = value.stringValue();
167        int length = valueString.length();
168        if (length == 0)
169        {
170          // The value is empty, so it is already normalized.
171          return new ASN1OctetString();
172        }
173    
174        int last = length - 1;
175    
176    
177        // Pad the value to allow for checks to go past the end of the value.
178        valueString = valueString.toUpperCase() + "     ";
179    
180    
181        // The metaphone value that is being constructed.
182        StringBuilder metaphone = new StringBuilder(4);
183    
184    
185        // Skip over GN, KN, PN, WR, and PS at the beginning of a word.
186        int pos = 0;
187        String substring = valueString.substring(0, 2);
188        if (substring.equals("GN") || substring.equals("KN") ||
189            substring.equals("PN") || substring.equals("WR") ||
190            substring.equals("PS"))
191        {
192          pos++;
193        }
194    
195    
196        // 'X' at the beginning of a word will sound like Z, but Z will always be
197        // mapped to S.
198        else if (valueString.charAt(0) == 'X')
199        {
200          metaphone.append("S");
201          pos++;
202        }
203    
204    
205        // Loop until we have at least four metaphone characters or have reached the
206        // end of the string.
207        while ((metaphone.length() < 4) && (pos < length))
208        {
209          // Check the character at the current position against various targets.
210          char posMinusFour;
211          char posMinusThree;
212          char posMinusTwo;
213          char posMinusOne;
214          char posPlusOne;
215          char posPlusTwo;
216          switch (valueString.charAt(pos))
217          {
218            case 'A':
219            case 'E':
220            case 'I':
221            case 'O':
222            case 'U':
223            case 'Y':
224              // All initial vowels map to 'A'.  All others will be ignored.
225              if (pos == 0)
226              {
227                metaphone.append("A");
228              }
229    
230              pos++;
231              break;
232    
233    
234            case 'B':
235              // B and BB will be mapped to P, with the exception of "MB" as in
236              // "crumb", but that will be handled elsewhere.
237              metaphone.append("P");
238    
239              if (valueString.charAt(++pos) == 'B')
240              {
241                pos++;
242              }
243    
244              break;
245    
246    
247            case 'C':
248              // Check for various Germanic sequences, which will be mapped to 'K'.
249              // This basically includes all occurrences of "ACH" where the
250              // preceding character is not a vowel and the following character is
251              // neither an 'E' nor an 'I' except in "BACHER" and "MACHER".
252              if ((pos > 1) &&
253                  (! isVowel(posMinusTwo = valueString.charAt(pos-2))) &&
254                  hasSubstring(valueString, pos-1, "ACH") &&
255                  ((posPlusTwo = valueString.charAt(pos+2)) != 'I') &&
256                  ((posPlusTwo != 'E') ||
257                   ((valueString.charAt(pos+3) == 'R') &&
258                    ((posMinusTwo == 'B') || (posMinusTwo == 'M')))))
259              {
260                metaphone.append("K");
261                pos += 2;
262                break;
263              }
264    
265    
266              // Check for a special case of "caesar", which will be maped to 'S'.
267              if ((pos == 0) && hasSubstring(valueString, pos+1, "AESAR"))
268              {
269                metaphone.append("S");
270                pos += 2;
271                break;
272              }
273    
274    
275              // CH can be treated in lots of different ways.
276              if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
277              {
278                // Check for "chia" as in "chianti" and map to 'K'.
279                if (hasSubstring(valueString, pos+2, "IA"))
280                {
281                  metaphone.append("K");
282                  pos += 2;
283                  break;
284                }
285    
286                // Check for "chae" as in "michael" and map to 'K'.
287                if (hasSubstring(valueString, pos+2, "AE"))
288                {
289                  metaphone.append("K");
290                  pos += 2;
291                  break;
292                }
293    
294                // Check for a Greek root at the beginning of the value like
295                // chemistry or chorus and map to 'K'.
296                if ((pos == 0) && (! hasSubstring(valueString, 2, "ORE")) &&
297                    (hasSubstring(valueString, 2, "ARAC") ||
298                     hasSubstring(valueString, 2, "ARIS") ||
299                     hasSubstring(valueString, 2, "OR") ||
300                     hasSubstring(valueString, 2, "YM") ||
301                     hasSubstring(valueString, 2, "IA") ||
302                     hasSubstring(valueString, 2, "EM")))
303                {
304                  metaphone.append("K");
305                  pos += 2;
306                  break;
307                }
308    
309    
310                // Check for "CH" values that produce a "KH" sound that will be
311                // mapped to 'K'.
312                if (isGermanic(valueString) ||
313                    hasSubstring(valueString, pos-2, "ORCHES") ||
314                    hasSubstring(valueString, pos-2, "ARCHIT") ||
315                    hasSubstring(valueString, pos-2, "ORCHID") ||
316                    ((posPlusTwo = valueString.charAt(pos+2)) == 'T') ||
317                    (posPlusTwo == 'S') ||
318                    (((pos == 0) ||
319                     (((posMinusOne = valueString.charAt(pos-1)) == 'A') ||
320                       (posMinusOne == 'O') || (posMinusOne == 'U') ||
321                       (posMinusOne == 'E'))) &&
322                     ((posPlusTwo == 'L') || (posPlusTwo == 'R') ||
323                      (posPlusTwo == 'N')|| (posPlusTwo == 'M') ||
324                      (posPlusTwo == 'B')|| (posPlusTwo == 'H') ||
325                      (posPlusTwo == 'F')|| (posPlusTwo == 'V') ||
326                      (posPlusTwo == 'W'))))
327                {
328                  metaphone.append("K");
329                  pos += 2;
330                  break;
331                }
332    
333    
334                // All other "CH" values.
335                if (pos > 0)
336                {
337                  if (hasSubstring(valueString, 0, "MC"))
338                  {
339                    metaphone.append("K");
340                  }
341                  else
342                  {
343                    metaphone.append("X");
344                  }
345                }
346                else
347                {
348                  metaphone.append("X");
349                }
350    
351                pos += 2;
352                break;
353              }
354    
355    
356              // Check for "CZ" as in "czerny" but not "wicz" and map to 'S'.
357              if ((posPlusOne == 'Z') &&
358                  (! hasSubstring(valueString, pos-2, "WI")))
359              {
360                metaphone.append("S");
361                pos += 2;
362                break;
363              }
364    
365    
366              // Check for "CIA" as in "focaccia" and map to 'X'.
367              if ((posPlusOne == 'I') && (valueString.charAt(pos+2) == 'A'))
368              {
369                metaphone.append("X");
370                pos += 3;
371                break;
372              }
373    
374    
375              // Check for a double C but not in values that start with "McC"
376              if ((posPlusOne == 'C') &&
377                  (! ((pos == 1) && valueString.charAt(0) == 'M')))
378              {
379                if ((((posPlusTwo = valueString.charAt(pos+2)) == 'I') ||
380                     (posPlusTwo == 'E') || (posPlusTwo == 'H')) &&
381                    (! ((posPlusTwo == 'H') && valueString.charAt(pos+3) == 'U')))
382                {
383                  if (((pos == 1) && (valueString.charAt(pos-1) == 'A')) ||
384                      hasSubstring(valueString, pos-1, "UCCEE") ||
385                      hasSubstring(valueString, pos-1, "UCCES"))
386                  {
387                    // Values like "accident", "accede", and "succeed".
388                    metaphone.append("K");
389                    pos += 2;
390                    break;
391                  }
392                  else
393                  {
394                    // Values like "bacci" or "bertucci".
395                    metaphone.append("X");
396                    pos += 3;
397                    break;
398                  }
399                }
400                else
401                {
402                  // This is Pierce's Rule, whatever that means.
403                  metaphone.append("K");
404                  pos += 2;
405                  break;
406                }
407              }
408    
409    
410              // Check for CK, CG, or CQ and map to 'K'.  Check for CI, CE, and CY
411              // and map to "S".
412              if (((posPlusOne = valueString.charAt(pos+1)) == 'K') ||
413                  (posPlusOne == 'G') || (posPlusOne == 'Q'))
414              {
415                metaphone.append("K");
416                pos += 2;
417                break;
418              }
419    
420    
421              // Check for CI, CE, or CY and map to 'S'.
422              if ((posPlusOne == 'I') || (posPlusOne == 'E') || (posPlusOne == 'Y'))
423              {
424                metaphone.append("S");
425                pos += 2;
426                break;
427              }
428    
429    
430              // All other cases of "C" will be mapped to 'K'.  However, the number
431              // of positions that we skip ahead may vary.  If there is a value that
432              // consists of two words like "mac caffrey", then skip ahead three.
433              // For the character combinations of "CK" and "CQ", then skip ahead
434              // two.  For the character combinations of "CC" except "CCE" and
435              // "CCI", then skip ahead two.  For all other cases, skip ahead one.
436              metaphone.append("K");
437              switch (valueString.charAt(pos+1))
438              {
439                case ' ':
440                  switch (valueString.charAt(pos+2))
441                  {
442                    case 'C':
443                    case 'Q':
444                    case 'G':
445                      pos += 3;
446                      break;
447                    default:
448                      pos++;
449                      break;
450                  }
451                  break;
452    
453                case 'K':
454                case 'Q':
455                  pos += 2;
456                  break;
457    
458                case 'C':
459                  switch (valueString.charAt(pos+2))
460                  {
461                    case 'E':
462                    case 'I':
463                      pos++;
464                      break;
465                    default:
466                      pos += 2;
467                      break;
468                  }
469                  break;
470                default:
471                  pos++;
472              }
473              break;
474    
475    
476            case 'D':
477              // DG will be mapped to either 'J' (in cases like edge) or 'TK' (in
478              // cases like Edgar).
479              if ((posPlusOne = valueString.charAt(pos+1)) == 'G')
480              {
481                if (((posPlusTwo = valueString.charAt(pos+2)) == 'I') ||
482                    (posPlusTwo == 'E') || (posPlusTwo == 'Y'))
483                {
484                  metaphone.append("J");
485                  pos += 3;
486                  break;
487                }
488                else
489                {
490                  metaphone.append("TK");
491                  pos += 2;
492                  break;
493                }
494              }
495    
496    
497              // DT and DD will be mapped to 'T'.
498              if ((posPlusOne == 'T') || (posPlusOne == 'D'))
499              {
500                metaphone.append("T");
501                pos += 2;
502                break;
503              }
504    
505    
506              // All other cases will be mapped to 'T'.
507              metaphone.append("T");
508              pos++;
509              break;
510    
511    
512            case 'F':
513              // F always maps to F.  If there is a double F, then skip the second
514              // one.
515              metaphone.append("F");
516              pos++;
517              if (valueString.charAt(pos) == 'F')
518              {
519                pos++;
520              }
521              break;
522    
523    
524            case 'G':
525              if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
526              {
527                // A "GH" that is not preceded by a vowel will be mapped to 'K'.
528                if ((pos > 0) && (! isVowel(valueString.charAt(pos-1))))
529                {
530                  metaphone.append("K");
531                  pos += 2;
532                  break;
533                }
534    
535                if (pos == 0)
536                {
537                  if (valueString.charAt(pos+2) == 'I')
538                  {
539                    // Words like ghislane or ghiradelli
540                    metaphone.append("J");
541                  }
542                  else
543                  {
544                    metaphone.append("K");
545                  }
546    
547                  pos += 2;
548                  break;
549                }
550    
551                // A refined version of Parker's Rule.
552                if (((pos > 1) &&
553                     (((posMinusTwo = valueString.charAt(pos-2)) == 'B') ||
554                      (posMinusTwo == 'H') || (posMinusTwo == 'D'))) ||
555                    ((pos > 2) &&
556                     (((posMinusThree = valueString.charAt(pos-3)) == 'B') ||
557                      (posMinusThree == 'H') || (posMinusThree == 'D'))) ||
558                    ((pos > 3) &&
559                     (((posMinusFour = valueString.charAt(pos-4)) == 'B') ||
560                      (posMinusFour == 'H'))))
561                {
562                  pos += 2;
563                  break;
564                }
565                else
566                {
567                  if ((pos > 2) && (valueString.charAt(pos-1) == 'U') &&
568                      (((posMinusThree = valueString.charAt(pos-3)) == 'C') ||
569                       (posMinusThree == 'G') || (posMinusThree == 'L') ||
570                       (posMinusThree == 'R') || (posMinusThree == 'T')))
571                  {
572                    // Words like laugh, McLaughlin, cough, rough are mapped to 'F'.
573                    metaphone.append("F");
574                  }
575                  else if ((pos > 0) && (valueString.charAt(pos-1) != 'I'))
576                  {
577                    metaphone.append("K");
578                  }
579    
580                  pos += 2;
581                  break;
582                }
583              }
584    
585    
586              if (posPlusOne == 'N')
587              {
588                if ((pos == 1) && isVowel(valueString.charAt(0)) &&
589                    (! isSlavoGermanic(valueString)))
590                {
591                  metaphone.append("KN");
592                  pos += 2;
593                  break;
594                }
595                else
596                {
597                  if ((! hasSubstring(valueString, pos+2, "EY")) &&
598                      (! isSlavoGermanic(valueString)))
599                  {
600                    metaphone.append("N");
601                  }
602                  else
603                  {
604                    metaphone.append("KN");
605                  }
606    
607                  pos += 2;
608                  break;
609                }
610              }
611    
612    
613              // GLI as in tagliaro will be mapped to "KL".
614              if ((posPlusOne == 'L') && (valueString.charAt(pos+2) == 'I'))
615              {
616                metaphone.append("KL");
617                pos += 2;
618                break;
619              }
620    
621    
622              // Forms of GY, GE, and GI at the beginning of a word will map to 'K'.
623              if ((pos == 0) &&
624                  ((posPlusOne == 'Y') ||
625                   (substring = valueString.substring(pos+1,pos+3)).equals("ES") ||
626                   substring.equals("EP") || substring.equals("EB") ||
627                   substring.equals("EL") || substring.equals("EY") ||
628                   substring.equals("IB") || substring.equals("IL") ||
629                   substring.equals("IN") || substring.equals("IE") ||
630                   substring.equals("EI") || substring.equals("ER")))
631              {
632                metaphone.append("K");
633                pos += 2;
634                break;
635              }
636    
637    
638              // Some occurrences of GER and GY in a word will be mapped to 'K'.
639              posPlusTwo = valueString.charAt(pos+2);
640              if ((((posPlusOne == 'E') && (posPlusTwo == 'R')) ||
641                  (posPlusOne == 'Y')) &&
642                  ((posMinusOne = valueString.charAt(pos-1)) != 'E') &&
643                  (posMinusOne != 'I') &&
644                  (! hasSubstring(valueString, 0, "DANGER")) &&
645                  (! hasSubstring(valueString, 0, "RANGER")) &&
646                  (! hasSubstring(valueString, 0, "MANGER")) &&
647                  (! hasSubstring(valueString, pos-1, "RGY")) &&
648                  (! hasSubstring(valueString, pos-1, "OGY")))
649              {
650                metaphone.append("K");
651                pos += 2;
652                break;
653              }
654    
655    
656              // Check for Italian uses like 'biaggi" and map to 'J'.
657              if ((posPlusOne == 'E') || (posPlusOne == 'I') ||
658                  (posPlusOne == 'Y') ||
659                  hasSubstring(valueString, pos-1, "AGGI") ||
660                  hasSubstring(valueString, pos-1, "OGGI"))
661              {
662                // Germanic uses will be mapped to 'K'.
663                if (isGermanic(valueString) ||
664                    hasSubstring(valueString, pos+1, "ET"))
665                {
666                  metaphone.append("K");
667                }
668                else
669                {
670                  metaphone.append("J");
671                }
672    
673                pos += 2;
674                break;
675              }
676    
677    
678              // All other cases will be mapped to 'K'.  If there is a double G,
679              // then skip two.  Otherwise, just skip one.
680              metaphone.append("K");
681              pos++;
682    
683              if (posPlusOne == 'G')
684              {
685                pos++;
686              }
687    
688              break;
689    
690    
691            case 'H':
692              // The letter 'H' will only be processed if it is immediately followed
693              // by a vowel and is either the start of the word or preceded by a
694              // vowel.
695              if (isVowel(valueString.charAt(pos+1)))
696              {
697                if ((pos == 0) || isVowel(valueString.charAt(pos-1)))
698                {
699                  metaphone.append("H");
700                  pos++;
701                }
702              }
703    
704              pos++;
705              break;
706    
707    
708            case 'J':
709              // Take care of obvious Spanish uses that should map to 'H'.
710              if (hasSubstring(valueString, 0, "SAN "))
711              {
712                metaphone.append("H");
713                pos++;
714                break;
715              }
716    
717              if (hasSubstring(valueString, pos, "JOSE"))
718              {
719                if ((pos == 0) && (valueString.charAt(pos+4) == ' '))
720                {
721                  metaphone.append("H");
722                }
723                else
724                {
725                  metaphone.append("J");
726                }
727    
728                pos++;
729                break;
730              }
731    
732    
733              // All other cases will be mapped to 'J'.
734              metaphone.append("J");
735    
736              if (valueString.charAt(pos+1) == 'J')
737              {
738                pos++;
739              }
740    
741              pos++;
742              break;
743    
744    
745            case 'K':
746              // 'K' will always be mapped to 'K'.  KK will be treated like K.
747              metaphone.append("K");
748    
749              if (valueString.charAt(pos+1) == 'K')
750              {
751                pos++;
752              }
753    
754              pos++;
755              break;
756    
757    
758            case 'L':
759              // 'L' will always be mapped to 'L'.  LL will be treated like L, even
760              // for potential Spanish uses.
761              metaphone.append("L");
762    
763              if (valueString.charAt(pos+1) == 'L')
764              {
765                pos++;
766              }
767    
768              pos++;
769              break;
770    
771    
772            case 'M':
773              // 'M' will always be mapped to 'M'.  MM will be treated like M.
774              // UMB in cases like "dumb" and "thumb" will be treated like M.
775              metaphone.append("M");
776    
777              if (valueString.charAt(pos+1) == 'M')
778              {
779                pos++;
780              }
781              else if (hasSubstring(valueString, pos-1, "UMB"))
782              {
783                if (((pos+1) == last) ||
784                    hasSubstring(valueString, pos+2, "ER"))
785                {
786                  pos++;
787                }
788              }
789    
790              pos++;
791              break;
792    
793    
794            case 'N':
795              // 'N' will always be mapped to 'N'.  NN will be treated like N.
796              metaphone.append("N");
797    
798              if (valueString.charAt(pos+1) == 'N')
799              {
800                pos++;
801              }
802    
803              pos++;
804              break;
805    
806    
807            case 'P':
808              // PH will be mapped to 'F'.
809              if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
810              {
811                metaphone.append("F");
812                pos += 2;
813                break;
814              }
815    
816    
817              // All other cases will be mapped to 'P', with PP and PB being treated
818              // like P.
819              metaphone.append("P");
820    
821              if ((posPlusOne == 'P') || (posPlusOne == 'B'))
822              {
823                pos++;
824              }
825    
826              pos++;
827              break;
828    
829    
830            case 'Q':
831              // 'Q' will always be mapped to 'K'.  QQ will be treated like Q.
832              metaphone.append("K");
833    
834              if (valueString.charAt(pos+1) == 'Q')
835              {
836                pos++;
837              }
838    
839              pos++;
840              break;
841    
842    
843            case 'R':
844              // Ignore R at the end of French words.
845              if ((pos == last) && (! isSlavoGermanic(valueString)) &&
846                  hasSubstring(valueString, pos-2, "IE") &&
847                  (! hasSubstring(valueString, pos-4, "ME")) &&
848                  (! hasSubstring(valueString, pos-4, "MA")))
849              {
850                pos++;
851                break;
852              }
853    
854    
855              // All other cases will be mapped to 'R', with RR treated like R.
856              metaphone.append("R");
857    
858              if (valueString.charAt(pos+1) == 'R')
859              {
860                pos++;
861              }
862    
863              pos++;
864              break;
865    
866    
867            case 'S':
868              // Special cases like isle and carlysle will be silent.
869              if (hasSubstring(valueString, pos-1, "ISL") ||
870                  hasSubstring(valueString, pos-1, "YSL"))
871              {
872                pos++;
873                break;
874              }
875    
876    
877              // Special case of sugar mapped to 'X'.
878              if (hasSubstring(valueString, pos+1, "UGAR"))
879              {
880                metaphone.append("X");
881                pos++;
882                break;
883              }
884    
885    
886              // SH is generally mapped to 'X', but not in Germanic cases.
887              if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
888              {
889                if (hasSubstring(valueString, pos+1, "HEIM") ||
890                    hasSubstring(valueString, pos+1, "HOEK") ||
891                    hasSubstring(valueString, pos+1, "HOLM") ||
892                    hasSubstring(valueString, pos+1, "HOLZ"))
893                {
894                  metaphone.append("S");
895                }
896                else
897                {
898                  metaphone.append("X");
899                }
900    
901                pos += 2;
902                break;
903              }
904    
905    
906              // Italian and Armenian cases will map to "S".
907              if (hasSubstring(valueString, pos+1, "IO") ||
908                  hasSubstring(valueString, pos+1, "IA"))
909              {
910                metaphone.append("S");
911                pos += 3;
912                break;
913              }
914    
915    
916              // SZ should be mapped to 'S'.
917              if (posPlusOne == 'Z')
918              {
919                metaphone.append("S");
920                pos += 2;
921                break;
922              }
923    
924    
925              // Various combinations at the beginning of words will be mapped to
926              // 'S'.
927              if ((pos == 0) &&
928                  ((posPlusOne == 'M') || (posPlusOne == 'N') ||
929                   (posPlusOne == 'L') || (posPlusOne == 'W')))
930              {
931                metaphone.append("S");
932                pos++;
933                break;
934              }
935    
936    
937              // SC should be mapped to either SK, X, or S.
938              if (posPlusOne == 'C')
939              {
940                if ((posPlusTwo = valueString.charAt(pos+2)) == 'H')
941                {
942                  if (hasSubstring(valueString, pos+3, "OO") ||
943                      hasSubstring(valueString, pos+3, "UY") ||
944                      hasSubstring(valueString, pos+3, "ED") ||
945                      hasSubstring(valueString, pos+3, "EM"))
946                  {
947                    metaphone.append("SK");
948                  }
949                  else
950                  {
951                    metaphone.append("X");
952                  }
953    
954                  pos += 3;
955                  break;
956                }
957    
958                if ((posPlusTwo == 'I') || (posPlusTwo == 'E') ||
959                    (posPlusTwo == 'Y'))
960                {
961                  metaphone.append("S");
962                  pos += 3;
963                  break;
964                }
965    
966                metaphone.append("SK");
967                pos += 3;
968                break;
969              }
970    
971    
972              // Ignore a trailing S in French words.  All others will be mapped to
973              // 'S'.
974              if (! ((pos == last) &&
975                     (hasSubstring(valueString, pos-2, "AI") ||
976                      hasSubstring(valueString, pos-2, "OI"))))
977              {
978                metaphone.append("S");
979              }
980    
981              if ((posPlusOne == 'S') || (posPlusOne == 'Z'))
982              {
983                pos++;
984              }
985    
986              pos++;
987              break;
988    
989    
990            case 'T':
991              // "TION", "TIA", and "TCH" will be mapped to 'X'.
992              if (hasSubstring(valueString, pos, "TION") ||
993                  hasSubstring(valueString, pos, "TIA") ||
994                  hasSubstring(valueString, pos, "TCH"))
995              {
996                metaphone.append("X");
997                pos += 3;
998                break;
999              }
1000    
1001    
1002              // TH or TTH  will be mapped to either T (for Germanic cases) or
1003              // 0 (zero) for the rest.
1004              if (((posPlusOne = valueString.charAt(pos+1)) == 'H') ||
1005                  ((posPlusOne == 'T') && (valueString.charAt(pos+2) == 'H')))
1006              {
1007                if (isGermanic(valueString) ||
1008                    hasSubstring(valueString, pos+2, "OM") ||
1009                    hasSubstring(valueString, pos+2, "AM"))
1010                {
1011                  metaphone.append("T");
1012                }
1013                else
1014                {
1015                  metaphone.append("0");
1016                }
1017    
1018                pos += 2;
1019                break;
1020              }
1021    
1022    
1023              // All other cases will map to T, with TT and TD being treated like T.
1024              metaphone.append("T");
1025    
1026              if ((posPlusOne == 'T') || (posPlusOne == 'D'))
1027              {
1028                pos++;
1029              }
1030    
1031              pos++;
1032              break;
1033    
1034    
1035            case 'V':
1036              // 'V' will always be mapped to 'F', with VV treated like V.
1037              metaphone.append("F");
1038    
1039              if (valueString.charAt(pos+1) == 'V')
1040              {
1041                pos++;
1042              }
1043    
1044              pos++;
1045              break;
1046    
1047    
1048            case 'W':
1049              // WR should always map to R.
1050              if ((posPlusOne = valueString.charAt(pos+1)) == 'R')
1051              {
1052                metaphone.append("R");
1053                pos += 2;
1054                break;
1055              }
1056    
1057    
1058              // W[AEIOUYH] at the beginning of the word should be mapped to A.
1059              if ((pos == 0) && (isVowel(posPlusOne) || (posPlusOne == 'H')))
1060              {
1061                metaphone.append("A");
1062    
1063                // FIXME -- This isn't in the algorithm as written.  Should it be?
1064                pos += 2;
1065                break;
1066              }
1067    
1068    
1069              // A Polish value like WICZ or WITZ should be mapped to TS.
1070              if (hasSubstring(valueString, pos+1, "WICZ") ||
1071                  hasSubstring(valueString, pos+1, "WITZ"))
1072              {
1073                metaphone.append("TS");
1074                pos += 4;
1075                break;
1076              }
1077    
1078    
1079              // Otherwise, we'll just skip it.
1080              pos++;
1081              break;
1082    
1083    
1084            case 'X':
1085              // X maps to KS except at the end of French words.
1086              if (! ((pos == last) &&
1087                     (hasSubstring(valueString, pos-3, "IAU") ||
1088                      hasSubstring(valueString, pos-3, "EAU") ||
1089                      hasSubstring(valueString, pos-2, "AU") ||
1090                      hasSubstring(valueString, pos-2, "OU"))))
1091              {
1092                metaphone.append("KS");
1093              }
1094    
1095              if (((posPlusOne = valueString.charAt(pos+1)) == 'C') ||
1096                  (posPlusOne == 'X'))
1097              {
1098                pos++;
1099              }
1100    
1101              pos++;
1102              break;
1103    
1104    
1105            case 'Z':
1106              // Chinese usages like zhao will map to J.
1107              if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
1108              {
1109                metaphone.append("J");
1110                pos += 2;
1111                break;
1112              }
1113    
1114    
1115              // All other cases map to "S".  ZZ will be treated like Z.
1116              metaphone.append("S");
1117    
1118              if (posPlusOne == 'Z')
1119              {
1120                pos++;
1121              }
1122    
1123              pos++;
1124              break;
1125    
1126    
1127            case '\u00C7': // C with a cedilla
1128              // This will always be mapped to 'S'.
1129              metaphone.append("S");
1130              pos++;
1131              break;
1132    
1133    
1134            case '\u00D1': // N with a tilde
1135              // This will always be mapped to 'N'.
1136              metaphone.append("N");
1137              pos++;
1138              break;
1139    
1140    
1141            default:
1142              // We don't have any special treatment for this character, so skip it.
1143              pos++;
1144              break;
1145          }
1146        }
1147    
1148    
1149        return new ASN1OctetString(metaphone.toString());
1150      }
1151    
1152    
1153    
1154      /**
1155       * Indicates whether the two provided normalized values are approximately
1156       * equal to each other.
1157       *
1158       * @param  value1  The normalized form of the first value to compare.
1159       * @param  value2  The normalized form of the second value to compare.
1160       *
1161       * @return  <CODE>true</CODE> if the provided values are approximately equal,
1162       *          or <CODE>false</CODE> if not.
1163       */
1164      public boolean approximatelyMatch(ByteString value1, ByteString value2)
1165      {
1166        // If the values have been normalized, then we just need to compare their
1167        // byte arrays.
1168        return Arrays.equals(value1.value(), value2.value());
1169      }
1170    
1171    
1172    
1173      /**
1174       * Indicates whether the provided value has the given substring at the
1175       * specified position.
1176       *
1177       * @param  value      The value containing the range for which to make the
1178       *                    determination.
1179       * @param  start      The position in the value at which to start the
1180       *                    comparison.
1181       * @param  substring  The substring to compare against the specified value
1182       *                    range.
1183       *
1184       * @return  <CODE>true</CODE> if the specified portion of the value matches
1185       *          the given substring, or <CODE>false</CODE> if it does not.
1186       */
1187      private boolean hasSubstring(String value, int start,
1188                                   String substring)
1189      {
1190        try
1191        {
1192          // This can happen since a lot of the rules "look behind" and
1193          // rightfully don't check if it's the first character
1194          if (start < 0) {
1195            return false;
1196          }
1197    
1198          int end = start + substring.length();
1199    
1200          // value isn't big enough to do the comparison
1201          if (end > value.length())
1202          {
1203            return false;
1204          }
1205    
1206          for (int i=0,pos=start; pos < end; i++,pos++)
1207          {
1208            if (value.charAt(pos) != substring.charAt(i))
1209            {
1210              return false;
1211            }
1212          }
1213    
1214          return true;
1215        }
1216        catch (Exception e)
1217        {
1218          if (debugEnabled())
1219          {
1220            TRACER.debugCaught(DebugLogLevel.ERROR, e);
1221          }
1222    
1223          return false;
1224        }
1225      }
1226    
1227    
1228    
1229      /**
1230       * Indicates whether the provided character is a vowel (including "Y").
1231       *
1232       * @param  c  The character for which to make the determination.
1233       *
1234       * @return  <CODE>true</CODE> if the provided character is a vowel, or
1235       *          <CODE>false</CODE> if not.
1236       */
1237      private boolean isVowel(char c)
1238      {
1239        switch (c)
1240        {
1241          case 'A':
1242          case 'E':
1243          case 'I':
1244          case 'O':
1245          case 'U':
1246          case 'Y':
1247            return true;
1248    
1249          default:
1250            return false;
1251        }
1252      }
1253    
1254    
1255    
1256      /**
1257       * Indicates whether the provided string appears to be Slavo-Germanic.
1258       *
1259       * @param  s  The string for which to make the determination.
1260       *
1261       * @return  <CODE>true</CODE> if the provided string appears to be
1262       *          Slavo-Germanic, or <CODE>false</CODE> if not.
1263       */
1264      private boolean isSlavoGermanic(String s)
1265      {
1266        return (s.contains("W") || s.contains("K") || s.contains("CZ") ||
1267                s.contains("WITZ"));
1268      }
1269    
1270    
1271    
1272      /**
1273       * Indicates whether the provided string appears Germanic (starts with "VAN ",
1274       * "VON ", or "SCH").
1275       *
1276       * @param  s  The string for which to make the determination.
1277       *
1278       * @return  <CODE>true</CODE> if the provided string appears Germanic, or
1279       *          <CODE>false</CODE> if not.
1280       */
1281      private boolean isGermanic(String s)
1282      {
1283        return (s.startsWith("VAN ") || s.startsWith("VON ") ||
1284                s.startsWith("SCH"));
1285      }
1286    }
1287