/*
|
* CDDL HEADER START
|
*
|
* The contents of this file are subject to the terms of the
|
* Common Development and Distribution License, Version 1.0 only
|
* (the "License"). You may not use this file except in compliance
|
* with the License.
|
*
|
* You can obtain a copy of the license at
|
* trunk/opends/resource/legal-notices/OpenDS.LICENSE
|
* or https://OpenDS.dev.java.net/OpenDS.LICENSE.
|
* See the License for the specific language governing permissions
|
* and limitations under the License.
|
*
|
* When distributing Covered Code, include this CDDL HEADER in each
|
* file and include the License file at
|
* trunk/opends/resource/legal-notices/OpenDS.LICENSE. If applicable,
|
* add the following below this CDDL HEADER, with the fields enclosed
|
* by brackets "[]" replaced with your own identifying information:
|
* Portions Copyright [yyyy] [name of copyright owner]
|
*
|
* CDDL HEADER END
|
*
|
*
|
* Copyright 2006-2008 Sun Microsystems, Inc.
|
*/
|
package org.opends.server.schema;
|
|
|
|
import java.util.Arrays;
|
|
import java.util.Collection;
|
import java.util.Collections;
|
import org.opends.server.api.ApproximateMatchingRule;
|
import org.opends.server.protocols.asn1.ASN1OctetString;
|
import org.opends.server.types.ByteString;
|
import org.opends.server.types.DirectoryException;
|
import org.opends.server.types.DebugLogLevel;
|
|
import static org.opends.server.loggers.debug.DebugLogger.*;
|
import org.opends.server.loggers.debug.DebugTracer;
|
import static org.opends.server.schema.SchemaConstants.*;
|
|
|
|
/**
|
* This class defines an approximate matching rule based on the Double Metaphone
|
* algorithm. The Metaphone and Double Metaphone algorithms were originally
|
* devised by Lawrence Philips (published in the December 1990 issue of
|
* <I>Computer Language</I> and the
|
* <A HREF="http://www.cuj.com/documents/s=8038/cuj0006philips/">June 2000 issue
|
* of <I>C/C++ Users Journal</I></A>, respectively), and this version of the
|
* algorithm is based on a version modified by Kevin Atkinson to include
|
* bugfixes and additional functionality (source is available
|
* <A HREF="http://aspell.net/metaphone/dmetaph.cpp">here</A> and additional
|
* Metaphone and Double Metaphone information is available at
|
* <A HREF="http://aspell.net/metaphone/">http://aspell.net/metaphone/</A>).
|
* This implementation is largely the same as the one provided by Kevin
|
* Atkinson, but it has been re-written for better readability, for more
|
* efficiency, to get rid of checks for conditions that can't possibly happen,
|
* and to get rid of redundant checks that aren't needed. It has also been
|
* updated to always only generate a single value rather than one or possibly
|
* two values.
|
*/
|
class DoubleMetaphoneApproximateMatchingRule
|
extends ApproximateMatchingRule
|
{
|
/**
|
* The tracer object for the debug logger.
|
*/
|
private static final DebugTracer TRACER = getTracer();
|
|
|
|
/**
|
* Creates a new instance of this double metaphone approximate matching rule.
|
*/
|
public DoubleMetaphoneApproximateMatchingRule()
|
{
|
super();
|
}
|
|
|
|
/**
|
* {@inheritDoc}
|
*/
|
public Collection<String> getAllNames()
|
{
|
return Collections.singleton(getName());
|
}
|
|
|
|
/**
|
* Retrieves the common name for this matching rule.
|
*
|
* @return The common name for this matching rule, or <CODE>null</CODE> if
|
* it does not have a name.
|
*/
|
public String getName()
|
{
|
return AMR_DOUBLE_METAPHONE_NAME;
|
}
|
|
|
|
/**
|
* Retrieves the OID for this matching rule.
|
*
|
* @return The OID for this matching rule.
|
*/
|
public String getOID()
|
{
|
return AMR_DOUBLE_METAPHONE_OID;
|
}
|
|
|
|
/**
|
* Retrieves the description for this matching rule.
|
*
|
* @return The description for this matching rule, or <CODE>null</CODE> if
|
* there is none.
|
*/
|
public String getDescription()
|
{
|
// There is no standard description for this matching rule.
|
return AMR_DOUBLE_METAPHONE_DESCRIPTION;
|
}
|
|
|
|
/**
|
* Retrieves the OID of the syntax with which this matching rule is
|
* associated.
|
*
|
* @return The OID of the syntax with which this matching rule is associated.
|
*/
|
public String getSyntaxOID()
|
{
|
// Approximate matching is really only appropriate for DirectoryString
|
// values.
|
return SYNTAX_DIRECTORY_STRING_OID;
|
}
|
|
|
|
/**
|
* Retrieves the normalized form of the provided value, which is best suited
|
* for efficiently performing matching operations on that value.
|
*
|
* @param value The value to be normalized.
|
*
|
* @return The normalized version of the provided value.
|
*
|
* @throws DirectoryException If the provided value is invalid according to
|
* the associated attribute syntax.
|
*/
|
public ByteString normalizeValue(ByteString value)
|
throws DirectoryException
|
{
|
String valueString = value.stringValue();
|
int length = valueString.length();
|
if (length == 0)
|
{
|
// The value is empty, so it is already normalized.
|
return new ASN1OctetString();
|
}
|
|
int last = length - 1;
|
|
|
// Pad the value to allow for checks to go past the end of the value.
|
valueString = valueString.toUpperCase() + " ";
|
|
|
// The metaphone value that is being constructed.
|
StringBuilder metaphone = new StringBuilder(4);
|
|
|
// Skip over GN, KN, PN, WR, and PS at the beginning of a word.
|
int pos = 0;
|
String substring = valueString.substring(0, 2);
|
if (substring.equals("GN") || substring.equals("KN") ||
|
substring.equals("PN") || substring.equals("WR") ||
|
substring.equals("PS"))
|
{
|
pos++;
|
}
|
|
|
// 'X' at the beginning of a word will sound like Z, but Z will always be
|
// mapped to S.
|
else if (valueString.charAt(0) == 'X')
|
{
|
metaphone.append("S");
|
pos++;
|
}
|
|
|
// Loop until we have at least four metaphone characters or have reached the
|
// end of the string.
|
while ((metaphone.length() < 4) && (pos < length))
|
{
|
// Check the character at the current position against various targets.
|
char posMinusFour;
|
char posMinusThree;
|
char posMinusTwo;
|
char posMinusOne;
|
char posPlusOne;
|
char posPlusTwo;
|
switch (valueString.charAt(pos))
|
{
|
case 'A':
|
case 'E':
|
case 'I':
|
case 'O':
|
case 'U':
|
case 'Y':
|
// All initial vowels map to 'A'. All others will be ignored.
|
if (pos == 0)
|
{
|
metaphone.append("A");
|
}
|
|
pos++;
|
break;
|
|
|
case 'B':
|
// B and BB will be mapped to P, with the exception of "MB" as in
|
// "crumb", but that will be handled elsewhere.
|
metaphone.append("P");
|
|
if (valueString.charAt(++pos) == 'B')
|
{
|
pos++;
|
}
|
|
break;
|
|
|
case 'C':
|
// Check for various Germanic sequences, which will be mapped to 'K'.
|
// This basically includes all occurrences of "ACH" where the
|
// preceding character is not a vowel and the following character is
|
// neither an 'E' nor an 'I' except in "BACHER" and "MACHER".
|
if ((pos > 1) &&
|
(! isVowel(posMinusTwo = valueString.charAt(pos-2))) &&
|
hasSubstring(valueString, pos-1, "ACH") &&
|
((posPlusTwo = valueString.charAt(pos+2)) != 'I') &&
|
((posPlusTwo != 'E') ||
|
((valueString.charAt(pos+3) == 'R') &&
|
((posMinusTwo == 'B') || (posMinusTwo == 'M')))))
|
{
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
|
|
// Check for a special case of "caesar", which will be maped to 'S'.
|
if ((pos == 0) && hasSubstring(valueString, pos+1, "AESAR"))
|
{
|
metaphone.append("S");
|
pos += 2;
|
break;
|
}
|
|
|
// CH can be treated in lots of different ways.
|
if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
|
{
|
// Check for "chia" as in "chianti" and map to 'K'.
|
if (hasSubstring(valueString, pos+2, "IA"))
|
{
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
|
// Check for "chae" as in "michael" and map to 'K'.
|
if (hasSubstring(valueString, pos+2, "AE"))
|
{
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
|
// Check for a Greek root at the beginning of the value like
|
// chemistry or chorus and map to 'K'.
|
if ((pos == 0) && (! hasSubstring(valueString, 2, "ORE")) &&
|
(hasSubstring(valueString, 2, "ARAC") ||
|
hasSubstring(valueString, 2, "ARIS") ||
|
hasSubstring(valueString, 2, "OR") ||
|
hasSubstring(valueString, 2, "YM") ||
|
hasSubstring(valueString, 2, "IA") ||
|
hasSubstring(valueString, 2, "EM")))
|
{
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
|
|
// Check for "CH" values that produce a "KH" sound that will be
|
// mapped to 'K'.
|
if (isGermanic(valueString) ||
|
hasSubstring(valueString, pos-2, "ORCHES") ||
|
hasSubstring(valueString, pos-2, "ARCHIT") ||
|
hasSubstring(valueString, pos-2, "ORCHID") ||
|
((posPlusTwo = valueString.charAt(pos+2)) == 'T') ||
|
(posPlusTwo == 'S') ||
|
(((pos == 0) ||
|
(((posMinusOne = valueString.charAt(pos-1)) == 'A') ||
|
(posMinusOne == 'O') || (posMinusOne == 'U') ||
|
(posMinusOne == 'E'))) &&
|
((posPlusTwo == 'L') || (posPlusTwo == 'R') ||
|
(posPlusTwo == 'N')|| (posPlusTwo == 'M') ||
|
(posPlusTwo == 'B')|| (posPlusTwo == 'H') ||
|
(posPlusTwo == 'F')|| (posPlusTwo == 'V') ||
|
(posPlusTwo == 'W'))))
|
{
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
|
|
// All other "CH" values.
|
if (pos > 0)
|
{
|
if (hasSubstring(valueString, 0, "MC"))
|
{
|
metaphone.append("K");
|
}
|
else
|
{
|
metaphone.append("X");
|
}
|
}
|
else
|
{
|
metaphone.append("X");
|
}
|
|
pos += 2;
|
break;
|
}
|
|
|
// Check for "CZ" as in "czerny" but not "wicz" and map to 'S'.
|
if ((posPlusOne == 'Z') &&
|
(! hasSubstring(valueString, pos-2, "WI")))
|
{
|
metaphone.append("S");
|
pos += 2;
|
break;
|
}
|
|
|
// Check for "CIA" as in "focaccia" and map to 'X'.
|
if ((posPlusOne == 'I') && (valueString.charAt(pos+2) == 'A'))
|
{
|
metaphone.append("X");
|
pos += 3;
|
break;
|
}
|
|
|
// Check for a double C but not in values that start with "McC"
|
if ((posPlusOne == 'C') &&
|
(! ((pos == 1) && valueString.charAt(0) == 'M')))
|
{
|
if ((((posPlusTwo = valueString.charAt(pos+2)) == 'I') ||
|
(posPlusTwo == 'E') || (posPlusTwo == 'H')) &&
|
(! ((posPlusTwo == 'H') && valueString.charAt(pos+3) == 'U')))
|
{
|
if (((pos == 1) && (valueString.charAt(pos-1) == 'A')) ||
|
hasSubstring(valueString, pos-1, "UCCEE") ||
|
hasSubstring(valueString, pos-1, "UCCES"))
|
{
|
// Values like "accident", "accede", and "succeed".
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
else
|
{
|
// Values like "bacci" or "bertucci".
|
metaphone.append("X");
|
pos += 3;
|
break;
|
}
|
}
|
else
|
{
|
// This is Pierce's Rule, whatever that means.
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
}
|
|
|
// Check for CK, CG, or CQ and map to 'K'. Check for CI, CE, and CY
|
// and map to "S".
|
if (((posPlusOne = valueString.charAt(pos+1)) == 'K') ||
|
(posPlusOne == 'G') || (posPlusOne == 'Q'))
|
{
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
|
|
// Check for CI, CE, or CY and map to 'S'.
|
if ((posPlusOne == 'I') || (posPlusOne == 'E') || (posPlusOne == 'Y'))
|
{
|
metaphone.append("S");
|
pos += 2;
|
break;
|
}
|
|
|
// All other cases of "C" will be mapped to 'K'. However, the number
|
// of positions that we skip ahead may vary. If there is a value that
|
// consists of two words like "mac caffrey", then skip ahead three.
|
// For the character combinations of "CK" and "CQ", then skip ahead
|
// two. For the character combinations of "CC" except "CCE" and
|
// "CCI", then skip ahead two. For all other cases, skip ahead one.
|
metaphone.append("K");
|
switch (valueString.charAt(pos+1))
|
{
|
case ' ':
|
switch (valueString.charAt(pos+2))
|
{
|
case 'C':
|
case 'Q':
|
case 'G':
|
pos += 3;
|
break;
|
default:
|
pos++;
|
break;
|
}
|
break;
|
|
case 'K':
|
case 'Q':
|
pos += 2;
|
break;
|
|
case 'C':
|
switch (valueString.charAt(pos+2))
|
{
|
case 'E':
|
case 'I':
|
pos++;
|
break;
|
default:
|
pos += 2;
|
break;
|
}
|
break;
|
default:
|
pos++;
|
}
|
break;
|
|
|
case 'D':
|
// DG will be mapped to either 'J' (in cases like edge) or 'TK' (in
|
// cases like Edgar).
|
if ((posPlusOne = valueString.charAt(pos+1)) == 'G')
|
{
|
if (((posPlusTwo = valueString.charAt(pos+2)) == 'I') ||
|
(posPlusTwo == 'E') || (posPlusTwo == 'Y'))
|
{
|
metaphone.append("J");
|
pos += 3;
|
break;
|
}
|
else
|
{
|
metaphone.append("TK");
|
pos += 2;
|
break;
|
}
|
}
|
|
|
// DT and DD will be mapped to 'T'.
|
if ((posPlusOne == 'T') || (posPlusOne == 'D'))
|
{
|
metaphone.append("T");
|
pos += 2;
|
break;
|
}
|
|
|
// All other cases will be mapped to 'T'.
|
metaphone.append("T");
|
pos++;
|
break;
|
|
|
case 'F':
|
// F always maps to F. If there is a double F, then skip the second
|
// one.
|
metaphone.append("F");
|
pos++;
|
if (valueString.charAt(pos) == 'F')
|
{
|
pos++;
|
}
|
break;
|
|
|
case 'G':
|
if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
|
{
|
// A "GH" that is not preceded by a vowel will be mapped to 'K'.
|
if ((pos > 0) && (! isVowel(valueString.charAt(pos-1))))
|
{
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
|
if (pos == 0)
|
{
|
if (valueString.charAt(pos+2) == 'I')
|
{
|
// Words like ghislane or ghiradelli
|
metaphone.append("J");
|
}
|
else
|
{
|
metaphone.append("K");
|
}
|
|
pos += 2;
|
break;
|
}
|
|
// A refined version of Parker's Rule.
|
if (((pos > 1) &&
|
(((posMinusTwo = valueString.charAt(pos-2)) == 'B') ||
|
(posMinusTwo == 'H') || (posMinusTwo == 'D'))) ||
|
((pos > 2) &&
|
(((posMinusThree = valueString.charAt(pos-3)) == 'B') ||
|
(posMinusThree == 'H') || (posMinusThree == 'D'))) ||
|
((pos > 3) &&
|
(((posMinusFour = valueString.charAt(pos-4)) == 'B') ||
|
(posMinusFour == 'H'))))
|
{
|
pos += 2;
|
break;
|
}
|
else
|
{
|
if ((pos > 2) && (valueString.charAt(pos-1) == 'U') &&
|
(((posMinusThree = valueString.charAt(pos-3)) == 'C') ||
|
(posMinusThree == 'G') || (posMinusThree == 'L') ||
|
(posMinusThree == 'R') || (posMinusThree == 'T')))
|
{
|
// Words like laugh, McLaughlin, cough, rough are mapped to 'F'.
|
metaphone.append("F");
|
}
|
else if ((pos > 0) && (valueString.charAt(pos-1) != 'I'))
|
{
|
metaphone.append("K");
|
}
|
|
pos += 2;
|
break;
|
}
|
}
|
|
|
if (posPlusOne == 'N')
|
{
|
if ((pos == 1) && isVowel(valueString.charAt(0)) &&
|
(! isSlavoGermanic(valueString)))
|
{
|
metaphone.append("KN");
|
pos += 2;
|
break;
|
}
|
else
|
{
|
if ((! hasSubstring(valueString, pos+2, "EY")) &&
|
(! isSlavoGermanic(valueString)))
|
{
|
metaphone.append("N");
|
}
|
else
|
{
|
metaphone.append("KN");
|
}
|
|
pos += 2;
|
break;
|
}
|
}
|
|
|
// GLI as in tagliaro will be mapped to "KL".
|
if ((posPlusOne == 'L') && (valueString.charAt(pos+2) == 'I'))
|
{
|
metaphone.append("KL");
|
pos += 2;
|
break;
|
}
|
|
|
// Forms of GY, GE, and GI at the beginning of a word will map to 'K'.
|
if ((pos == 0) &&
|
((posPlusOne == 'Y') ||
|
(substring = valueString.substring(pos+1,pos+3)).equals("ES") ||
|
substring.equals("EP") || substring.equals("EB") ||
|
substring.equals("EL") || substring.equals("EY") ||
|
substring.equals("IB") || substring.equals("IL") ||
|
substring.equals("IN") || substring.equals("IE") ||
|
substring.equals("EI") || substring.equals("ER")))
|
{
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
|
|
// Some occurrences of GER and GY in a word will be mapped to 'K'.
|
posPlusTwo = valueString.charAt(pos+2);
|
if ((((posPlusOne == 'E') && (posPlusTwo == 'R')) ||
|
(posPlusOne == 'Y')) &&
|
((posMinusOne = valueString.charAt(pos-1)) != 'E') &&
|
(posMinusOne != 'I') &&
|
(! hasSubstring(valueString, 0, "DANGER")) &&
|
(! hasSubstring(valueString, 0, "RANGER")) &&
|
(! hasSubstring(valueString, 0, "MANGER")) &&
|
(! hasSubstring(valueString, pos-1, "RGY")) &&
|
(! hasSubstring(valueString, pos-1, "OGY")))
|
{
|
metaphone.append("K");
|
pos += 2;
|
break;
|
}
|
|
|
// Check for Italian uses like 'biaggi" and map to 'J'.
|
if ((posPlusOne == 'E') || (posPlusOne == 'I') ||
|
(posPlusOne == 'Y') ||
|
hasSubstring(valueString, pos-1, "AGGI") ||
|
hasSubstring(valueString, pos-1, "OGGI"))
|
{
|
// Germanic uses will be mapped to 'K'.
|
if (isGermanic(valueString) ||
|
hasSubstring(valueString, pos+1, "ET"))
|
{
|
metaphone.append("K");
|
}
|
else
|
{
|
metaphone.append("J");
|
}
|
|
pos += 2;
|
break;
|
}
|
|
|
// All other cases will be mapped to 'K'. If there is a double G,
|
// then skip two. Otherwise, just skip one.
|
metaphone.append("K");
|
pos++;
|
|
if (posPlusOne == 'G')
|
{
|
pos++;
|
}
|
|
break;
|
|
|
case 'H':
|
// The letter 'H' will only be processed if it is immediately followed
|
// by a vowel and is either the start of the word or preceded by a
|
// vowel.
|
if (isVowel(valueString.charAt(pos+1)))
|
{
|
if ((pos == 0) || isVowel(valueString.charAt(pos-1)))
|
{
|
metaphone.append("H");
|
pos++;
|
}
|
}
|
|
pos++;
|
break;
|
|
|
case 'J':
|
// Take care of obvious Spanish uses that should map to 'H'.
|
if (hasSubstring(valueString, 0, "SAN "))
|
{
|
metaphone.append("H");
|
pos++;
|
break;
|
}
|
|
if (hasSubstring(valueString, pos, "JOSE"))
|
{
|
if ((pos == 0) && (valueString.charAt(pos+4) == ' '))
|
{
|
metaphone.append("H");
|
}
|
else
|
{
|
metaphone.append("J");
|
}
|
|
pos++;
|
break;
|
}
|
|
|
// All other cases will be mapped to 'J'.
|
metaphone.append("J");
|
|
if (valueString.charAt(pos+1) == 'J')
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'K':
|
// 'K' will always be mapped to 'K'. KK will be treated like K.
|
metaphone.append("K");
|
|
if (valueString.charAt(pos+1) == 'K')
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'L':
|
// 'L' will always be mapped to 'L'. LL will be treated like L, even
|
// for potential Spanish uses.
|
metaphone.append("L");
|
|
if (valueString.charAt(pos+1) == 'L')
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'M':
|
// 'M' will always be mapped to 'M'. MM will be treated like M.
|
// UMB in cases like "dumb" and "thumb" will be treated like M.
|
metaphone.append("M");
|
|
if (valueString.charAt(pos+1) == 'M')
|
{
|
pos++;
|
}
|
else if (hasSubstring(valueString, pos-1, "UMB"))
|
{
|
if (((pos+1) == last) ||
|
hasSubstring(valueString, pos+2, "ER"))
|
{
|
pos++;
|
}
|
}
|
|
pos++;
|
break;
|
|
|
case 'N':
|
// 'N' will always be mapped to 'N'. NN will be treated like N.
|
metaphone.append("N");
|
|
if (valueString.charAt(pos+1) == 'N')
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'P':
|
// PH will be mapped to 'F'.
|
if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
|
{
|
metaphone.append("F");
|
pos += 2;
|
break;
|
}
|
|
|
// All other cases will be mapped to 'P', with PP and PB being treated
|
// like P.
|
metaphone.append("P");
|
|
if ((posPlusOne == 'P') || (posPlusOne == 'B'))
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'Q':
|
// 'Q' will always be mapped to 'K'. QQ will be treated like Q.
|
metaphone.append("K");
|
|
if (valueString.charAt(pos+1) == 'Q')
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'R':
|
// Ignore R at the end of French words.
|
if ((pos == last) && (! isSlavoGermanic(valueString)) &&
|
hasSubstring(valueString, pos-2, "IE") &&
|
(! hasSubstring(valueString, pos-4, "ME")) &&
|
(! hasSubstring(valueString, pos-4, "MA")))
|
{
|
pos++;
|
break;
|
}
|
|
|
// All other cases will be mapped to 'R', with RR treated like R.
|
metaphone.append("R");
|
|
if (valueString.charAt(pos+1) == 'R')
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'S':
|
// Special cases like isle and carlysle will be silent.
|
if (hasSubstring(valueString, pos-1, "ISL") ||
|
hasSubstring(valueString, pos-1, "YSL"))
|
{
|
pos++;
|
break;
|
}
|
|
|
// Special case of sugar mapped to 'X'.
|
if (hasSubstring(valueString, pos+1, "UGAR"))
|
{
|
metaphone.append("X");
|
pos++;
|
break;
|
}
|
|
|
// SH is generally mapped to 'X', but not in Germanic cases.
|
if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
|
{
|
if (hasSubstring(valueString, pos+1, "HEIM") ||
|
hasSubstring(valueString, pos+1, "HOEK") ||
|
hasSubstring(valueString, pos+1, "HOLM") ||
|
hasSubstring(valueString, pos+1, "HOLZ"))
|
{
|
metaphone.append("S");
|
}
|
else
|
{
|
metaphone.append("X");
|
}
|
|
pos += 2;
|
break;
|
}
|
|
|
// Italian and Armenian cases will map to "S".
|
if (hasSubstring(valueString, pos+1, "IO") ||
|
hasSubstring(valueString, pos+1, "IA"))
|
{
|
metaphone.append("S");
|
pos += 3;
|
break;
|
}
|
|
|
// SZ should be mapped to 'S'.
|
if (posPlusOne == 'Z')
|
{
|
metaphone.append("S");
|
pos += 2;
|
break;
|
}
|
|
|
// Various combinations at the beginning of words will be mapped to
|
// 'S'.
|
if ((pos == 0) &&
|
((posPlusOne == 'M') || (posPlusOne == 'N') ||
|
(posPlusOne == 'L') || (posPlusOne == 'W')))
|
{
|
metaphone.append("S");
|
pos++;
|
break;
|
}
|
|
|
// SC should be mapped to either SK, X, or S.
|
if (posPlusOne == 'C')
|
{
|
if ((posPlusTwo = valueString.charAt(pos+2)) == 'H')
|
{
|
if (hasSubstring(valueString, pos+3, "OO") ||
|
hasSubstring(valueString, pos+3, "UY") ||
|
hasSubstring(valueString, pos+3, "ED") ||
|
hasSubstring(valueString, pos+3, "EM"))
|
{
|
metaphone.append("SK");
|
}
|
else
|
{
|
metaphone.append("X");
|
}
|
|
pos += 3;
|
break;
|
}
|
|
if ((posPlusTwo == 'I') || (posPlusTwo == 'E') ||
|
(posPlusTwo == 'Y'))
|
{
|
metaphone.append("S");
|
pos += 3;
|
break;
|
}
|
|
metaphone.append("SK");
|
pos += 3;
|
break;
|
}
|
|
|
// Ignore a trailing S in French words. All others will be mapped to
|
// 'S'.
|
if (! ((pos == last) &&
|
(hasSubstring(valueString, pos-2, "AI") ||
|
hasSubstring(valueString, pos-2, "OI"))))
|
{
|
metaphone.append("S");
|
}
|
|
if ((posPlusOne == 'S') || (posPlusOne == 'Z'))
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'T':
|
// "TION", "TIA", and "TCH" will be mapped to 'X'.
|
if (hasSubstring(valueString, pos, "TION") ||
|
hasSubstring(valueString, pos, "TIA") ||
|
hasSubstring(valueString, pos, "TCH"))
|
{
|
metaphone.append("X");
|
pos += 3;
|
break;
|
}
|
|
|
// TH or TTH will be mapped to either T (for Germanic cases) or
|
// 0 (zero) for the rest.
|
if (((posPlusOne = valueString.charAt(pos+1)) == 'H') ||
|
((posPlusOne == 'T') && (valueString.charAt(pos+2) == 'H')))
|
{
|
if (isGermanic(valueString) ||
|
hasSubstring(valueString, pos+2, "OM") ||
|
hasSubstring(valueString, pos+2, "AM"))
|
{
|
metaphone.append("T");
|
}
|
else
|
{
|
metaphone.append("0");
|
}
|
|
pos += 2;
|
break;
|
}
|
|
|
// All other cases will map to T, with TT and TD being treated like T.
|
metaphone.append("T");
|
|
if ((posPlusOne == 'T') || (posPlusOne == 'D'))
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'V':
|
// 'V' will always be mapped to 'F', with VV treated like V.
|
metaphone.append("F");
|
|
if (valueString.charAt(pos+1) == 'V')
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'W':
|
// WR should always map to R.
|
if ((posPlusOne = valueString.charAt(pos+1)) == 'R')
|
{
|
metaphone.append("R");
|
pos += 2;
|
break;
|
}
|
|
|
// W[AEIOUYH] at the beginning of the word should be mapped to A.
|
if ((pos == 0) && (isVowel(posPlusOne) || (posPlusOne == 'H')))
|
{
|
metaphone.append("A");
|
|
// FIXME -- This isn't in the algorithm as written. Should it be?
|
pos += 2;
|
break;
|
}
|
|
|
// A Polish value like WICZ or WITZ should be mapped to TS.
|
if (hasSubstring(valueString, pos+1, "WICZ") ||
|
hasSubstring(valueString, pos+1, "WITZ"))
|
{
|
metaphone.append("TS");
|
pos += 4;
|
break;
|
}
|
|
|
// Otherwise, we'll just skip it.
|
pos++;
|
break;
|
|
|
case 'X':
|
// X maps to KS except at the end of French words.
|
if (! ((pos == last) &&
|
(hasSubstring(valueString, pos-3, "IAU") ||
|
hasSubstring(valueString, pos-3, "EAU") ||
|
hasSubstring(valueString, pos-2, "AU") ||
|
hasSubstring(valueString, pos-2, "OU"))))
|
{
|
metaphone.append("KS");
|
}
|
|
if (((posPlusOne = valueString.charAt(pos+1)) == 'C') ||
|
(posPlusOne == 'X'))
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case 'Z':
|
// Chinese usages like zhao will map to J.
|
if ((posPlusOne = valueString.charAt(pos+1)) == 'H')
|
{
|
metaphone.append("J");
|
pos += 2;
|
break;
|
}
|
|
|
// All other cases map to "S". ZZ will be treated like Z.
|
metaphone.append("S");
|
|
if (posPlusOne == 'Z')
|
{
|
pos++;
|
}
|
|
pos++;
|
break;
|
|
|
case '\u00C7': // C with a cedilla
|
// This will always be mapped to 'S'.
|
metaphone.append("S");
|
pos++;
|
break;
|
|
|
case '\u00D1': // N with a tilde
|
// This will always be mapped to 'N'.
|
metaphone.append("N");
|
pos++;
|
break;
|
|
|
default:
|
// We don't have any special treatment for this character, so skip it.
|
pos++;
|
break;
|
}
|
}
|
|
|
return new ASN1OctetString(metaphone.toString());
|
}
|
|
|
|
/**
|
* Indicates whether the two provided normalized values are approximately
|
* equal to each other.
|
*
|
* @param value1 The normalized form of the first value to compare.
|
* @param value2 The normalized form of the second value to compare.
|
*
|
* @return <CODE>true</CODE> if the provided values are approximately equal,
|
* or <CODE>false</CODE> if not.
|
*/
|
public boolean approximatelyMatch(ByteString value1, ByteString value2)
|
{
|
// If the values have been normalized, then we just need to compare their
|
// byte arrays.
|
return Arrays.equals(value1.value(), value2.value());
|
}
|
|
|
|
/**
|
* Indicates whether the provided value has the given substring at the
|
* specified position.
|
*
|
* @param value The value containing the range for which to make the
|
* determination.
|
* @param start The position in the value at which to start the
|
* comparison.
|
* @param substring The substring to compare against the specified value
|
* range.
|
*
|
* @return <CODE>true</CODE> if the specified portion of the value matches
|
* the given substring, or <CODE>false</CODE> if it does not.
|
*/
|
private boolean hasSubstring(String value, int start,
|
String substring)
|
{
|
try
|
{
|
// This can happen since a lot of the rules "look behind" and
|
// rightfully don't check if it's the first character
|
if (start < 0) {
|
return false;
|
}
|
|
int end = start + substring.length();
|
|
// value isn't big enough to do the comparison
|
if (end > value.length())
|
{
|
return false;
|
}
|
|
for (int i=0,pos=start; pos < end; i++,pos++)
|
{
|
if (value.charAt(pos) != substring.charAt(i))
|
{
|
return false;
|
}
|
}
|
|
return true;
|
}
|
catch (Exception e)
|
{
|
if (debugEnabled())
|
{
|
TRACER.debugCaught(DebugLogLevel.ERROR, e);
|
}
|
|
return false;
|
}
|
}
|
|
|
|
/**
|
* Indicates whether the provided character is a vowel (including "Y").
|
*
|
* @param c The character for which to make the determination.
|
*
|
* @return <CODE>true</CODE> if the provided character is a vowel, or
|
* <CODE>false</CODE> if not.
|
*/
|
private boolean isVowel(char c)
|
{
|
switch (c)
|
{
|
case 'A':
|
case 'E':
|
case 'I':
|
case 'O':
|
case 'U':
|
case 'Y':
|
return true;
|
|
default:
|
return false;
|
}
|
}
|
|
|
|
/**
|
* Indicates whether the provided string appears to be Slavo-Germanic.
|
*
|
* @param s The string for which to make the determination.
|
*
|
* @return <CODE>true</CODE> if the provided string appears to be
|
* Slavo-Germanic, or <CODE>false</CODE> if not.
|
*/
|
private boolean isSlavoGermanic(String s)
|
{
|
return (s.contains("W") || s.contains("K") || s.contains("CZ") ||
|
s.contains("WITZ"));
|
}
|
|
|
|
/**
|
* Indicates whether the provided string appears Germanic (starts with "VAN ",
|
* "VON ", or "SCH").
|
*
|
* @param s The string for which to make the determination.
|
*
|
* @return <CODE>true</CODE> if the provided string appears Germanic, or
|
* <CODE>false</CODE> if not.
|
*/
|
private boolean isGermanic(String s)
|
{
|
return (s.startsWith("VAN ") || s.startsWith("VON ") ||
|
s.startsWith("SCH"));
|
}
|
}
|