/*
 * Copyright 2001-2004 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
17069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectpackage org.apache.commons.codec.language;
18069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
19069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectimport org.apache.commons.codec.EncoderException;
20069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectimport org.apache.commons.codec.StringEncoder;
21069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
22069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project/**
23069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * Encodes a string into a Refined Soundex value. A refined soundex code is
24069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * optimized for spell checking words. Soundex method originally developed by
25069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
26069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project *
27069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * @author Apache Software Foundation
28069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project * @version $Id: RefinedSoundex.java,v 1.21 2004/06/05 18:32:04 ggregory Exp $
29069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project */
30069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Projectpublic class RefinedSoundex implements StringEncoder {
31069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
32069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    /**
33069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * This static variable contains an instance of the RefinedSoundex using
34069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * the US_ENGLISH mapping.
35069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     */
36069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
37069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
38069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    /**
39069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * RefinedSoundex is *refined* for a number of reasons one being that the
40069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * mappings have been altered. This implementation contains default
41069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * mappings for US English.
42069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     */
43069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    public static final char[] US_ENGLISH_MAPPING = "01360240043788015936020505".toCharArray();
44069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
45069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    /**
46069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * Every letter of the alphabet is "mapped" to a numerical value. This char
47069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * array holds the values to which each letter is mapped. This
48069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * implementation contains a default map for US_ENGLISH
49069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     */
50069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    private char[] soundexMapping;
51069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
52069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    /**
53069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * Creates an instance of the RefinedSoundex object using the default US
54069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * English mapping.
55069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     */
56069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    public RefinedSoundex() {
57069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        this(US_ENGLISH_MAPPING);
58069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    }
59069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
60069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    /**
61069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * Creates a refined soundex instance using a custom mapping. This
62069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * constructor can be used to customize the mapping, and/or possibly
63069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * provide an internationalized mapping for a non-Western character set.
64069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *
65069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @param mapping
66069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *                  Mapping array to use when finding the corresponding code for
67069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *                  a given character
68069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     */
69069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    public RefinedSoundex(char[] mapping) {
70069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        this.soundexMapping = mapping;
71069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    }
72069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
73069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    // BEGIN android-note
74069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    // Removed @see reference to SoundexUtils below, since the class isn't
75069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    // public.
76069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    // END android-note
77069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    /**
78069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * Returns the number of characters in the two encoded Strings that are the
79069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * same. This return value ranges from 0 to the length of the shortest
80069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
81069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * example) indicates strong similarity or identical values. For refined
82069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * Soundex, the return value can be greater than 4.
83069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *
84069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @param s1
85069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *                  A String that will be encoded and compared.
86069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @param s2
87069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *                  A String that will be encoded and compared.
88069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @return The number of characters in the two encoded Strings that are the
89069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *             same from 0 to to the length of the shortest encoded String.
90069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *
91069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
92069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *          MS T-SQL DIFFERENCE</a>
93069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *
94069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @throws EncoderException
95069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *                  if an error occurs encoding one of the strings
96069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @since 1.3
97069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     */
98069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    public int difference(String s1, String s2) throws EncoderException {
99069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        return SoundexUtils.difference(this, s1, s2);
100069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    }
101069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
102069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    /**
103069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * Encodes an Object using the refined soundex algorithm. This method is
104069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * provided in order to satisfy the requirements of the Encoder interface,
105069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * and will throw an EncoderException if the supplied object is not of type
106069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * java.lang.String.
107069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *
108069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @param pObject
109069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *                  Object to encode
110069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @return An object (or type java.lang.String) containing the refined
111069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *             soundex code which corresponds to the String supplied.
112069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @throws EncoderException
113069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *                  if the parameter supplied is not of type java.lang.String
114069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     */
115069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    public Object encode(Object pObject) throws EncoderException {
116069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        if (!(pObject instanceof java.lang.String)) {
117069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project            throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
118069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        }
119069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        return soundex((String) pObject);
120069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    }
121069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
122069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    /**
123069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * Encodes a String using the refined soundex algorithm.
124069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *
125069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @param pString
126069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *                  A String object to encode
127069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @return A Soundex code corresponding to the String supplied
128069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     */
129069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    public String encode(String pString) {
130069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        return soundex(pString);
131069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    }
132069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
133069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    /**
134069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * Returns the mapping code for a given character. The mapping codes are
135069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * maintained in an internal char array named soundexMapping, and the
136069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * default values of these mappings are US English.
137069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *
138069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @param c
139069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *                  char to get mapping for
140069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @return A character (really a numeral) to return for the given char
141069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     */
142069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    char getMappingCode(char c) {
143069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        if (!Character.isLetter(c)) {
144069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project            return 0;
145069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        }
146069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        return this.soundexMapping[Character.toUpperCase(c) - 'A'];
147069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    }
148069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
149069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    /**
150069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * Retreives the Refined Soundex code for a given String object.
151069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *
152069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @param str
153069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     *                  String to encode using the Refined Soundex algorithm
154069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     * @return A soundex code for the String supplied
155069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project     */
156069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    public String soundex(String str) {
157069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        if (str == null) {
158069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project            return null;
159069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        }
160069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        str = SoundexUtils.clean(str);
161069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        if (str.length() == 0) {
162069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project            return str;
163069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        }
164069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
165069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        StringBuffer sBuf = new StringBuffer();
166069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        sBuf.append(str.charAt(0));
167069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
168069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        char last, current;
169069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        last = '*';
170069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
171069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        for (int i = 0; i < str.length(); i++) {
172069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
173069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project            current = getMappingCode(str.charAt(i));
174069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project            if (current == last) {
175069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project                continue;
176069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project            } else if (current != 0) {
177069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project                sBuf.append(current);
178069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project            }
179069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
180069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project            last = current;
181069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
182069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        }
183069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project
184069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project        return sBuf.toString();
185069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project    }
186069490a5ca2fd1988d29daf45d892f47ad665115The Android Open Source Project}
187