1/*
2 * Copyright 2001-2004 The Apache Software Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package org.apache.commons.codec.language;
18
19import org.apache.commons.codec.EncoderException;
20import org.apache.commons.codec.StringEncoder;
21
22/**
23 * Encodes a string into a metaphone value.
24 * <p>
25 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
26 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
27 * </p>
28 * <p>
29 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
30 * 39.</CITE>
31 * </p>
32 *
33 * @author Apache Software Foundation
34 * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $
35 *
36 * @deprecated Please use {@link java.net.URL#openConnection} instead.
37 *     Please visit <a href="http://android-developers.blogspot.com/2011/09/androids-http-clients.html">this webpage</a>
38 *     for further details.
39 */
40@Deprecated
41public class Metaphone implements StringEncoder {
42
43    /**
44     * Five values in the English language
45     */
46    private String vowels = "AEIOU" ;
47
48    /**
49     * Variable used in Metaphone algorithm
50     */
51    private String frontv = "EIY"   ;
52
53    /**
54     * Variable used in Metaphone algorithm
55     */
56    private String varson = "CSPTG" ;
57
58    /**
59     * The max code length for metaphone is 4
60     */
61    private int maxCodeLen = 4 ;
62
63    /**
64     * Creates an instance of the Metaphone encoder
65     */
66    public Metaphone() {
67        super();
68    }
69
70    /**
71     * Find the metaphone value of a String. This is similar to the
72     * soundex algorithm, but better at finding similar sounding words.
73     * All input is converted to upper case.
74     * Limitations: Input format is expected to be a single ASCII word
75     * with only characters in the A - Z range, no punctuation or numbers.
76     *
77     * @param txt String to find the metaphone code for
78     * @return A metaphone code corresponding to the String supplied
79     */
80    public String metaphone(String txt) {
81        boolean hard = false ;
82        if ((txt == null) || (txt.length() == 0)) {
83            return "" ;
84        }
85        // single character is itself
86        if (txt.length() == 1) {
87            return txt.toUpperCase() ;
88        }
89
90        char[] inwd = txt.toUpperCase().toCharArray() ;
91
92        StringBuffer local = new StringBuffer(40); // manipulate
93        StringBuffer code = new StringBuffer(10) ; //   output
94        // handle initial 2 characters exceptions
95        switch(inwd[0]) {
96        case 'K' :
97        case 'G' :
98        case 'P' : /* looking for KN, etc*/
99            if (inwd[1] == 'N') {
100                local.append(inwd, 1, inwd.length - 1);
101            } else {
102                local.append(inwd);
103            }
104            break;
105        case 'A': /* looking for AE */
106            if (inwd[1] == 'E') {
107                local.append(inwd, 1, inwd.length - 1);
108            } else {
109                local.append(inwd);
110            }
111            break;
112        case 'W' : /* looking for WR or WH */
113            if (inwd[1] == 'R') {   // WR -> R
114                local.append(inwd, 1, inwd.length - 1);
115                break ;
116            }
117            if (inwd[1] == 'H') {
118                local.append(inwd, 1, inwd.length - 1);
119                local.setCharAt(0, 'W'); // WH -> W
120            } else {
121                local.append(inwd);
122            }
123            break;
124        case 'X' : /* initial X becomes S */
125            inwd[0] = 'S';
126            local.append(inwd);
127            break ;
128        default :
129            local.append(inwd);
130        } // now local has working string with initials fixed
131
132        int wdsz = local.length();
133        int n = 0 ;
134
135        while ((code.length() < this.getMaxCodeLen()) &&
136               (n < wdsz) ) { // max code size of 4 works well
137            char symb = local.charAt(n) ;
138            // remove duplicate letters except C
139            if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
140                n++ ;
141            } else { // not dup
142                switch(symb) {
143                case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
144                    if (n == 0) {
145                        code.append(symb);
146                    }
147                    break ; // only use vowel if leading char
148                case 'B' :
149                    if ( isPreviousChar(local, n, 'M') &&
150                         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
151                        break;
152                    }
153                    code.append(symb);
154                    break;
155                case 'C' : // lots of C special cases
156                    /* discard if SCI, SCE or SCY */
157                    if ( isPreviousChar(local, n, 'S') &&
158                         !isLastChar(wdsz, n) &&
159                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) {
160                        break;
161                    }
162                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
163                        code.append('X');
164                        break;
165                    }
166                    if (!isLastChar(wdsz, n) &&
167                        (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
168                        code.append('S');
169                        break; // CI,CE,CY -> S
170                    }
171                    if (isPreviousChar(local, n, 'S') &&
172                        isNextChar(local, n, 'H') ) { // SCH->sk
173                        code.append('K') ;
174                        break ;
175                    }
176                    if (isNextChar(local, n, 'H')) { // detect CH
177                        if ((n == 0) &&
178                            (wdsz >= 3) &&
179                            isVowel(local,2) ) { // CH consonant -> K consonant
180                            code.append('K');
181                        } else {
182                            code.append('X'); // CHvowel -> X
183                        }
184                    } else {
185                        code.append('K');
186                    }
187                    break ;
188                case 'D' :
189                    if (!isLastChar(wdsz, n + 1) &&
190                        isNextChar(local, n, 'G') &&
191                        (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
192                        code.append('J'); n += 2 ;
193                    } else {
194                        code.append('T');
195                    }
196                    break ;
197                case 'G' : // GH silent at end or before consonant
198                    if (isLastChar(wdsz, n + 1) &&
199                        isNextChar(local, n, 'H')) {
200                        break;
201                    }
202                    if (!isLastChar(wdsz, n + 1) &&
203                        isNextChar(local,n,'H') &&
204                        !isVowel(local,n+2)) {
205                        break;
206                    }
207                    if ((n > 0) &&
208                        ( regionMatch(local, n, "GN") ||
209                          regionMatch(local, n, "GNED") ) ) {
210                        break; // silent G
211                    }
212                    if (isPreviousChar(local, n, 'G')) {
213                        hard = true ;
214                    } else {
215                        hard = false ;
216                    }
217                    if (!isLastChar(wdsz, n) &&
218                        (this.frontv.indexOf(local.charAt(n + 1)) >= 0) &&
219                        (!hard)) {
220                        code.append('J');
221                    } else {
222                        code.append('K');
223                    }
224                    break ;
225                case 'H':
226                    if (isLastChar(wdsz, n)) {
227                        break ; // terminal H
228                    }
229                    if ((n > 0) &&
230                        (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
231                        break;
232                    }
233                    if (isVowel(local,n+1)) {
234                        code.append('H'); // Hvowel
235                    }
236                    break;
237                case 'F':
238                case 'J' :
239                case 'L' :
240                case 'M':
241                case 'N' :
242                case 'R' :
243                    code.append(symb);
244                    break;
245                case 'K' :
246                    if (n > 0) { // not initial
247                        if (!isPreviousChar(local, n, 'C')) {
248                            code.append(symb);
249                        }
250                    } else {
251                        code.append(symb); // initial K
252                    }
253                    break ;
254                case 'P' :
255                    if (isNextChar(local,n,'H')) {
256                        // PH -> F
257                        code.append('F');
258                    } else {
259                        code.append(symb);
260                    }
261                    break ;
262                case 'Q' :
263                    code.append('K');
264                    break;
265                case 'S' :
266                    if (regionMatch(local,n,"SH") ||
267                        regionMatch(local,n,"SIO") ||
268                        regionMatch(local,n,"SIA")) {
269                        code.append('X');
270                    } else {
271                        code.append('S');
272                    }
273                    break;
274                case 'T' :
275                    if (regionMatch(local,n,"TIA") ||
276                        regionMatch(local,n,"TIO")) {
277                        code.append('X');
278                        break;
279                    }
280                    if (regionMatch(local,n,"TCH")) {
281                        // Silent if in "TCH"
282                        break;
283                    }
284                    // substitute numeral 0 for TH (resembles theta after all)
285                    if (regionMatch(local,n,"TH")) {
286                        code.append('0');
287                    } else {
288                        code.append('T');
289                    }
290                    break ;
291                case 'V' :
292                    code.append('F'); break ;
293                case 'W' : case 'Y' : // silent if not followed by vowel
294                    if (!isLastChar(wdsz,n) &&
295                        isVowel(local,n+1)) {
296                        code.append(symb);
297                    }
298                    break ;
299                case 'X' :
300                    code.append('K'); code.append('S');
301                    break ;
302                case 'Z' :
303                    code.append('S'); break ;
304                } // end switch
305                n++ ;
306            } // end else from symb != 'C'
307            if (code.length() > this.getMaxCodeLen()) {
308                code.setLength(this.getMaxCodeLen());
309            }
310        }
311        return code.toString();
312    }
313
314    private boolean isVowel(StringBuffer string, int index) {
315        return (this.vowels.indexOf(string.charAt(index)) >= 0);
316    }
317
318    private boolean isPreviousChar(StringBuffer string, int index, char c) {
319        boolean matches = false;
320        if( index > 0 &&
321            index < string.length() ) {
322            matches = string.charAt(index - 1) == c;
323        }
324        return matches;
325    }
326
327    private boolean isNextChar(StringBuffer string, int index, char c) {
328        boolean matches = false;
329        if( index >= 0 &&
330            index < string.length() - 1 ) {
331            matches = string.charAt(index + 1) == c;
332        }
333        return matches;
334    }
335
336    private boolean regionMatch(StringBuffer string, int index, String test) {
337        boolean matches = false;
338        if( index >= 0 &&
339            (index + test.length() - 1) < string.length() ) {
340            String substring = string.substring( index, index + test.length());
341            matches = substring.equals( test );
342        }
343        return matches;
344    }
345
346    private boolean isLastChar(int wdsz, int n) {
347        return n + 1 == wdsz;
348    }
349
350
351    /**
352     * Encodes an Object using the metaphone algorithm.  This method
353     * is provided in order to satisfy the requirements of the
354     * Encoder interface, and will throw an EncoderException if the
355     * supplied object is not of type java.lang.String.
356     *
357     * @param pObject Object to encode
358     * @return An object (or type java.lang.String) containing the
359     *         metaphone code which corresponds to the String supplied.
360     * @throws EncoderException if the parameter supplied is not
361     *                          of type java.lang.String
362     */
363    public Object encode(Object pObject) throws EncoderException {
364        if (!(pObject instanceof java.lang.String)) {
365            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
366        }
367        return metaphone((String) pObject);
368    }
369
370    /**
371     * Encodes a String using the Metaphone algorithm.
372     *
373     * @param pString String object to encode
374     * @return The metaphone code corresponding to the String supplied
375     */
376    public String encode(String pString) {
377        return metaphone(pString);
378    }
379
380    /**
381     * Tests is the metaphones of two strings are identical.
382     *
383     * @param str1 First of two strings to compare
384     * @param str2 Second of two strings to compare
385     * @return true if the metaphones of these strings are identical,
386     *         false otherwise.
387     */
388    public boolean isMetaphoneEqual(String str1, String str2) {
389        return metaphone(str1).equals(metaphone(str2));
390    }
391
392    /**
393     * Returns the maxCodeLen.
394     * @return int
395     */
396    public int getMaxCodeLen() { return this.maxCodeLen; }
397
398    /**
399     * Sets the maxCodeLen.
400     * @param maxCodeLen The maxCodeLen to set
401     */
402    public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
403
404}
405