1/*
2 * Copyright 2001-2004 The Apache Software Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package org.apache.commons.codec.language;
18
19import org.apache.commons.codec.EncoderException;
20import org.apache.commons.codec.StringEncoder;
21
22/**
23 * Encodes a string into a metaphone value.
24 * <p>
25 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
26 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
27 * </p>
28 * <p>
29 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
30 * 39.</CITE>
31 * </p>
32 *
33 * @author Apache Software Foundation
34 * @version $Id: Metaphone.java,v 1.20 2004/06/05 18:32:04 ggregory Exp $
35 */
36public class Metaphone implements StringEncoder {
37
38    /**
39     * Five values in the English language
40     */
41    private String vowels = "AEIOU" ;
42
43    /**
44     * Variable used in Metaphone algorithm
45     */
46    private String frontv = "EIY"   ;
47
48    /**
49     * Variable used in Metaphone algorithm
50     */
51    private String varson = "CSPTG" ;
52
53    /**
54     * The max code length for metaphone is 4
55     */
56    private int maxCodeLen = 4 ;
57
58    /**
59     * Creates an instance of the Metaphone encoder
60     */
61    public Metaphone() {
62        super();
63    }
64
65    /**
66     * Find the metaphone value of a String. This is similar to the
67     * soundex algorithm, but better at finding similar sounding words.
68     * All input is converted to upper case.
69     * Limitations: Input format is expected to be a single ASCII word
70     * with only characters in the A - Z range, no punctuation or numbers.
71     *
72     * @param txt String to find the metaphone code for
73     * @return A metaphone code corresponding to the String supplied
74     */
75    public String metaphone(String txt) {
76        boolean hard = false ;
77        if ((txt == null) || (txt.length() == 0)) {
78            return "" ;
79        }
80        // single character is itself
81        if (txt.length() == 1) {
82            return txt.toUpperCase() ;
83        }
84
85        char[] inwd = txt.toUpperCase().toCharArray() ;
86
87        StringBuffer local = new StringBuffer(40); // manipulate
88        StringBuffer code = new StringBuffer(10) ; //   output
89        // handle initial 2 characters exceptions
90        switch(inwd[0]) {
91        case 'K' :
92        case 'G' :
93        case 'P' : /* looking for KN, etc*/
94            if (inwd[1] == 'N') {
95                local.append(inwd, 1, inwd.length - 1);
96            } else {
97                local.append(inwd);
98            }
99            break;
100        case 'A': /* looking for AE */
101            if (inwd[1] == 'E') {
102                local.append(inwd, 1, inwd.length - 1);
103            } else {
104                local.append(inwd);
105            }
106            break;
107        case 'W' : /* looking for WR or WH */
108            if (inwd[1] == 'R') {   // WR -> R
109                local.append(inwd, 1, inwd.length - 1);
110                break ;
111            }
112            if (inwd[1] == 'H') {
113                local.append(inwd, 1, inwd.length - 1);
114                local.setCharAt(0, 'W'); // WH -> W
115            } else {
116                local.append(inwd);
117            }
118            break;
119        case 'X' : /* initial X becomes S */
120            inwd[0] = 'S';
121            local.append(inwd);
122            break ;
123        default :
124            local.append(inwd);
125        } // now local has working string with initials fixed
126
127        int wdsz = local.length();
128        int n = 0 ;
129
130        while ((code.length() < this.getMaxCodeLen()) &&
131               (n < wdsz) ) { // max code size of 4 works well
132            char symb = local.charAt(n) ;
133            // remove duplicate letters except C
134            if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
135                n++ ;
136            } else { // not dup
137                switch(symb) {
138                case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
139                    if (n == 0) {
140                        code.append(symb);
141                    }
142                    break ; // only use vowel if leading char
143                case 'B' :
144                    if ( isPreviousChar(local, n, 'M') &&
145                         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
146                        break;
147                    }
148                    code.append(symb);
149                    break;
150                case 'C' : // lots of C special cases
151                    /* discard if SCI, SCE or SCY */
152                    if ( isPreviousChar(local, n, 'S') &&
153                         !isLastChar(wdsz, n) &&
154                         (this.frontv.indexOf(local.charAt(n + 1)) >= 0) ) {
155                        break;
156                    }
157                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
158                        code.append('X');
159                        break;
160                    }
161                    if (!isLastChar(wdsz, n) &&
162                        (this.frontv.indexOf(local.charAt(n + 1)) >= 0)) {
163                        code.append('S');
164                        break; // CI,CE,CY -> S
165                    }
166                    if (isPreviousChar(local, n, 'S') &&
167                        isNextChar(local, n, 'H') ) { // SCH->sk
168                        code.append('K') ;
169                        break ;
170                    }
171                    if (isNextChar(local, n, 'H')) { // detect CH
172                        if ((n == 0) &&
173                            (wdsz >= 3) &&
174                            isVowel(local,2) ) { // CH consonant -> K consonant
175                            code.append('K');
176                        } else {
177                            code.append('X'); // CHvowel -> X
178                        }
179                    } else {
180                        code.append('K');
181                    }
182                    break ;
183                case 'D' :
184                    if (!isLastChar(wdsz, n + 1) &&
185                        isNextChar(local, n, 'G') &&
186                        (this.frontv.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J
187                        code.append('J'); n += 2 ;
188                    } else {
189                        code.append('T');
190                    }
191                    break ;
192                case 'G' : // GH silent at end or before consonant
193                    if (isLastChar(wdsz, n + 1) &&
194                        isNextChar(local, n, 'H')) {
195                        break;
196                    }
197                    if (!isLastChar(wdsz, n + 1) &&
198                        isNextChar(local,n,'H') &&
199                        !isVowel(local,n+2)) {
200                        break;
201                    }
202                    if ((n > 0) &&
203                        ( regionMatch(local, n, "GN") ||
204                          regionMatch(local, n, "GNED") ) ) {
205                        break; // silent G
206                    }
207                    if (isPreviousChar(local, n, 'G')) {
208                        hard = true ;
209                    } else {
210                        hard = false ;
211                    }
212                    if (!isLastChar(wdsz, n) &&
213                        (this.frontv.indexOf(local.charAt(n + 1)) >= 0) &&
214                        (!hard)) {
215                        code.append('J');
216                    } else {
217                        code.append('K');
218                    }
219                    break ;
220                case 'H':
221                    if (isLastChar(wdsz, n)) {
222                        break ; // terminal H
223                    }
224                    if ((n > 0) &&
225                        (this.varson.indexOf(local.charAt(n - 1)) >= 0)) {
226                        break;
227                    }
228                    if (isVowel(local,n+1)) {
229                        code.append('H'); // Hvowel
230                    }
231                    break;
232                case 'F':
233                case 'J' :
234                case 'L' :
235                case 'M':
236                case 'N' :
237                case 'R' :
238                    code.append(symb);
239                    break;
240                case 'K' :
241                    if (n > 0) { // not initial
242                        if (!isPreviousChar(local, n, 'C')) {
243                            code.append(symb);
244                        }
245                    } else {
246                        code.append(symb); // initial K
247                    }
248                    break ;
249                case 'P' :
250                    if (isNextChar(local,n,'H')) {
251                        // PH -> F
252                        code.append('F');
253                    } else {
254                        code.append(symb);
255                    }
256                    break ;
257                case 'Q' :
258                    code.append('K');
259                    break;
260                case 'S' :
261                    if (regionMatch(local,n,"SH") ||
262                        regionMatch(local,n,"SIO") ||
263                        regionMatch(local,n,"SIA")) {
264                        code.append('X');
265                    } else {
266                        code.append('S');
267                    }
268                    break;
269                case 'T' :
270                    if (regionMatch(local,n,"TIA") ||
271                        regionMatch(local,n,"TIO")) {
272                        code.append('X');
273                        break;
274                    }
275                    if (regionMatch(local,n,"TCH")) {
276                        // Silent if in "TCH"
277                        break;
278                    }
279                    // substitute numeral 0 for TH (resembles theta after all)
280                    if (regionMatch(local,n,"TH")) {
281                        code.append('0');
282                    } else {
283                        code.append('T');
284                    }
285                    break ;
286                case 'V' :
287                    code.append('F'); break ;
288                case 'W' : case 'Y' : // silent if not followed by vowel
289                    if (!isLastChar(wdsz,n) &&
290                        isVowel(local,n+1)) {
291                        code.append(symb);
292                    }
293                    break ;
294                case 'X' :
295                    code.append('K'); code.append('S');
296                    break ;
297                case 'Z' :
298                    code.append('S'); break ;
299                } // end switch
300                n++ ;
301            } // end else from symb != 'C'
302            if (code.length() > this.getMaxCodeLen()) {
303                code.setLength(this.getMaxCodeLen());
304            }
305        }
306        return code.toString();
307    }
308
309    private boolean isVowel(StringBuffer string, int index) {
310        return (this.vowels.indexOf(string.charAt(index)) >= 0);
311    }
312
313    private boolean isPreviousChar(StringBuffer string, int index, char c) {
314        boolean matches = false;
315        if( index > 0 &&
316            index < string.length() ) {
317            matches = string.charAt(index - 1) == c;
318        }
319        return matches;
320    }
321
322    private boolean isNextChar(StringBuffer string, int index, char c) {
323        boolean matches = false;
324        if( index >= 0 &&
325            index < string.length() - 1 ) {
326            matches = string.charAt(index + 1) == c;
327        }
328        return matches;
329    }
330
331    private boolean regionMatch(StringBuffer string, int index, String test) {
332        boolean matches = false;
333        if( index >= 0 &&
334            (index + test.length() - 1) < string.length() ) {
335            String substring = string.substring( index, index + test.length());
336            matches = substring.equals( test );
337        }
338        return matches;
339    }
340
341    private boolean isLastChar(int wdsz, int n) {
342        return n + 1 == wdsz;
343    }
344
345
346    /**
347     * Encodes an Object using the metaphone algorithm.  This method
348     * is provided in order to satisfy the requirements of the
349     * Encoder interface, and will throw an EncoderException if the
350     * supplied object is not of type java.lang.String.
351     *
352     * @param pObject Object to encode
353     * @return An object (or type java.lang.String) containing the
354     *         metaphone code which corresponds to the String supplied.
355     * @throws EncoderException if the parameter supplied is not
356     *                          of type java.lang.String
357     */
358    public Object encode(Object pObject) throws EncoderException {
359        if (!(pObject instanceof java.lang.String)) {
360            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
361        }
362        return metaphone((String) pObject);
363    }
364
365    /**
366     * Encodes a String using the Metaphone algorithm.
367     *
368     * @param pString String object to encode
369     * @return The metaphone code corresponding to the String supplied
370     */
371    public String encode(String pString) {
372        return metaphone(pString);
373    }
374
375    /**
376     * Tests is the metaphones of two strings are identical.
377     *
378     * @param str1 First of two strings to compare
379     * @param str2 Second of two strings to compare
380     * @return true if the metaphones of these strings are identical,
381     *         false otherwise.
382     */
383    public boolean isMetaphoneEqual(String str1, String str2) {
384        return metaphone(str1).equals(metaphone(str2));
385    }
386
387    /**
388     * Returns the maxCodeLen.
389     * @return int
390     */
391    public int getMaxCodeLen() { return this.maxCodeLen; }
392
393    /**
394     * Sets the maxCodeLen.
395     * @param maxCodeLen The maxCodeLen to set
396     */
397    public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
398
399}
400