1/*
2 * Copyright 2001-2004 The Apache Software Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package org.apache.commons.codec.language;
18
19import org.apache.commons.codec.EncoderException;
20import org.apache.commons.codec.StringEncoder;
21
22/**
23 * Encodes a string into a double metaphone value.
24 * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>.
25 * <ul>
26 * <li>Original Article: <a
27 * href="http://www.cuj.com/documents/s=8038/cuj0006philips/">
28 * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li>
29 * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip">
30 * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li>
31 * </ul>
32 *
33 * @author Apache Software Foundation
34 * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $
35 */
36public class DoubleMetaphone implements StringEncoder {
37
    /**
     * "Vowels" to test for; note that 'Y' is included.
     */
    private static final String VOWELS = "AEIOUY";

    /**
     * Prefixes when present which are not pronounced
     */
    private static final String[] SILENT_START =
    { "GN", "KN", "PN", "WR", "PS" };
    /**
     * Single-character criteria tested against the character that follows
     * a "CH" pair in <code>conditionCH1</code>; the last entry is a space.
     */
    private static final String[] L_R_N_M_B_H_F_V_W_SPACE =
    { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
    /**
     * Two-character criteria tested against the characters that follow a
     * leading 'G' in <code>handleG</code>.
     */
    private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
    { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
    /**
     * Single-character criteria tested against the character that follows
     * a 'J' in <code>handleJ</code>.
     */
    private static final String[] L_T_K_S_N_M_B_Z =
    { "L", "T", "K", "S", "N", "M", "B", "Z" };

    /**
     * Maximum length of an encoding, default is 4. The value is handed to
     * each new <code>DoubleMetaphoneResult</code> via
     * {@link #getMaxCodeLen()}.
     */
    protected int maxCodeLen = 4;
59
60    /**
61     * Creates an instance of this DoubleMetaphone encoder
62     */
63    public DoubleMetaphone() {
64        super();
65    }
66
67    /**
68     * Encode a value with Double Metaphone
69     *
70     * @param value String to encode
71     * @return an encoded string
72     */
73    public String doubleMetaphone(String value) {
74        return doubleMetaphone(value, false);
75    }
76
77    /**
78     * Encode a value with Double Metaphone, optionally using the alternate
79     * encoding.
80     *
81     * @param value String to encode
82     * @param alternate use alternate encode
83     * @return an encoded string
84     */
85    public String doubleMetaphone(String value, boolean alternate) {
86        value = cleanInput(value);
87        if (value == null) {
88            return null;
89        }
90
91        boolean slavoGermanic = isSlavoGermanic(value);
92        int index = isSilentStart(value) ? 1 : 0;
93
94        DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen());
95
96        while (!result.isComplete() && index <= value.length() - 1) {
97            switch (value.charAt(index)) {
98            case 'A':
99            case 'E':
100            case 'I':
101            case 'O':
102            case 'U':
103            case 'Y':
104                index = handleAEIOUY(value, result, index);
105                break;
106            case 'B':
107                result.append('P');
108                index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1;
109                break;
110            case '\u00C7':
111                // A C with a Cedilla
112                result.append('S');
113                index++;
114                break;
115            case 'C':
116                index = handleC(value, result, index);
117                break;
118            case 'D':
119                index = handleD(value, result, index);
120                break;
121            case 'F':
122                result.append('F');
123                index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1;
124                break;
125            case 'G':
126                index = handleG(value, result, index, slavoGermanic);
127                break;
128            case 'H':
129                index = handleH(value, result, index);
130                break;
131            case 'J':
132                index = handleJ(value, result, index, slavoGermanic);
133                break;
134            case 'K':
135                result.append('K');
136                index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1;
137                break;
138            case 'L':
139                index = handleL(value, result, index);
140                break;
141            case 'M':
142                result.append('M');
143                index = conditionM0(value, index) ? index + 2 : index + 1;
144                break;
145            case 'N':
146                result.append('N');
147                index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1;
148                break;
149            case '\u00D1':
150                // N with a tilde (spanish ene)
151                result.append('N');
152                index++;
153                break;
154            case 'P':
155                index = handleP(value, result, index);
156                break;
157            case 'Q':
158                result.append('K');
159                index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
160                break;
161            case 'R':
162                index = handleR(value, result, index, slavoGermanic);
163                break;
164            case 'S':
165                index = handleS(value, result, index, slavoGermanic);
166                break;
167            case 'T':
168                index = handleT(value, result, index);
169                break;
170            case 'V':
171                result.append('F');
172                index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1;
173                break;
174            case 'W':
175                index = handleW(value, result, index);
176                break;
177            case 'X':
178                index = handleX(value, result, index);
179                break;
180            case 'Z':
181                index = handleZ(value, result, index, slavoGermanic);
182                break;
183            default:
184                index++;
185                break;
186            }
187        }
188
189        return alternate ? result.getAlternate() : result.getPrimary();
190    }
191
192    /**
193     * Encode the value using DoubleMetaphone.  It will only work if
194     * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>).
195     *
196     * @param obj Object to encode (should be of type String)
197     * @return An encoded Object (will be of type String)
198     * @throws EncoderException encode parameter is not of type String
199     */
200    public Object encode(Object obj) throws EncoderException {
201        if (!(obj instanceof String)) {
202            throw new EncoderException("DoubleMetaphone encode parameter is not of type String");
203        }
204        return doubleMetaphone((String) obj);
205    }
206
207    /**
208     * Encode the value using DoubleMetaphone.
209     *
210     * @param value String to encode
211     * @return An encoded String
212     */
213    public String encode(String value) {
214        return doubleMetaphone(value);
215    }
216
217    /**
218     * Check if the Double Metaphone values of two <code>String</code> values
219     * are equal.
220     *
221     * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
222     * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
223     * @return <code>true</code> if the encoded <code>String</code>s are equal;
224     *          <code>false</code> otherwise.
225     * @see #isDoubleMetaphoneEqual(String,String,boolean)
226     */
227    public boolean isDoubleMetaphoneEqual(String value1, String value2) {
228        return isDoubleMetaphoneEqual(value1, value2, false);
229    }
230
231    /**
232     * Check if the Double Metaphone values of two <code>String</code> values
233     * are equal, optionally using the alternate value.
234     *
235     * @param value1 The left-hand side of the encoded {@link String#equals(Object)}.
236     * @param value2 The right-hand side of the encoded {@link String#equals(Object)}.
237     * @param alternate use the alternate value if <code>true</code>.
238     * @return <code>true</code> if the encoded <code>String</code>s are equal;
239     *          <code>false</code> otherwise.
240     */
241    public boolean isDoubleMetaphoneEqual(String value1,
242                                          String value2,
243                                          boolean alternate) {
244        return doubleMetaphone(value1, alternate).equals(doubleMetaphone
245                                                         (value2, alternate));
246    }
247
248    /**
249     * Returns the maxCodeLen.
250     * @return int
251     */
252    public int getMaxCodeLen() {
253        return this.maxCodeLen;
254    }
255
256    /**
257     * Sets the maxCodeLen.
258     * @param maxCodeLen The maxCodeLen to set
259     */
260    public void setMaxCodeLen(int maxCodeLen) {
261        this.maxCodeLen = maxCodeLen;
262    }
263
264    //-- BEGIN HANDLERS --//
265
266    /**
267     * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases
268     */
269    private int handleAEIOUY(String value, DoubleMetaphoneResult result, int
270                             index) {
271        if (index == 0) {
272            result.append('A');
273        }
274        return index + 1;
275    }
276
277    /**
278     * Handles 'C' cases
279     */
280    private int handleC(String value,
281                        DoubleMetaphoneResult result,
282                        int index) {
283        if (conditionC0(value, index)) {  // very confusing, moved out
284            result.append('K');
285            index += 2;
286        } else if (index == 0 && contains(value, index, 6, "CAESAR")) {
287            result.append('S');
288            index += 2;
289        } else if (contains(value, index, 2, "CH")) {
290            index = handleCH(value, result, index);
291        } else if (contains(value, index, 2, "CZ") &&
292                   !contains(value, index - 2, 4, "WICZ")) {
293            //-- "Czerny" --//
294            result.append('S', 'X');
295            index += 2;
296        } else if (contains(value, index + 1, 3, "CIA")) {
297            //-- "focaccia" --//
298            result.append('X');
299            index += 3;
300        } else if (contains(value, index, 2, "CC") &&
301                   !(index == 1 && charAt(value, 0) == 'M')) {
302            //-- double "cc" but not "McClelland" --//
303            return handleCC(value, result, index);
304        } else if (contains(value, index, 2, "CK", "CG", "CQ")) {
305            result.append('K');
306            index += 2;
307        } else if (contains(value, index, 2, "CI", "CE", "CY")) {
308            //-- Italian vs. English --//
309            if (contains(value, index, 3, "CIO", "CIE", "CIA")) {
310                result.append('S', 'X');
311            } else {
312                result.append('S');
313            }
314            index += 2;
315        } else {
316            result.append('K');
317            if (contains(value, index + 1, 2, " C", " Q", " G")) {
318                //-- Mac Caffrey, Mac Gregor --//
319                index += 3;
320            } else if (contains(value, index + 1, 1, "C", "K", "Q") &&
321                       !contains(value, index + 1, 2, "CE", "CI")) {
322                index += 2;
323            } else {
324                index++;
325            }
326        }
327
328        return index;
329    }
330
331    /**
332     * Handles 'CC' cases
333     */
334    private int handleCC(String value,
335                         DoubleMetaphoneResult result,
336                         int index) {
337        if (contains(value, index + 2, 1, "I", "E", "H") &&
338            !contains(value, index + 2, 2, "HU")) {
339            //-- "bellocchio" but not "bacchus" --//
340            if ((index == 1 && charAt(value, index - 1) == 'A') ||
341                contains(value, index - 1, 5, "UCCEE", "UCCES")) {
342                //-- "accident", "accede", "succeed" --//
343                result.append("KS");
344            } else {
345                //-- "bacci", "bertucci", other Italian --//
346                result.append('X');
347            }
348            index += 3;
349        } else {    // Pierce's rule
350            result.append('K');
351            index += 2;
352        }
353
354        return index;
355    }
356
357    /**
358     * Handles 'CH' cases
359     */
360    private int handleCH(String value,
361                         DoubleMetaphoneResult result,
362                         int index) {
363        if (index > 0 && contains(value, index, 4, "CHAE")) {   // Michael
364            result.append('K', 'X');
365            return index + 2;
366        } else if (conditionCH0(value, index)) {
367            //-- Greek roots ("chemistry", "chorus", etc.) --//
368            result.append('K');
369            return index + 2;
370        } else if (conditionCH1(value, index)) {
371            //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
372            result.append('K');
373            return index + 2;
374        } else {
375            if (index > 0) {
376                if (contains(value, 0, 2, "MC")) {
377                    result.append('K');
378                } else {
379                    result.append('X', 'K');
380                }
381            } else {
382                result.append('X');
383            }
384            return index + 2;
385        }
386    }
387
388    /**
389     * Handles 'D' cases
390     */
391    private int handleD(String value,
392                        DoubleMetaphoneResult result,
393                        int index) {
394        if (contains(value, index, 2, "DG")) {
395            //-- "Edge" --//
396            if (contains(value, index + 2, 1, "I", "E", "Y")) {
397                result.append('J');
398                index += 3;
399                //-- "Edgar" --//
400            } else {
401                result.append("TK");
402                index += 2;
403            }
404        } else if (contains(value, index, 2, "DT", "DD")) {
405            result.append('T');
406            index += 2;
407        } else {
408            result.append('T');
409            index++;
410        }
411        return index;
412    }
413
414    /**
415     * Handles 'G' cases
416     */
417    private int handleG(String value,
418                        DoubleMetaphoneResult result,
419                        int index,
420                        boolean slavoGermanic) {
421        if (charAt(value, index + 1) == 'H') {
422            index = handleGH(value, result, index);
423        } else if (charAt(value, index + 1) == 'N') {
424            if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) {
425                result.append("KN", "N");
426            } else if (!contains(value, index + 2, 2, "EY") &&
427                       charAt(value, index + 1) != 'Y' && !slavoGermanic) {
428                result.append("N", "KN");
429            } else {
430                result.append("KN");
431            }
432            index = index + 2;
433        } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) {
434            result.append("KL", "L");
435            index += 2;
436        } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) {
437            //-- -ges-, -gep-, -gel-, -gie- at beginning --//
438            result.append('K', 'J');
439            index += 2;
440        } else if ((contains(value, index + 1, 2, "ER") ||
441                    charAt(value, index + 1) == 'Y') &&
442                   !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
443                   !contains(value, index - 1, 1, "E", "I") &&
444                   !contains(value, index - 1, 3, "RGY", "OGY")) {
445            //-- -ger-, -gy- --//
446            result.append('K', 'J');
447            index += 2;
448        } else if (contains(value, index + 1, 1, "E", "I", "Y") ||
449                   contains(value, index - 1, 4, "AGGI", "OGGI")) {
450            //-- Italian "biaggi" --//
451            if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) {
452                //-- obvious germanic --//
453                result.append('K');
454            } else if (contains(value, index + 1, 4, "IER")) {
455                result.append('J');
456            } else {
457                result.append('J', 'K');
458            }
459            index += 2;
460        } else if (charAt(value, index + 1) == 'G') {
461            index += 2;
462            result.append('K');
463        } else {
464            index++;
465            result.append('K');
466        }
467        return index;
468    }
469
470    /**
471     * Handles 'GH' cases
472     */
473    private int handleGH(String value,
474                         DoubleMetaphoneResult result,
475                         int index) {
476        if (index > 0 && !isVowel(charAt(value, index - 1))) {
477            result.append('K');
478            index += 2;
479        } else if (index == 0) {
480            if (charAt(value, index + 2) == 'I') {
481                result.append('J');
482            } else {
483                result.append('K');
484            }
485            index += 2;
486        } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) ||
487                   (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) ||
488                   (index > 3 && contains(value, index - 4, 1, "B", "H"))) {
489            //-- Parker's rule (with some further refinements) - "hugh"
490            index += 2;
491        } else {
492            if (index > 2 && charAt(value, index - 1) == 'U' &&
493                contains(value, index - 3, 1, "C", "G", "L", "R", "T")) {
494                //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
495                result.append('F');
496            } else if (index > 0 && charAt(value, index - 1) != 'I') {
497                result.append('K');
498            }
499            index += 2;
500        }
501        return index;
502    }
503
504    /**
505     * Handles 'H' cases
506     */
507    private int handleH(String value,
508                        DoubleMetaphoneResult result,
509                        int index) {
510        //-- only keep if first & before vowel or between 2 vowels --//
511        if ((index == 0 || isVowel(charAt(value, index - 1))) &&
512            isVowel(charAt(value, index + 1))) {
513            result.append('H');
514            index += 2;
515            //-- also takes car of "HH" --//
516        } else {
517            index++;
518        }
519        return index;
520    }
521
522    /**
523     * Handles 'J' cases
524     */
525    private int handleJ(String value, DoubleMetaphoneResult result, int index,
526                        boolean slavoGermanic) {
527        if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) {
528                //-- obvious Spanish, "Jose", "San Jacinto" --//
529                if ((index == 0 && (charAt(value, index + 4) == ' ') ||
530                     value.length() == 4) || contains(value, 0, 4, "SAN ")) {
531                    result.append('H');
532                } else {
533                    result.append('J', 'H');
534                }
535                index++;
536            } else {
537                if (index == 0 && !contains(value, index, 4, "JOSE")) {
538                    result.append('J', 'A');
539                } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic &&
540                              (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) {
541                    result.append('J', 'H');
542                } else if (index == value.length() - 1) {
543                    result.append('J', ' ');
544                } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) {
545                    result.append('J');
546                }
547
548                if (charAt(value, index + 1) == 'J') {
549                    index += 2;
550                } else {
551                    index++;
552                }
553            }
554        return index;
555    }
556
557    /**
558     * Handles 'L' cases
559     */
560    private int handleL(String value,
561                        DoubleMetaphoneResult result,
562                        int index) {
563        result.append('L');
564        if (charAt(value, index + 1) == 'L') {
565            if (conditionL0(value, index)) {
566                result.appendAlternate(' ');
567            }
568            index += 2;
569        } else {
570            index++;
571        }
572        return index;
573    }
574
575    /**
576     * Handles 'P' cases
577     */
578    private int handleP(String value,
579                        DoubleMetaphoneResult result,
580                        int index) {
581        if (charAt(value, index + 1) == 'H') {
582            result.append('F');
583            index += 2;
584        } else {
585            result.append('P');
586            index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
587        }
588        return index;
589    }
590
591    /**
592     * Handles 'R' cases
593     */
594    private int handleR(String value,
595                        DoubleMetaphoneResult result,
596                        int index,
597                        boolean slavoGermanic) {
598        if (index == value.length() - 1 && !slavoGermanic &&
599            contains(value, index - 2, 2, "IE") &&
600            !contains(value, index - 4, 2, "ME", "MA")) {
601            result.appendAlternate('R');
602        } else {
603            result.append('R');
604        }
605        return charAt(value, index + 1) == 'R' ? index + 2 : index + 1;
606    }
607
608    /**
609     * Handles 'S' cases
610     */
611    private int handleS(String value,
612                        DoubleMetaphoneResult result,
613                        int index,
614                        boolean slavoGermanic) {
615        if (contains(value, index - 1, 3, "ISL", "YSL")) {
616            //-- special cases "island", "isle", "carlisle", "carlysle" --//
617            index++;
618        } else if (index == 0 && contains(value, index, 5, "SUGAR")) {
619            //-- special case "sugar-" --//
620            result.append('X', 'S');
621            index++;
622        } else if (contains(value, index, 2, "SH")) {
623            if (contains(value, index + 1, 4,
624                         "HEIM", "HOEK", "HOLM", "HOLZ")) {
625                //-- germanic --//
626                result.append('S');
627            } else {
628                result.append('X');
629            }
630            index += 2;
631        } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) {
632            //-- Italian and Armenian --//
633            if (slavoGermanic) {
634                result.append('S');
635            } else {
636                result.append('S', 'X');
637            }
638            index += 3;
639        } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) {
640            //-- german & anglicisations, e.g. "smith" match "schmidt" //
641            // "snider" match "schneider" --//
642            //-- also, -sz- in slavic language altho in hungarian it //
643            //   is pronounced "s" --//
644            result.append('S', 'X');
645            index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
646        } else if (contains(value, index, 2, "SC")) {
647            index = handleSC(value, result, index);
648        } else {
649            if (index == value.length() - 1 && contains(value, index - 2,
650                                                        2, "AI", "OI")){
651                //-- french e.g. "resnais", "artois" --//
652                result.appendAlternate('S');
653            } else {
654                result.append('S');
655            }
656            index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
657        }
658        return index;
659    }
660
661    /**
662     * Handles 'SC' cases
663     */
664    private int handleSC(String value,
665                         DoubleMetaphoneResult result,
666                         int index) {
667        if (charAt(value, index + 2) == 'H') {
668            //-- Schlesinger's rule --//
669            if (contains(value, index + 3,
670                         2, "OO", "ER", "EN", "UY", "ED", "EM")) {
671                //-- Dutch origin, e.g. "school", "schooner" --//
672                if (contains(value, index + 3, 2, "ER", "EN")) {
673                    //-- "schermerhorn", "schenker" --//
674                    result.append("X", "SK");
675                } else {
676                    result.append("SK");
677                }
678            } else {
679                if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') {
680                    result.append('X', 'S');
681                } else {
682                    result.append('X');
683                }
684            }
685        } else if (contains(value, index + 2, 1, "I", "E", "Y")) {
686            result.append('S');
687        } else {
688            result.append("SK");
689        }
690        return index + 3;
691    }
692
693    /**
694     * Handles 'T' cases
695     */
696    private int handleT(String value,
697                        DoubleMetaphoneResult result,
698                        int index) {
699        if (contains(value, index, 4, "TION")) {
700            result.append('X');
701            index += 3;
702        } else if (contains(value, index, 3, "TIA", "TCH")) {
703            result.append('X');
704            index += 3;
705        } else if (contains(value, index, 2, "TH") || contains(value, index,
706                                                               3, "TTH")) {
707            if (contains(value, index + 2, 2, "OM", "AM") ||
708                //-- special case "thomas", "thames" or germanic --//
709                contains(value, 0, 4, "VAN ", "VON ") ||
710                contains(value, 0, 3, "SCH")) {
711                result.append('T');
712            } else {
713                result.append('0', 'T');
714            }
715            index += 2;
716        } else {
717            result.append('T');
718            index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
719        }
720        return index;
721    }
722
723    /**
724     * Handles 'W' cases
725     */
726    private int handleW(String value,
727                        DoubleMetaphoneResult result,
728                        int index) {
729        if (contains(value, index, 2, "WR")) {
730            //-- can also be in middle of word --//
731            result.append('R');
732            index += 2;
733        } else {
734            if (index == 0 && (isVowel(charAt(value, index + 1)) ||
735                               contains(value, index, 2, "WH"))) {
736                if (isVowel(charAt(value, index + 1))) {
737                    //-- Wasserman should match Vasserman --//
738                    result.append('A', 'F');
739                } else {
740                    //-- need Uomo to match Womo --//
741                    result.append('A');
742                }
743                index++;
744            } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) ||
745                       contains(value, index - 1,
746                                5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
747                       contains(value, 0, 3, "SCH")) {
748                //-- Arnow should match Arnoff --//
749                result.appendAlternate('F');
750                index++;
751            } else if (contains(value, index, 4, "WICZ", "WITZ")) {
752                //-- Polish e.g. "filipowicz" --//
753                result.append("TS", "FX");
754                index += 4;
755            } else {
756                index++;
757            }
758        }
759        return index;
760    }
761
762    /**
763     * Handles 'X' cases
764     */
765    private int handleX(String value,
766                        DoubleMetaphoneResult result,
767                        int index) {
768        if (index == 0) {
769            result.append('S');
770            index++;
771        } else {
772            if (!((index == value.length() - 1) &&
773                  (contains(value, index - 3, 3, "IAU", "EAU") ||
774                   contains(value, index - 2, 2, "AU", "OU")))) {
775                //-- French e.g. breaux --//
776                result.append("KS");
777            }
778            index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
779        }
780        return index;
781    }
782
783    /**
784     * Handles 'Z' cases
785     */
786    private int handleZ(String value, DoubleMetaphoneResult result, int index,
787                        boolean slavoGermanic) {
788        if (charAt(value, index + 1) == 'H') {
789            //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
790            result.append('J');
791            index += 2;
792        } else {
793            if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) {
794                result.append("S", "TS");
795            } else {
796                result.append('S');
797            }
798            index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
799        }
800        return index;
801    }
802
803    //-- BEGIN CONDITIONS --//
804
805    /**
806     * Complex condition 0 for 'C'
807     */
808    private boolean conditionC0(String value, int index) {
809        if (contains(value, index, 4, "CHIA")) {
810            return true;
811        } else if (index <= 1) {
812            return false;
813        } else if (isVowel(charAt(value, index - 2))) {
814            return false;
815        } else if (!contains(value, index - 1, 3, "ACH")) {
816            return false;
817        } else {
818            char c = charAt(value, index + 2);
819            return (c != 'I' && c != 'E')
820                    || contains(value, index - 2, 6, "BACHER", "MACHER");
821        }
822    }
823
824    /**
825     * Complex condition 0 for 'CH'
826     */
827    private boolean conditionCH0(String value, int index) {
828        if (index != 0) {
829            return false;
830        } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") &&
831                   !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) {
832            return false;
833        } else if (contains(value, 0, 5, "CHORE")) {
834            return false;
835        } else {
836            return true;
837        }
838    }
839
840    /**
841     * Complex condition 1 for 'CH'
842     */
843    private boolean conditionCH1(String value, int index) {
844        return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0,
845                                                                   3, "SCH")) ||
846                contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
847                contains(value, index + 2, 1, "T", "S") ||
848                ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
849                 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1)));
850    }
851
852    /**
853     * Complex condition 0 for 'L'
854     */
855    private boolean conditionL0(String value, int index) {
856        if (index == value.length() - 3 &&
857            contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) {
858            return true;
859        } else if ((contains(value, index - 1, 2, "AS", "OS") ||
860                    contains(value, value.length() - 1, 1, "A", "O")) &&
861                   contains(value, index - 1, 4, "ALLE")) {
862            return true;
863        } else {
864            return false;
865        }
866    }
867
868    /**
869     * Complex condition 0 for 'M'
870     */
871    private boolean conditionM0(String value, int index) {
872        if (charAt(value, index + 1) == 'M') {
873            return true;
874        }
875        return contains(value, index - 1, 3, "UMB")
876                && ((index + 1) == value.length() - 1 || contains(value,
877                        index + 2, 2, "ER"));
878    }
879
880    //-- BEGIN HELPER FUNCTIONS --//
881
882    /**
883     * Determines whether or not a value is of slavo-germanic orgin. A value is
884     * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'.
885     */
886    private boolean isSlavoGermanic(String value) {
887        return value.indexOf('W') > -1 || value.indexOf('K') > -1 ||
888            value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1;
889    }
890
891    /**
892     * Determines whether or not a character is a vowel or not
893     */
894    private boolean isVowel(char ch) {
895        return VOWELS.indexOf(ch) != -1;
896    }
897
898    /**
899     * Determines whether or not the value starts with a silent letter.  It will
900     * return <code>true</code> if the value starts with any of 'GN', 'KN',
901     * 'PN', 'WR' or 'PS'.
902     */
903    private boolean isSilentStart(String value) {
904        boolean result = false;
905        for (int i = 0; i < SILENT_START.length; i++) {
906            if (value.startsWith(SILENT_START[i])) {
907                result = true;
908                break;
909            }
910        }
911        return result;
912    }
913
914    /**
915     * Cleans the input
916     */
917    private String cleanInput(String input) {
918        if (input == null) {
919            return null;
920        }
921        input = input.trim();
922        if (input.length() == 0) {
923            return null;
924        }
925        return input.toUpperCase();
926    }
927
928    /**
929     * Gets the character at index <code>index</code> if available, otherwise
930     * it returns <code>Character.MIN_VALUE</code> so that there is some sort
931     * of a default
932     */
933    protected char charAt(String value, int index) {
934        if (index < 0 || index >= value.length()) {
935            return Character.MIN_VALUE;
936        }
937        return value.charAt(index);
938    }
939
940    /**
941     * Shortcut method with 1 criteria
942     */
943    private static boolean contains(String value, int start, int length,
944                                    String criteria) {
945        return contains(value, start, length,
946                        new String[] { criteria });
947    }
948
949    /**
950     * Shortcut method with 2 criteria
951     */
952    private static boolean contains(String value, int start, int length,
953                                    String criteria1, String criteria2) {
954        return contains(value, start, length,
955                        new String[] { criteria1, criteria2 });
956    }
957
958    /**
959     * Shortcut method with 3 criteria
960     */
961    private static boolean contains(String value, int start, int length,
962                                    String criteria1, String criteria2,
963                                    String criteria3) {
964        return contains(value, start, length,
965                        new String[] { criteria1, criteria2, criteria3 });
966    }
967
968    /**
969     * Shortcut method with 4 criteria
970     */
971    private static boolean contains(String value, int start, int length,
972                                    String criteria1, String criteria2,
973                                    String criteria3, String criteria4) {
974        return contains(value, start, length,
975                        new String[] { criteria1, criteria2, criteria3,
976                                       criteria4 });
977    }
978
979    /**
980     * Shortcut method with 5 criteria
981     */
982    private static boolean contains(String value, int start, int length,
983                                    String criteria1, String criteria2,
984                                    String criteria3, String criteria4,
985                                    String criteria5) {
986        return contains(value, start, length,
987                        new String[] { criteria1, criteria2, criteria3,
988                                       criteria4, criteria5 });
989    }
990
991    /**
992     * Shortcut method with 6 criteria
993     */
994    private static boolean contains(String value, int start, int length,
995                                    String criteria1, String criteria2,
996                                    String criteria3, String criteria4,
997                                    String criteria5, String criteria6) {
998        return contains(value, start, length,
999                        new String[] { criteria1, criteria2, criteria3,
1000                                       criteria4, criteria5, criteria6 });
1001    }
1002
1003    /**
1004     * Determines whether <code>value</code> contains any of the criteria
1005     starting
1006     * at index <code>start</code> and matching up to length <code>length</code>
1007     */
1008    protected static boolean contains(String value, int start, int length,
1009                                      String[] criteria) {
1010        boolean result = false;
1011        if (start >= 0 && start + length <= value.length()) {
1012            String target = value.substring(start, start + length);
1013
1014            for (int i = 0; i < criteria.length; i++) {
1015                if (target.equals(criteria[i])) {
1016                    result = true;
1017                    break;
1018                }
1019            }
1020        }
1021        return result;
1022    }
1023
1024    //-- BEGIN INNER CLASSES --//
1025
1026    /**
1027     * Inner class for storing results, since there is the optional alternate
1028     * encoding.
1029     */
1030    public class DoubleMetaphoneResult {
1031
1032        private StringBuffer primary = new StringBuffer(getMaxCodeLen());
1033        private StringBuffer alternate = new StringBuffer(getMaxCodeLen());
1034        private int maxLength;
1035
1036        public DoubleMetaphoneResult(int maxLength) {
1037            this.maxLength = maxLength;
1038        }
1039
1040        public void append(char value) {
1041            appendPrimary(value);
1042            appendAlternate(value);
1043        }
1044
1045        public void append(char primary, char alternate) {
1046            appendPrimary(primary);
1047            appendAlternate(alternate);
1048        }
1049
1050        public void appendPrimary(char value) {
1051            if (this.primary.length() < this.maxLength) {
1052                this.primary.append(value);
1053            }
1054        }
1055
1056        public void appendAlternate(char value) {
1057            if (this.alternate.length() < this.maxLength) {
1058                this.alternate.append(value);
1059            }
1060        }
1061
1062        public void append(String value) {
1063            appendPrimary(value);
1064            appendAlternate(value);
1065        }
1066
1067        public void append(String primary, String alternate) {
1068            appendPrimary(primary);
1069            appendAlternate(alternate);
1070        }
1071
1072        public void appendPrimary(String value) {
1073            int addChars = this.maxLength - this.primary.length();
1074            if (value.length() <= addChars) {
1075                this.primary.append(value);
1076            } else {
1077                this.primary.append(value.substring(0, addChars));
1078            }
1079        }
1080
1081        public void appendAlternate(String value) {
1082            int addChars = this.maxLength - this.alternate.length();
1083            if (value.length() <= addChars) {
1084                this.alternate.append(value);
1085            } else {
1086                this.alternate.append(value.substring(0, addChars));
1087            }
1088        }
1089
1090        public String getPrimary() {
1091            return this.primary.toString();
1092        }
1093
1094        public String getAlternate() {
1095            return this.alternate.toString();
1096        }
1097
1098        public boolean isComplete() {
1099            return this.primary.length() >= this.maxLength &&
1100                this.alternate.length() >= this.maxLength;
1101        }
1102    }
1103}
1104