1/* 2 * Copyright 2001-2004 The Apache Software Foundation. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package org.apache.commons.codec.language; 18 19import org.apache.commons.codec.EncoderException; 20import org.apache.commons.codec.StringEncoder; 21 22/** 23 * Encodes a string into a double metaphone value. 24 * This Implementation is based on the algorithm by <CITE>Lawrence Philips</CITE>. 25 * <ul> 26 * <li>Original Article: <a 27 * href="http://www.cuj.com/documents/s=8038/cuj0006philips/"> 28 * http://www.cuj.com/documents/s=8038/cuj0006philips/</a></li> 29 * <li>Original Source Code: <a href="ftp://ftp.cuj.com/pub/2000/1806/philips.zip"> 30 * ftp://ftp.cuj.com/pub/2000/1806/philips.zip</a></li> 31 * </ul> 32 * 33 * @author Apache Software Foundation 34 * @version $Id: DoubleMetaphone.java,v 1.24 2004/06/05 18:32:04 ggregory Exp $ 35 */ 36public class DoubleMetaphone implements StringEncoder { 37 38 /** 39 * "Vowels" to test for 40 */ 41 private static final String VOWELS = "AEIOUY"; 42 43 /** 44 * Prefixes when present which are not pronounced 45 */ 46 private static final String[] SILENT_START = 47 { "GN", "KN", "PN", "WR", "PS" }; 48 private static final String[] L_R_N_M_B_H_F_V_W_SPACE = 49 { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; 50 private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = 51 { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; 52 private static final String[] L_T_K_S_N_M_B_Z = 53 { "L", "T", "K", "S", "N", "M", "B", "Z" }; 54 55 /** 56 * Maximum length of an encoding, default is 4 57 */ 58 protected int maxCodeLen = 4; 59 60 /** 61 * Creates an instance of this DoubleMetaphone encoder 62 */ 63 public DoubleMetaphone() { 64 super(); 65 } 66 67 /** 68 * Encode a value with Double Metaphone 69 * 70 * @param value String to encode 71 * @return an encoded string 72 */ 73 public String doubleMetaphone(String value) { 74 return doubleMetaphone(value, false); 75 } 76 77 /** 78 * Encode a value with Double Metaphone, optionally using the alternate 79 * encoding. 80 * 81 * @param value String to encode 82 * @param alternate use alternate encode 83 * @return an encoded string 84 */ 85 public String doubleMetaphone(String value, boolean alternate) { 86 value = cleanInput(value); 87 if (value == null) { 88 return null; 89 } 90 91 boolean slavoGermanic = isSlavoGermanic(value); 92 int index = isSilentStart(value) ? 1 : 0; 93 94 DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); 95 96 while (!result.isComplete() && index <= value.length() - 1) { 97 switch (value.charAt(index)) { 98 case 'A': 99 case 'E': 100 case 'I': 101 case 'O': 102 case 'U': 103 case 'Y': 104 index = handleAEIOUY(value, result, index); 105 break; 106 case 'B': 107 result.append('P'); 108 index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; 109 break; 110 case '\u00C7': 111 // A C with a Cedilla 112 result.append('S'); 113 index++; 114 break; 115 case 'C': 116 index = handleC(value, result, index); 117 break; 118 case 'D': 119 index = handleD(value, result, index); 120 break; 121 case 'F': 122 result.append('F'); 123 index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; 124 break; 125 case 'G': 126 index = handleG(value, result, index, slavoGermanic); 127 break; 128 case 'H': 129 index = handleH(value, result, index); 130 break; 131 case 'J': 132 index = handleJ(value, result, index, slavoGermanic); 133 break; 134 case 'K': 135 result.append('K'); 136 index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; 137 break; 138 case 'L': 139 index = handleL(value, result, index); 140 break; 141 case 'M': 142 result.append('M'); 143 index = conditionM0(value, index) ? index + 2 : index + 1; 144 break; 145 case 'N': 146 result.append('N'); 147 index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; 148 break; 149 case '\u00D1': 150 // N with a tilde (spanish ene) 151 result.append('N'); 152 index++; 153 break; 154 case 'P': 155 index = handleP(value, result, index); 156 break; 157 case 'Q': 158 result.append('K'); 159 index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; 160 break; 161 case 'R': 162 index = handleR(value, result, index, slavoGermanic); 163 break; 164 case 'S': 165 index = handleS(value, result, index, slavoGermanic); 166 break; 167 case 'T': 168 index = handleT(value, result, index); 169 break; 170 case 'V': 171 result.append('F'); 172 index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; 173 break; 174 case 'W': 175 index = handleW(value, result, index); 176 break; 177 case 'X': 178 index = handleX(value, result, index); 179 break; 180 case 'Z': 181 index = handleZ(value, result, index, slavoGermanic); 182 break; 183 default: 184 index++; 185 break; 186 } 187 } 188 189 return alternate ? result.getAlternate() : result.getPrimary(); 190 } 191 192 /** 193 * Encode the value using DoubleMetaphone. It will only work if 194 * <code>obj</code> is a <code>String</code> (like <code>Metaphone</code>). 195 * 196 * @param obj Object to encode (should be of type String) 197 * @return An encoded Object (will be of type String) 198 * @throws EncoderException encode parameter is not of type String 199 */ 200 public Object encode(Object obj) throws EncoderException { 201 if (!(obj instanceof String)) { 202 throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); 203 } 204 return doubleMetaphone((String) obj); 205 } 206 207 /** 208 * Encode the value using DoubleMetaphone. 209 * 210 * @param value String to encode 211 * @return An encoded String 212 */ 213 public String encode(String value) { 214 return doubleMetaphone(value); 215 } 216 217 /** 218 * Check if the Double Metaphone values of two <code>String</code> values 219 * are equal. 220 * 221 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 222 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 223 * @return <code>true</code> if the encoded <code>String</code>s are equal; 224 * <code>false</code> otherwise. 225 * @see #isDoubleMetaphoneEqual(String,String,boolean) 226 */ 227 public boolean isDoubleMetaphoneEqual(String value1, String value2) { 228 return isDoubleMetaphoneEqual(value1, value2, false); 229 } 230 231 /** 232 * Check if the Double Metaphone values of two <code>String</code> values 233 * are equal, optionally using the alternate value. 234 * 235 * @param value1 The left-hand side of the encoded {@link String#equals(Object)}. 236 * @param value2 The right-hand side of the encoded {@link String#equals(Object)}. 237 * @param alternate use the alternate value if <code>true</code>. 238 * @return <code>true</code> if the encoded <code>String</code>s are equal; 239 * <code>false</code> otherwise. 240 */ 241 public boolean isDoubleMetaphoneEqual(String value1, 242 String value2, 243 boolean alternate) { 244 return doubleMetaphone(value1, alternate).equals(doubleMetaphone 245 (value2, alternate)); 246 } 247 248 /** 249 * Returns the maxCodeLen. 250 * @return int 251 */ 252 public int getMaxCodeLen() { 253 return this.maxCodeLen; 254 } 255 256 /** 257 * Sets the maxCodeLen. 258 * @param maxCodeLen The maxCodeLen to set 259 */ 260 public void setMaxCodeLen(int maxCodeLen) { 261 this.maxCodeLen = maxCodeLen; 262 } 263 264 //-- BEGIN HANDLERS --// 265 266 /** 267 * Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases 268 */ 269 private int handleAEIOUY(String value, DoubleMetaphoneResult result, int 270 index) { 271 if (index == 0) { 272 result.append('A'); 273 } 274 return index + 1; 275 } 276 277 /** 278 * Handles 'C' cases 279 */ 280 private int handleC(String value, 281 DoubleMetaphoneResult result, 282 int index) { 283 if (conditionC0(value, index)) { // very confusing, moved out 284 result.append('K'); 285 index += 2; 286 } else if (index == 0 && contains(value, index, 6, "CAESAR")) { 287 result.append('S'); 288 index += 2; 289 } else if (contains(value, index, 2, "CH")) { 290 index = handleCH(value, result, index); 291 } else if (contains(value, index, 2, "CZ") && 292 !contains(value, index - 2, 4, "WICZ")) { 293 //-- "Czerny" --// 294 result.append('S', 'X'); 295 index += 2; 296 } else if (contains(value, index + 1, 3, "CIA")) { 297 //-- "focaccia" --// 298 result.append('X'); 299 index += 3; 300 } else if (contains(value, index, 2, "CC") && 301 !(index == 1 && charAt(value, 0) == 'M')) { 302 //-- double "cc" but not "McClelland" --// 303 return handleCC(value, result, index); 304 } else if (contains(value, index, 2, "CK", "CG", "CQ")) { 305 result.append('K'); 306 index += 2; 307 } else if (contains(value, index, 2, "CI", "CE", "CY")) { 308 //-- Italian vs. English --// 309 if (contains(value, index, 3, "CIO", "CIE", "CIA")) { 310 result.append('S', 'X'); 311 } else { 312 result.append('S'); 313 } 314 index += 2; 315 } else { 316 result.append('K'); 317 if (contains(value, index + 1, 2, " C", " Q", " G")) { 318 //-- Mac Caffrey, Mac Gregor --// 319 index += 3; 320 } else if (contains(value, index + 1, 1, "C", "K", "Q") && 321 !contains(value, index + 1, 2, "CE", "CI")) { 322 index += 2; 323 } else { 324 index++; 325 } 326 } 327 328 return index; 329 } 330 331 /** 332 * Handles 'CC' cases 333 */ 334 private int handleCC(String value, 335 DoubleMetaphoneResult result, 336 int index) { 337 if (contains(value, index + 2, 1, "I", "E", "H") && 338 !contains(value, index + 2, 2, "HU")) { 339 //-- "bellocchio" but not "bacchus" --// 340 if ((index == 1 && charAt(value, index - 1) == 'A') || 341 contains(value, index - 1, 5, "UCCEE", "UCCES")) { 342 //-- "accident", "accede", "succeed" --// 343 result.append("KS"); 344 } else { 345 //-- "bacci", "bertucci", other Italian --// 346 result.append('X'); 347 } 348 index += 3; 349 } else { // Pierce's rule 350 result.append('K'); 351 index += 2; 352 } 353 354 return index; 355 } 356 357 /** 358 * Handles 'CH' cases 359 */ 360 private int handleCH(String value, 361 DoubleMetaphoneResult result, 362 int index) { 363 if (index > 0 && contains(value, index, 4, "CHAE")) { // Michael 364 result.append('K', 'X'); 365 return index + 2; 366 } else if (conditionCH0(value, index)) { 367 //-- Greek roots ("chemistry", "chorus", etc.) --// 368 result.append('K'); 369 return index + 2; 370 } else if (conditionCH1(value, index)) { 371 //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --// 372 result.append('K'); 373 return index + 2; 374 } else { 375 if (index > 0) { 376 if (contains(value, 0, 2, "MC")) { 377 result.append('K'); 378 } else { 379 result.append('X', 'K'); 380 } 381 } else { 382 result.append('X'); 383 } 384 return index + 2; 385 } 386 } 387 388 /** 389 * Handles 'D' cases 390 */ 391 private int handleD(String value, 392 DoubleMetaphoneResult result, 393 int index) { 394 if (contains(value, index, 2, "DG")) { 395 //-- "Edge" --// 396 if (contains(value, index + 2, 1, "I", "E", "Y")) { 397 result.append('J'); 398 index += 3; 399 //-- "Edgar" --// 400 } else { 401 result.append("TK"); 402 index += 2; 403 } 404 } else if (contains(value, index, 2, "DT", "DD")) { 405 result.append('T'); 406 index += 2; 407 } else { 408 result.append('T'); 409 index++; 410 } 411 return index; 412 } 413 414 /** 415 * Handles 'G' cases 416 */ 417 private int handleG(String value, 418 DoubleMetaphoneResult result, 419 int index, 420 boolean slavoGermanic) { 421 if (charAt(value, index + 1) == 'H') { 422 index = handleGH(value, result, index); 423 } else if (charAt(value, index + 1) == 'N') { 424 if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { 425 result.append("KN", "N"); 426 } else if (!contains(value, index + 2, 2, "EY") && 427 charAt(value, index + 1) != 'Y' && !slavoGermanic) { 428 result.append("N", "KN"); 429 } else { 430 result.append("KN"); 431 } 432 index = index + 2; 433 } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { 434 result.append("KL", "L"); 435 index += 2; 436 } else if (index == 0 && (charAt(value, index + 1) == 'Y' || contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { 437 //-- -ges-, -gep-, -gel-, -gie- at beginning --// 438 result.append('K', 'J'); 439 index += 2; 440 } else if ((contains(value, index + 1, 2, "ER") || 441 charAt(value, index + 1) == 'Y') && 442 !contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && 443 !contains(value, index - 1, 1, "E", "I") && 444 !contains(value, index - 1, 3, "RGY", "OGY")) { 445 //-- -ger-, -gy- --// 446 result.append('K', 'J'); 447 index += 2; 448 } else if (contains(value, index + 1, 1, "E", "I", "Y") || 449 contains(value, index - 1, 4, "AGGI", "OGGI")) { 450 //-- Italian "biaggi" --// 451 if ((contains(value, 0 ,4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || contains(value, index + 1, 2, "ET")) { 452 //-- obvious germanic --// 453 result.append('K'); 454 } else if (contains(value, index + 1, 4, "IER")) { 455 result.append('J'); 456 } else { 457 result.append('J', 'K'); 458 } 459 index += 2; 460 } else if (charAt(value, index + 1) == 'G') { 461 index += 2; 462 result.append('K'); 463 } else { 464 index++; 465 result.append('K'); 466 } 467 return index; 468 } 469 470 /** 471 * Handles 'GH' cases 472 */ 473 private int handleGH(String value, 474 DoubleMetaphoneResult result, 475 int index) { 476 if (index > 0 && !isVowel(charAt(value, index - 1))) { 477 result.append('K'); 478 index += 2; 479 } else if (index == 0) { 480 if (charAt(value, index + 2) == 'I') { 481 result.append('J'); 482 } else { 483 result.append('K'); 484 } 485 index += 2; 486 } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) || 487 (index > 2 && contains(value, index - 3, 1, "B", "H", "D")) || 488 (index > 3 && contains(value, index - 4, 1, "B", "H"))) { 489 //-- Parker's rule (with some further refinements) - "hugh" 490 index += 2; 491 } else { 492 if (index > 2 && charAt(value, index - 1) == 'U' && 493 contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { 494 //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough" 495 result.append('F'); 496 } else if (index > 0 && charAt(value, index - 1) != 'I') { 497 result.append('K'); 498 } 499 index += 2; 500 } 501 return index; 502 } 503 504 /** 505 * Handles 'H' cases 506 */ 507 private int handleH(String value, 508 DoubleMetaphoneResult result, 509 int index) { 510 //-- only keep if first & before vowel or between 2 vowels --// 511 if ((index == 0 || isVowel(charAt(value, index - 1))) && 512 isVowel(charAt(value, index + 1))) { 513 result.append('H'); 514 index += 2; 515 //-- also takes car of "HH" --// 516 } else { 517 index++; 518 } 519 return index; 520 } 521 522 /** 523 * Handles 'J' cases 524 */ 525 private int handleJ(String value, DoubleMetaphoneResult result, int index, 526 boolean slavoGermanic) { 527 if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { 528 //-- obvious Spanish, "Jose", "San Jacinto" --// 529 if ((index == 0 && (charAt(value, index + 4) == ' ') || 530 value.length() == 4) || contains(value, 0, 4, "SAN ")) { 531 result.append('H'); 532 } else { 533 result.append('J', 'H'); 534 } 535 index++; 536 } else { 537 if (index == 0 && !contains(value, index, 4, "JOSE")) { 538 result.append('J', 'A'); 539 } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && 540 (charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { 541 result.append('J', 'H'); 542 } else if (index == value.length() - 1) { 543 result.append('J', ' '); 544 } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && !contains(value, index - 1, 1, "S", "K", "L")) { 545 result.append('J'); 546 } 547 548 if (charAt(value, index + 1) == 'J') { 549 index += 2; 550 } else { 551 index++; 552 } 553 } 554 return index; 555 } 556 557 /** 558 * Handles 'L' cases 559 */ 560 private int handleL(String value, 561 DoubleMetaphoneResult result, 562 int index) { 563 result.append('L'); 564 if (charAt(value, index + 1) == 'L') { 565 if (conditionL0(value, index)) { 566 result.appendAlternate(' '); 567 } 568 index += 2; 569 } else { 570 index++; 571 } 572 return index; 573 } 574 575 /** 576 * Handles 'P' cases 577 */ 578 private int handleP(String value, 579 DoubleMetaphoneResult result, 580 int index) { 581 if (charAt(value, index + 1) == 'H') { 582 result.append('F'); 583 index += 2; 584 } else { 585 result.append('P'); 586 index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; 587 } 588 return index; 589 } 590 591 /** 592 * Handles 'R' cases 593 */ 594 private int handleR(String value, 595 DoubleMetaphoneResult result, 596 int index, 597 boolean slavoGermanic) { 598 if (index == value.length() - 1 && !slavoGermanic && 599 contains(value, index - 2, 2, "IE") && 600 !contains(value, index - 4, 2, "ME", "MA")) { 601 result.appendAlternate('R'); 602 } else { 603 result.append('R'); 604 } 605 return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; 606 } 607 608 /** 609 * Handles 'S' cases 610 */ 611 private int handleS(String value, 612 DoubleMetaphoneResult result, 613 int index, 614 boolean slavoGermanic) { 615 if (contains(value, index - 1, 3, "ISL", "YSL")) { 616 //-- special cases "island", "isle", "carlisle", "carlysle" --// 617 index++; 618 } else if (index == 0 && contains(value, index, 5, "SUGAR")) { 619 //-- special case "sugar-" --// 620 result.append('X', 'S'); 621 index++; 622 } else if (contains(value, index, 2, "SH")) { 623 if (contains(value, index + 1, 4, 624 "HEIM", "HOEK", "HOLM", "HOLZ")) { 625 //-- germanic --// 626 result.append('S'); 627 } else { 628 result.append('X'); 629 } 630 index += 2; 631 } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { 632 //-- Italian and Armenian --// 633 if (slavoGermanic) { 634 result.append('S'); 635 } else { 636 result.append('S', 'X'); 637 } 638 index += 3; 639 } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || contains(value, index + 1, 1, "Z")) { 640 //-- german & anglicisations, e.g. "smith" match "schmidt" // 641 // "snider" match "schneider" --// 642 //-- also, -sz- in slavic language altho in hungarian it // 643 // is pronounced "s" --// 644 result.append('S', 'X'); 645 index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; 646 } else if (contains(value, index, 2, "SC")) { 647 index = handleSC(value, result, index); 648 } else { 649 if (index == value.length() - 1 && contains(value, index - 2, 650 2, "AI", "OI")){ 651 //-- french e.g. "resnais", "artois" --// 652 result.appendAlternate('S'); 653 } else { 654 result.append('S'); 655 } 656 index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; 657 } 658 return index; 659 } 660 661 /** 662 * Handles 'SC' cases 663 */ 664 private int handleSC(String value, 665 DoubleMetaphoneResult result, 666 int index) { 667 if (charAt(value, index + 2) == 'H') { 668 //-- Schlesinger's rule --// 669 if (contains(value, index + 3, 670 2, "OO", "ER", "EN", "UY", "ED", "EM")) { 671 //-- Dutch origin, e.g. "school", "schooner" --// 672 if (contains(value, index + 3, 2, "ER", "EN")) { 673 //-- "schermerhorn", "schenker" --// 674 result.append("X", "SK"); 675 } else { 676 result.append("SK"); 677 } 678 } else { 679 if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { 680 result.append('X', 'S'); 681 } else { 682 result.append('X'); 683 } 684 } 685 } else if (contains(value, index + 2, 1, "I", "E", "Y")) { 686 result.append('S'); 687 } else { 688 result.append("SK"); 689 } 690 return index + 3; 691 } 692 693 /** 694 * Handles 'T' cases 695 */ 696 private int handleT(String value, 697 DoubleMetaphoneResult result, 698 int index) { 699 if (contains(value, index, 4, "TION")) { 700 result.append('X'); 701 index += 3; 702 } else if (contains(value, index, 3, "TIA", "TCH")) { 703 result.append('X'); 704 index += 3; 705 } else if (contains(value, index, 2, "TH") || contains(value, index, 706 3, "TTH")) { 707 if (contains(value, index + 2, 2, "OM", "AM") || 708 //-- special case "thomas", "thames" or germanic --// 709 contains(value, 0, 4, "VAN ", "VON ") || 710 contains(value, 0, 3, "SCH")) { 711 result.append('T'); 712 } else { 713 result.append('0', 'T'); 714 } 715 index += 2; 716 } else { 717 result.append('T'); 718 index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; 719 } 720 return index; 721 } 722 723 /** 724 * Handles 'W' cases 725 */ 726 private int handleW(String value, 727 DoubleMetaphoneResult result, 728 int index) { 729 if (contains(value, index, 2, "WR")) { 730 //-- can also be in middle of word --// 731 result.append('R'); 732 index += 2; 733 } else { 734 if (index == 0 && (isVowel(charAt(value, index + 1)) || 735 contains(value, index, 2, "WH"))) { 736 if (isVowel(charAt(value, index + 1))) { 737 //-- Wasserman should match Vasserman --// 738 result.append('A', 'F'); 739 } else { 740 //-- need Uomo to match Womo --// 741 result.append('A'); 742 } 743 index++; 744 } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) || 745 contains(value, index - 1, 746 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || 747 contains(value, 0, 3, "SCH")) { 748 //-- Arnow should match Arnoff --// 749 result.appendAlternate('F'); 750 index++; 751 } else if (contains(value, index, 4, "WICZ", "WITZ")) { 752 //-- Polish e.g. "filipowicz" --// 753 result.append("TS", "FX"); 754 index += 4; 755 } else { 756 index++; 757 } 758 } 759 return index; 760 } 761 762 /** 763 * Handles 'X' cases 764 */ 765 private int handleX(String value, 766 DoubleMetaphoneResult result, 767 int index) { 768 if (index == 0) { 769 result.append('S'); 770 index++; 771 } else { 772 if (!((index == value.length() - 1) && 773 (contains(value, index - 3, 3, "IAU", "EAU") || 774 contains(value, index - 2, 2, "AU", "OU")))) { 775 //-- French e.g. breaux --// 776 result.append("KS"); 777 } 778 index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; 779 } 780 return index; 781 } 782 783 /** 784 * Handles 'Z' cases 785 */ 786 private int handleZ(String value, DoubleMetaphoneResult result, int index, 787 boolean slavoGermanic) { 788 if (charAt(value, index + 1) == 'H') { 789 //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --// 790 result.append('J'); 791 index += 2; 792 } else { 793 if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || (slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) { 794 result.append("S", "TS"); 795 } else { 796 result.append('S'); 797 } 798 index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; 799 } 800 return index; 801 } 802 803 //-- BEGIN CONDITIONS --// 804 805 /** 806 * Complex condition 0 for 'C' 807 */ 808 private boolean conditionC0(String value, int index) { 809 if (contains(value, index, 4, "CHIA")) { 810 return true; 811 } else if (index <= 1) { 812 return false; 813 } else if (isVowel(charAt(value, index - 2))) { 814 return false; 815 } else if (!contains(value, index - 1, 3, "ACH")) { 816 return false; 817 } else { 818 char c = charAt(value, index + 2); 819 return (c != 'I' && c != 'E') 820 || contains(value, index - 2, 6, "BACHER", "MACHER"); 821 } 822 } 823 824 /** 825 * Complex condition 0 for 'CH' 826 */ 827 private boolean conditionCH0(String value, int index) { 828 if (index != 0) { 829 return false; 830 } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") && 831 !contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { 832 return false; 833 } else if (contains(value, 0, 5, "CHORE")) { 834 return false; 835 } else { 836 return true; 837 } 838 } 839 840 /** 841 * Complex condition 1 for 'CH' 842 */ 843 private boolean conditionCH1(String value, int index) { 844 return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 845 3, "SCH")) || 846 contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || 847 contains(value, index + 2, 1, "T", "S") || 848 ((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && 849 (contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); 850 } 851 852 /** 853 * Complex condition 0 for 'L' 854 */ 855 private boolean conditionL0(String value, int index) { 856 if (index == value.length() - 3 && 857 contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { 858 return true; 859 } else if ((contains(value, index - 1, 2, "AS", "OS") || 860 contains(value, value.length() - 1, 1, "A", "O")) && 861 contains(value, index - 1, 4, "ALLE")) { 862 return true; 863 } else { 864 return false; 865 } 866 } 867 868 /** 869 * Complex condition 0 for 'M' 870 */ 871 private boolean conditionM0(String value, int index) { 872 if (charAt(value, index + 1) == 'M') { 873 return true; 874 } 875 return contains(value, index - 1, 3, "UMB") 876 && ((index + 1) == value.length() - 1 || contains(value, 877 index + 2, 2, "ER")); 878 } 879 880 //-- BEGIN HELPER FUNCTIONS --// 881 882 /** 883 * Determines whether or not a value is of slavo-germanic orgin. A value is 884 * of slavo-germanic origin if it contians any of 'W', 'K', 'CZ', or 'WITZ'. 885 */ 886 private boolean isSlavoGermanic(String value) { 887 return value.indexOf('W') > -1 || value.indexOf('K') > -1 || 888 value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1; 889 } 890 891 /** 892 * Determines whether or not a character is a vowel or not 893 */ 894 private boolean isVowel(char ch) { 895 return VOWELS.indexOf(ch) != -1; 896 } 897 898 /** 899 * Determines whether or not the value starts with a silent letter. It will 900 * return <code>true</code> if the value starts with any of 'GN', 'KN', 901 * 'PN', 'WR' or 'PS'. 902 */ 903 private boolean isSilentStart(String value) { 904 boolean result = false; 905 for (int i = 0; i < SILENT_START.length; i++) { 906 if (value.startsWith(SILENT_START[i])) { 907 result = true; 908 break; 909 } 910 } 911 return result; 912 } 913 914 /** 915 * Cleans the input 916 */ 917 private String cleanInput(String input) { 918 if (input == null) { 919 return null; 920 } 921 input = input.trim(); 922 if (input.length() == 0) { 923 return null; 924 } 925 return input.toUpperCase(); 926 } 927 928 /** 929 * Gets the character at index <code>index</code> if available, otherwise 930 * it returns <code>Character.MIN_VALUE</code> so that there is some sort 931 * of a default 932 */ 933 protected char charAt(String value, int index) { 934 if (index < 0 || index >= value.length()) { 935 return Character.MIN_VALUE; 936 } 937 return value.charAt(index); 938 } 939 940 /** 941 * Shortcut method with 1 criteria 942 */ 943 private static boolean contains(String value, int start, int length, 944 String criteria) { 945 return contains(value, start, length, 946 new String[] { criteria }); 947 } 948 949 /** 950 * Shortcut method with 2 criteria 951 */ 952 private static boolean contains(String value, int start, int length, 953 String criteria1, String criteria2) { 954 return contains(value, start, length, 955 new String[] { criteria1, criteria2 }); 956 } 957 958 /** 959 * Shortcut method with 3 criteria 960 */ 961 private static boolean contains(String value, int start, int length, 962 String criteria1, String criteria2, 963 String criteria3) { 964 return contains(value, start, length, 965 new String[] { criteria1, criteria2, criteria3 }); 966 } 967 968 /** 969 * Shortcut method with 4 criteria 970 */ 971 private static boolean contains(String value, int start, int length, 972 String criteria1, String criteria2, 973 String criteria3, String criteria4) { 974 return contains(value, start, length, 975 new String[] { criteria1, criteria2, criteria3, 976 criteria4 }); 977 } 978 979 /** 980 * Shortcut method with 5 criteria 981 */ 982 private static boolean contains(String value, int start, int length, 983 String criteria1, String criteria2, 984 String criteria3, String criteria4, 985 String criteria5) { 986 return contains(value, start, length, 987 new String[] { criteria1, criteria2, criteria3, 988 criteria4, criteria5 }); 989 } 990 991 /** 992 * Shortcut method with 6 criteria 993 */ 994 private static boolean contains(String value, int start, int length, 995 String criteria1, String criteria2, 996 String criteria3, String criteria4, 997 String criteria5, String criteria6) { 998 return contains(value, start, length, 999 new String[] { criteria1, criteria2, criteria3, 1000 criteria4, criteria5, criteria6 }); 1001 } 1002 1003 /** 1004 * Determines whether <code>value</code> contains any of the criteria 1005 starting 1006 * at index <code>start</code> and matching up to length <code>length</code> 1007 */ 1008 protected static boolean contains(String value, int start, int length, 1009 String[] criteria) { 1010 boolean result = false; 1011 if (start >= 0 && start + length <= value.length()) { 1012 String target = value.substring(start, start + length); 1013 1014 for (int i = 0; i < criteria.length; i++) { 1015 if (target.equals(criteria[i])) { 1016 result = true; 1017 break; 1018 } 1019 } 1020 } 1021 return result; 1022 } 1023 1024 //-- BEGIN INNER CLASSES --// 1025 1026 /** 1027 * Inner class for storing results, since there is the optional alternate 1028 * encoding. 1029 */ 1030 public class DoubleMetaphoneResult { 1031 1032 private StringBuffer primary = new StringBuffer(getMaxCodeLen()); 1033 private StringBuffer alternate = new StringBuffer(getMaxCodeLen()); 1034 private int maxLength; 1035 1036 public DoubleMetaphoneResult(int maxLength) { 1037 this.maxLength = maxLength; 1038 } 1039 1040 public void append(char value) { 1041 appendPrimary(value); 1042 appendAlternate(value); 1043 } 1044 1045 public void append(char primary, char alternate) { 1046 appendPrimary(primary); 1047 appendAlternate(alternate); 1048 } 1049 1050 public void appendPrimary(char value) { 1051 if (this.primary.length() < this.maxLength) { 1052 this.primary.append(value); 1053 } 1054 } 1055 1056 public void appendAlternate(char value) { 1057 if (this.alternate.length() < this.maxLength) { 1058 this.alternate.append(value); 1059 } 1060 } 1061 1062 public void append(String value) { 1063 appendPrimary(value); 1064 appendAlternate(value); 1065 } 1066 1067 public void append(String primary, String alternate) { 1068 appendPrimary(primary); 1069 appendAlternate(alternate); 1070 } 1071 1072 public void appendPrimary(String value) { 1073 int addChars = this.maxLength - this.primary.length(); 1074 if (value.length() <= addChars) { 1075 this.primary.append(value); 1076 } else { 1077 this.primary.append(value.substring(0, addChars)); 1078 } 1079 } 1080 1081 public void appendAlternate(String value) { 1082 int addChars = this.maxLength - this.alternate.length(); 1083 if (value.length() <= addChars) { 1084 this.alternate.append(value); 1085 } else { 1086 this.alternate.append(value.substring(0, addChars)); 1087 } 1088 } 1089 1090 public String getPrimary() { 1091 return this.primary.toString(); 1092 } 1093 1094 public String getAlternate() { 1095 return this.alternate.toString(); 1096 } 1097 1098 public boolean isComplete() { 1099 return this.primary.length() >= this.maxLength && 1100 this.alternate.length() >= this.maxLength; 1101 } 1102 } 1103} 1104