/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */
package com.android.providers.contacts;

import android.content.ContentValues;
import android.provider.ContactsContract.CommonDataKinds.StructuredName;
import android.provider.ContactsContract.FullNameStyle;
import android.provider.ContactsContract.PhoneticNameStyle;
import android.text.TextUtils;

import com.android.providers.contacts.util.NeededForTesting;

import java.lang.Character.UnicodeBlock;
import java.util.HashSet;
import java.util.Locale;
import java.util.StringTokenizer;

/**
 * The purpose of this class is to split a full name into given names and last
 * name. The logic only supports having a single last name. If the full name has
 * multiple last names the output will be incorrect.
 * <p>
 * Core algorithm:
 * <ol>
 * <li>Remove the suffixes (III, Ph.D., M.D.).</li>
 * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li>
 * <li>Assign the last remaining token as the last name.</li>
 * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use
 * this word also as the last name.</li>
 * <li>Assign the rest of the words as the "given names".</li>
 * </ol>
 */
public class NameSplitter {

    /** Maximum number of word tokens {@link NameTokenizer} keeps from a full name. */
    public static final int MAX_TOKENS = 10;

    // Language codes are locale-neutral identifiers, so they are normalized with Locale.ROOT
    // to stay independent of the device's default locale.
    private static final String JAPANESE_LANGUAGE =
            Locale.JAPANESE.getLanguage().toLowerCase(Locale.ROOT);
    private static final String KOREAN_LANGUAGE =
            Locale.KOREAN.getLanguage().toLowerCase(Locale.ROOT);

    // This includes simplified and traditional Chinese
    private static final String CHINESE_LANGUAGE =
            Locale.CHINESE.getLanguage().toLowerCase(Locale.ROOT);

    // All vocabulary sets hold upper-cased, trimmed entries (see convertToSet).
    private final HashSet<String> mPrefixesSet;
    private final HashSet<String> mSuffixesSet;
    /** Length of the longest entry in {@link #mSuffixesSet}; bounds the suffix search. */
    private final int mMaxSuffixLength;
    private final HashSet<String> mLastNamePrefixesSet;
    private final HashSet<String> mConjunctions;
    private final Locale mLocale;
    /** Lower-cased language code of {@link #mLocale}. */
    private final String mLanguage;

    /**
     * Two-character long Korean family names.
     * http://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1
     */
    private static final String[] KOREAN_TWO_CHARACTER_FAMILY_NAMES = {
        "\uAC15\uC804", // Gang Jeon
        "\uB0A8\uAD81", // Nam Goong
        "\uB3C5\uACE0", // Dok Go
        "\uB3D9\uBC29", // Dong Bang
        "\uB9DD\uC808", // Mang Jeol
        "\uC0AC\uACF5", // Sa Gong
        "\uC11C\uBB38", // Seo Moon
        "\uC120\uC6B0", // Seon Woo
        "\uC18C\uBD09", // So Bong
        "\uC5B4\uAE08", // Uh Geum
        "\uC7A5\uACE1", // Jang Gok
        "\uC81C\uAC08", // Je Gal
        "\uD669\uBCF4"  // Hwang Bo
    };

    /**
     * Mutable holder for the structured components of a name, its phonetic counterpart and
     * the associated {@link FullNameStyle} / {@link PhoneticNameStyle} values.
     */
    public static class Name {
        public String prefix;
        public String givenNames;
        public String middleName;
        public String familyName;
        public String suffix;

        public int fullNameStyle;

        public String phoneticFamilyName;
        public String phoneticMiddleName;
        public String phoneticGivenName;

        public int phoneticNameStyle;

        public Name() {
        }

        public Name(String prefix, String givenNames, String middleName, String familyName,
                String suffix) {
            this.prefix = prefix;
            this.givenNames = givenNames;
            this.middleName = middleName;
            this.familyName = familyName;
            this.suffix = suffix;
        }

        @NeededForTesting
        public String getPrefix() {
            return prefix;
        }

        public String getGivenNames() {
            return givenNames;
        }

        public String getMiddleName() {
            return middleName;
        }

        public String getFamilyName() {
            return familyName;
        }

        @NeededForTesting
        public String getSuffix() {
            return suffix;
        }

        public int getFullNameStyle() {
            return fullNameStyle;
        }

        public String getPhoneticFamilyName() {
            return phoneticFamilyName;
        }

        public String getPhoneticMiddleName() {
            return phoneticMiddleName;
        }

        public String getPhoneticGivenName() {
            return phoneticGivenName;
        }

        public int getPhoneticNameStyle() {
            return phoneticNameStyle;
        }

        /**
         * Overwrites every field of this object from the {@link StructuredName} columns in
         * {@code values}. Missing style columns become {@code UNDEFINED}.
         */
        public void fromValues(ContentValues values) {
            prefix = values.getAsString(StructuredName.PREFIX);
            givenNames = values.getAsString(StructuredName.GIVEN_NAME);
            middleName = values.getAsString(StructuredName.MIDDLE_NAME);
            familyName = values.getAsString(StructuredName.FAMILY_NAME);
            suffix = values.getAsString(StructuredName.SUFFIX);

            Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE);
            fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer;

            phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME);
            phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME);
            phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME);

            integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE);
            phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer;
        }

        /**
         * Writes this name into {@code values}. String components are written only when
         * non-null; the two style columns are always written.
         */
        public void toValues(ContentValues values) {
            putValueIfPresent(values, StructuredName.PREFIX, prefix);
            putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames);
            putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName);
            putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName);
            putValueIfPresent(values, StructuredName.SUFFIX, suffix);
            values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle);
            putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName);
            putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName);
            putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName);
            values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle);
        }

        private void putValueIfPresent(ContentValues values, String name, String value) {
            if (value != null) {
                values.put(name, value);
            }
        }

        /** Resets all string components to null and both styles to {@code UNDEFINED}. */
        public void clear() {
            prefix = null;
            givenNames = null;
            middleName = null;
            familyName = null;
            suffix = null;
            fullNameStyle = FullNameStyle.UNDEFINED;
            phoneticFamilyName = null;
            phoneticMiddleName = null;
            phoneticGivenName = null;
            phoneticNameStyle = PhoneticNameStyle.UNDEFINED;
        }

        /**
         * Returns true if the given, middle, family, suffix and all phonetic components are
         * null or empty. Note that {@link #prefix} is not consulted.
         */
        public boolean isEmpty() {
            return TextUtils.isEmpty(givenNames)
                    && TextUtils.isEmpty(middleName)
                    && TextUtils.isEmpty(familyName)
                    && TextUtils.isEmpty(suffix)
                    && TextUtils.isEmpty(phoneticFamilyName)
                    && TextUtils.isEmpty(phoneticMiddleName)
                    && TextUtils.isEmpty(phoneticGivenName);
        }

        @Override
        public String toString() {
            return "[prefix: " + prefix + " given: " + givenNames + " middle: " + middleName
                    + " family: " + familyName + " suffix: " + suffix + " ph/given: "
                    + phoneticGivenName + " ph/middle: " + phoneticMiddleName + " ph/family: "
                    + phoneticFamilyName + "]";
        }
    }

    /**
     * Splits a full name on spaces, dots and commas. Word tokens are stored in
     * {@link #mTokens}; for each word, a bit in {@link #mDotBitmask} / {@link #mCommaBitmask}
     * records whether a dot / comma was seen after it. {@link #mStartPointer} (inclusive) and
     * {@link #mEndPointer} (exclusive) delimit the tokens that have not been consumed yet by
     * the parse* methods.
     */
    private static class NameTokenizer extends StringTokenizer {
        private final String[] mTokens;
        private int mDotBitmask;
        private int mCommaBitmask;
        private int mStartPointer;
        private int mEndPointer;

        public NameTokenizer(String fullName) {
            // Delimiters are returned as tokens so that dots and commas can be attributed
            // to the word that precedes them.
            super(fullName, " .,", true);

            mTokens = new String[MAX_TOKENS];

            // Iterate over tokens, skipping over spaces and marking words that
            // are followed by dots or commas. Stops once MAX_TOKENS words are stored.
            while (hasMoreTokens() && mEndPointer < MAX_TOKENS) {
                final String token = nextToken();
                // StringTokenizer never returns an empty token, so charAt(0) is safe.
                final char first = token.charAt(0);
                if (first == ' ') {
                    continue;
                }

                if (mEndPointer > 0 && first == '.') {
                    mDotBitmask |= (1 << (mEndPointer - 1));
                } else if (mEndPointer > 0 && first == ',') {
                    mCommaBitmask |= (1 << (mEndPointer - 1));
                } else {
                    // A regular word, or a dot/comma that precedes any word.
                    mTokens[mEndPointer] = token;
                    mEndPointer++;
                }
            }
        }

        /**
         * Returns true if the token is followed by a dot in the original full name.
         */
        public boolean hasDot(int index) {
            return (mDotBitmask & (1 << index)) != 0;
        }

        /**
         * Returns true if the token is followed by a comma in the original full name.
         */
        public boolean hasComma(int index) {
            return (mCommaBitmask & (1 << index)) != 0;
        }
    }

    /**
     * Constructor.
     *
     * @param commonPrefixes comma-separated list of common prefixes,
     *            e.g. "Mr, Ms, Mrs"
     * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
     *            e.g. "d', st, st., von"
     * @param commonSuffixes comma-separated list of common suffixes,
     *            e.g. "Jr, M.D., MD, D.D.S."
     * @param commonConjunctions comma-separated list of common conjunctions,
     *            e.g. "AND, Or"
     * @param locale locale whose language selects the default CJK name style; when null,
     *            {@link Locale#getDefault()} is used
     */
    public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
            String commonSuffixes, String commonConjunctions, Locale locale) {
        // TODO: refactor this to use <string-array> resources
        mPrefixesSet = convertToSet(commonPrefixes);
        mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
        mSuffixesSet = convertToSet(commonSuffixes);
        mConjunctions = convertToSet(commonConjunctions);
        mLocale = locale != null ? locale : Locale.getDefault();
        mLanguage = mLocale.getLanguage().toLowerCase(Locale.ROOT);

        int maxLength = 0;
        for (String suffix : mSuffixesSet) {
            if (suffix.length() > maxLength) {
                maxLength = suffix.length();
            }
        }

        mMaxSuffixLength = maxLength;
    }

    /**
     * Converts a comma-separated list of Strings to a set of Strings. Trims strings
     * and converts them to upper case (using the default locale, which is also what the
     * lookups against these sets use). A null list yields an empty set.
     */
    private static HashSet<String> convertToSet(String strings) {
        HashSet<String> set = new HashSet<String>();
        if (strings != null) {
            for (String item : strings.split(",")) {
                set.add(item.trim().toUpperCase());
            }
        }
        return set;
    }

    /**
     * Parses a full name and copies its word tokens into {@code tokens}, dropping the first
     * token if it is a recognized prefix.
     *
     * @param tokens destination array; must have room for up to {@link #MAX_TOKENS} entries
     * @param fullName the name to tokenize; null yields 0
     * @return the number of tokens written
     */
    public int tokenize(String[] tokens, String fullName) {
        if (fullName == null) {
            return 0;
        }

        NameTokenizer tokenizer = new NameTokenizer(fullName);

        if (tokenizer.mStartPointer == tokenizer.mEndPointer) {
            return 0;
        }

        String firstToken = tokenizer.mTokens[tokenizer.mStartPointer];
        if (mPrefixesSet.contains(firstToken.toUpperCase())) {
            tokenizer.mStartPointer++;
        }
        int count = 0;
        for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) {
            tokens[count++] = tokenizer.mTokens[i];
        }

        return count;
    }


    /**
     * Parses a full name and returns parsed components in the Name object. The name style
     * is guessed from the characters of {@code fullName}; an ambiguous CJK guess is resolved
     * using this splitter's language. Does nothing when {@code fullName} is null.
     */
    public void split(Name name, String fullName) {
        if (fullName == null) {
            return;
        }

        int fullNameStyle = guessFullNameStyle(fullName);
        if (fullNameStyle == FullNameStyle.CJK) {
            fullNameStyle = getAdjustedFullNameStyle(fullNameStyle);
        }

        split(name, fullName, fullNameStyle);
    }

    /**
     * Parses a full name and returns parsed components in the Name object
     * with a given fullNameStyle. Any style other than CHINESE, JAPANESE or KOREAN is
     * handled as a Western name. Does nothing when {@code fullName} is null.
     */
    public void split(Name name, String fullName, int fullNameStyle) {
        if (fullName == null) {
            return;
        }

        name.fullNameStyle = fullNameStyle;

        switch (fullNameStyle) {
            case FullNameStyle.CHINESE:
                splitChineseName(name, fullName);
                break;

            case FullNameStyle.JAPANESE:
                splitJapaneseName(name, fullName);
                break;

            case FullNameStyle.KOREAN:
                splitKoreanName(name, fullName);
                break;

            default:
                splitWesternName(name, fullName);
        }
    }

    /**
     * Splits a full name composed according to the Western tradition:
     * <pre>
     *   [prefix] given name(s) [[middle name] family name] [, suffix]
     *   [prefix] family name, given name [middle name] [,suffix]
     * </pre>
     */
    private void splitWesternName(Name name, String fullName) {
        NameTokenizer tokens = new NameTokenizer(fullName);
        parsePrefix(name, tokens);

        // If the name consists of just one or two tokens, treat them as first/last name,
        // not as suffix. Example: John Ma; Ma is last name, not "M.A.".
        if (tokens.mEndPointer > 2) {
            parseSuffix(name, tokens);
        }

        if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) {
            name.givenNames = tokens.mTokens[tokens.mStartPointer];
        } else {
            parseLastName(name, tokens);
            parseMiddleName(name, tokens);
            parseGivenNames(name, tokens);
        }
    }

    /**
     * Splits a full name composed according to the Chinese tradition:
     * <pre>
     *   [family name [middle name]] given name
     * </pre>
     * A name written as a single word of 2, 3 or 4 characters is further divided into
     * family / middle / given parts by position.
     */
    private void splitChineseName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (name.givenNames == null) {
                name.givenNames = token;
            } else if (name.familyName == null) {
                name.familyName = name.givenNames;
                name.givenNames = token;
            } else if (name.middleName == null) {
                name.middleName = name.givenNames;
                name.givenNames = token;
            } else {
                name.middleName = name.middleName + name.givenNames;
                name.givenNames = token;
            }
        }

        // If a single word parse that word up.
        if (name.givenNames != null && name.familyName == null && name.middleName == null) {
            // Work on the token rather than on the raw input so that surrounding whitespace
            // is neither counted nor assigned to a name component, and slice by code point
            // so that ideographs outside the BMP (surrogate pairs) are never cut in half.
            final String word = name.givenNames;
            final int length = word.codePointCount(0, word.length());
            if (length == 2) {
                final int first = word.offsetByCodePoints(0, 1);
                name.familyName = word.substring(0, first);
                name.givenNames = word.substring(first);
            } else if (length == 3) {
                final int first = word.offsetByCodePoints(0, 1);
                final int second = word.offsetByCodePoints(0, 2);
                name.familyName = word.substring(0, first);
                name.middleName = word.substring(first, second);
                name.givenNames = word.substring(second);
            } else if (length == 4) {
                // Four characters: the first two form the family name.
                final int second = word.offsetByCodePoints(0, 2);
                final int third = word.offsetByCodePoints(0, 3);
                name.familyName = word.substring(0, second);
                name.middleName = word.substring(second, third);
                name.givenNames = word.substring(third);
            }
        }
    }

    /**
     * Splits a full name composed according to the Japanese tradition:
     * <pre>
     *   [family name] given name(s)
     * </pre>
     */
    private void splitJapaneseName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (name.givenNames == null) {
                name.givenNames = token;
            } else if (name.familyName == null) {
                name.familyName = name.givenNames;
                name.givenNames = token;
            } else {
                name.givenNames += " " + token;
            }
        }
    }

    /**
     * Splits a full name composed according to the Korean tradition:
     * <pre>
     *   [family name] given name(s)
     * </pre>
     * When the name contains no separator, the family name is taken to be the first
     * character, or the first two if the name starts with one of
     * {@link #KOREAN_TWO_CHARACTER_FAMILY_NAMES}. A name with no tokens leaves
     * {@code name} untouched.
     */
    private void splitKoreanName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        if (tokenizer.countTokens() > 1) {
            // Each name can be identified by separators.
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                if (name.givenNames == null) {
                    name.givenNames = token;
                } else if (name.familyName == null) {
                    name.familyName = name.givenNames;
                    name.givenNames = token;
                } else {
                    name.givenNames += " " + token;
                }
            }
        } else {
            // Empty or whitespace-only input: nothing to split (and nothing to substring).
            if (!tokenizer.hasMoreTokens()) {
                return;
            }

            // There is no separator. Try to guess family name. Use the token itself so
            // that leading/trailing whitespace never ends up in a name component.
            final String word = tokenizer.nextToken();

            // The length of most family names is 1 (one code point).
            int familyNameLength = word.offsetByCodePoints(0, 1);

            // Compare with 2-length family names.
            for (String twoLengthFamilyName : KOREAN_TWO_CHARACTER_FAMILY_NAMES) {
                if (word.startsWith(twoLengthFamilyName)) {
                    familyNameLength = twoLengthFamilyName.length();
                    break;
                }
            }

            name.familyName = word.substring(0, familyNameLength);
            if (word.length() > familyNameLength) {
                name.givenNames = word.substring(familyNameLength);
            }
        }
    }

    /**
     * Concatenates components of a name according to the rules dictated by the name style.
     *
     * @param givenNameFirst is ignored for CJK display name styles
     * @param includePrefix whether {@code name.prefix} takes part in the result
     */
    public String join(Name name, boolean givenNameFirst, boolean includePrefix) {
        String prefix = includePrefix ? name.prefix : null;
        switch (name.fullNameStyle) {
            case FullNameStyle.CJK:
            case FullNameStyle.CHINESE:
            case FullNameStyle.KOREAN:
                return join(prefix, name.familyName, name.middleName, name.givenNames,
                        name.suffix, false, false, false);

            case FullNameStyle.JAPANESE:
                return join(prefix, name.familyName, name.middleName, name.givenNames,
                        name.suffix, true, false, false);

            default:
                if (givenNameFirst) {
                    return join(prefix, name.givenNames, name.middleName, name.familyName,
                            name.suffix, true, false, true);
                } else {
                    return join(prefix, name.familyName, name.givenNames, name.middleName,
                            name.suffix, true, true, true);
                }
        }
    }

    /**
     * Concatenates components of the phonetic name following the CJK tradition:
     * family name + middle name + given name(s).
     */
    public String joinPhoneticName(Name name) {
        return join(null, name.phoneticFamilyName,
                name.phoneticMiddleName, name.phoneticGivenName, null, true, false, false);
    }

    /**
     * Concatenates parts of a full name inserting spaces and commas as specified.
     * Components are trimmed first; null or empty components are skipped.
     *
     * @return the joined name; the lone component if only one is present (a lone suffix is
     *         normalized); or null if all components are null or empty
     */
    private String join(String prefix, String part1, String part2, String part3, String suffix,
            boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) {
        prefix = prefix == null ? null : prefix.trim();
        part1 = part1 == null ? null : part1.trim();
        part2 = part2 == null ? null : part2.trim();
        part3 = part3 == null ? null : part3.trim();
        suffix = suffix == null ? null : suffix.trim();

        final boolean hasPrefix = !TextUtils.isEmpty(prefix);
        final boolean hasPart1 = !TextUtils.isEmpty(part1);
        final boolean hasPart2 = !TextUtils.isEmpty(part2);
        final boolean hasPart3 = !TextUtils.isEmpty(part3);
        final boolean hasSuffix = !TextUtils.isEmpty(suffix);

        final int presentCount = (hasPrefix ? 1 : 0) + (hasPart1 ? 1 : 0) + (hasPart2 ? 1 : 0)
                + (hasPart3 ? 1 : 0) + (hasSuffix ? 1 : 0);

        // Zero or one component: no separators are needed, return it directly.
        if (presentCount <= 1) {
            if (hasPrefix) {
                return prefix;
            }
            if (hasPart1) {
                return part1;
            }
            if (hasPart2) {
                return part2;
            }
            if (hasPart3) {
                return part3;
            }
            if (hasSuffix) {
                return normalizedSuffix(suffix);
            }
            return null;
        }

        StringBuilder sb = new StringBuilder();

        if (hasPrefix) {
            sb.append(prefix);
        }

        if (hasPart1) {
            // The prefix is always separated by a space, regardless of useSpace.
            if (hasPrefix) {
                sb.append(' ');
            }
            sb.append(part1);
        }

        if (hasPart2) {
            if (hasPrefix || hasPart1) {
                if (useCommaAfterPart1) {
                    sb.append(',');
                }
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(part2);
        }

        if (hasPart3) {
            if (hasPrefix || hasPart1 || hasPart2) {
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(part3);
        }

        if (hasSuffix) {
            if (hasPrefix || hasPart1 || hasPart2 || hasPart3) {
                if (useCommaAfterPart3) {
                    sb.append(',');
                }
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(normalizedSuffix(suffix));
        }

        return sb.toString();
    }

    /**
     * Puts a dot after the supplied suffix if that is the accepted form of the suffix,
     * e.g. "Jr." and "Sr.", but not "I", "II" and "III".
     */
    private String normalizedSuffix(String suffix) {
        int length = suffix.length();
        if (length == 0 || suffix.charAt(length - 1) == '.') {
            return suffix;
        }

        String withDot = suffix + '.';
        if (mSuffixesSet.contains(withDot.toUpperCase())) {
            return withDot;
        } else {
            return suffix;
        }
    }

    /**
     * If the supplied name style is undefined, returns a default based on the language,
     * otherwise returns the supplied name style itself. The ambiguous
     * {@link FullNameStyle#CJK} style is likewise resolved by language, defaulting to
     * Chinese.
     *
     * @param nameStyle See {@link FullNameStyle}.
     */
    public int getAdjustedFullNameStyle(int nameStyle) {
        if (nameStyle == FullNameStyle.UNDEFINED) {
            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.JAPANESE;
            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.KOREAN;
            } else if (CHINESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.CHINESE;
            } else {
                return FullNameStyle.WESTERN;
            }
        } else if (nameStyle == FullNameStyle.CJK) {
            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.JAPANESE;
            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.KOREAN;
            } else {
                return FullNameStyle.CHINESE;
            }
        }
        return nameStyle;
    }

    /**
     * Parses the first word from the name if it is a prefix.
     */
    private void parsePrefix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        String firstToken = tokens.mTokens[tokens.mStartPointer];
        if (mPrefixesSet.contains(firstToken.toUpperCase())) {
            if (tokens.hasDot(tokens.mStartPointer)) {
                firstToken += '.';
            }
            name.prefix = firstToken;
            tokens.mStartPointer++;
        }
    }

    /**
     * Parses the last word(s) from the name if it is a suffix.
     */
    private void parseSuffix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        String lastToken = tokens.mTokens[tokens.mEndPointer - 1];

        // Take care of an explicit comma-separated suffix
        if (tokens.mEndPointer - tokens.mStartPointer > 2
                && tokens.hasComma(tokens.mEndPointer - 2)) {
            if (tokens.hasDot(tokens.mEndPointer - 1)) {
                lastToken += '.';
            }
            name.suffix = lastToken;
            tokens.mEndPointer--;
            return;
        }

        // Longer than any known suffix: cannot match.
        if (lastToken.length() > mMaxSuffixLength) {
            return;
        }

        String normalized = lastToken.toUpperCase();
        if (mSuffixesSet.contains(normalized)) {
            name.suffix = lastToken;
            tokens.mEndPointer--;
            return;
        }

        if (tokens.hasDot(tokens.mEndPointer - 1)) {
            lastToken += '.';
        }
        normalized += ".";

        // Take care of suffixes like M.D. and D.D.S.
        // Walk backwards, gluing preceding tokens on with dots, until a known suffix is
        // formed, the candidate outgrows the longest known suffix, or tokens run out.
        int pos = tokens.mEndPointer - 1;
        while (normalized.length() <= mMaxSuffixLength) {

            if (mSuffixesSet.contains(normalized)) {
                name.suffix = lastToken;
                tokens.mEndPointer = pos;
                return;
            }

            if (pos == tokens.mStartPointer) {
                break;
            }

            pos--;
            if (tokens.hasDot(pos)) {
                lastToken = tokens.mTokens[pos] + "." + lastToken;
            } else {
                lastToken = tokens.mTokens[pos] + " " + lastToken;
            }

            normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized;
        }
    }

    /**
     * Extracts the family name from the remaining tokens: the leading word(s) when followed
     * by a comma, otherwise the trailing word, optionally preceded by a last name prefix.
     */
    private void parseLastName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        // If the first word is followed by a comma, assume that it's the family name
        if (tokens.hasComma(tokens.mStartPointer)) {
            name.familyName = tokens.mTokens[tokens.mStartPointer];
            tokens.mStartPointer++;
            return;
        }

        // If the second word is followed by a comma and the first word
        // is a last name prefix as in "de Sade" and "von Cliburn", treat
        // the first two words as the family name.
        if (tokens.mStartPointer + 1 < tokens.mEndPointer
                && tokens.hasComma(tokens.mStartPointer + 1)
                && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) {
            String familyNamePrefix = tokens.mTokens[tokens.mStartPointer];
            if (tokens.hasDot(tokens.mStartPointer)) {
                familyNamePrefix += '.';
            }
            name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1];
            tokens.mStartPointer += 2;
            return;
        }

        // Finally, assume that the last word is the last name
        name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
        tokens.mEndPointer--;

        // Take care of last names like "de Sade" and "von Cliburn"
        if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
            String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1];
            if (isFamilyNamePrefix(lastNamePrefix)) {
                if (tokens.hasDot(tokens.mEndPointer - 1)) {
                    lastNamePrefix += '.';
                }
                name.familyName = lastNamePrefix + " " + name.familyName;
                tokens.mEndPointer--;
            }
        }
    }

    /**
     * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de".
     * The word matches with or without a trailing dot.
     */
    private boolean isFamilyNamePrefix(String word) {
        final String normalized = word.toUpperCase();

        return mLastNamePrefixesSet.contains(normalized)
                || mLastNamePrefixesSet.contains(normalized + ".");
    }


    /**
     * Takes the last remaining token as the middle name when at least two tokens remain,
     * unless more than two remain and the token before it is a conjunction.
     */
    private void parseMiddleName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        if ((tokens.mEndPointer - tokens.mStartPointer) > 1) {
            if ((tokens.mEndPointer - tokens.mStartPointer) == 2
                    || !mConjunctions.contains(tokens.mTokens[tokens.mEndPointer - 2].
                            toUpperCase())) {
                name.middleName = tokens.mTokens[tokens.mEndPointer - 1];
                if (tokens.hasDot(tokens.mEndPointer - 1)) {
                    name.middleName += '.';
                }
                tokens.mEndPointer--;
            }
        }
    }

    /**
     * Assigns all remaining tokens, separated by spaces, as the given names. When several
     * tokens remain, dots that followed them in the original name are restored.
     */
    private void parseGivenNames(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        if ((tokens.mEndPointer - tokens.mStartPointer) == 1) {
            name.givenNames = tokens.mTokens[tokens.mStartPointer];
        } else {
            StringBuilder sb = new StringBuilder();
            for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
                if (i != tokens.mStartPointer) {
                    sb.append(' ');
                }
                sb.append(tokens.mTokens[i]);
                if (tokens.hasDot(i)) {
                    sb.append('.');
                }
            }
            name.givenNames = sb.toString();
        }
    }

    /**
     * Makes the best guess at the expected full name style based on the character set
     * used in the supplied name.  If the phonetic name is also supplied, tries to
     * differentiate between Chinese, Japanese and Korean based on the alphabet used
     * for the phonetic name.
     */
    public void guessNameStyle(Name name) {
        guessFullNameStyle(name);
        guessPhoneticNameStyle(name);
        name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle,
                name.phoneticNameStyle);
    }

    /**
     * Updates the display name style according to the phonetic name style if we
     * were unsure about display name style based on the name components, but
     * phonetic name makes it more definitive.
     */
    public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) {
        if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
            if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) {
                if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) {
                    return FullNameStyle.JAPANESE;
                } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) {
                    return FullNameStyle.KOREAN;
                }
                if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) {
                    return FullNameStyle.CHINESE;
                }
            }
        }
        return nameStyle;
    }

    /**
     * Makes the best guess at the expected full name style based on the character set
     * used in the supplied name. Does nothing if the style is already defined.
     */
    private void guessFullNameStyle(NameSplitter.Name name) {
        if (name.fullNameStyle != FullNameStyle.UNDEFINED) {
            return;
        }

        // A mix of Hanzi and latin chars are common in China, so we have to go through all
        // names if the name is not JAPANESE or KOREAN: a CJK or WESTERN guess is only
        // tentative, and the last tentative guess wins if nothing definitive is found.
        final String[] components = {
            name.givenNames, name.familyName, name.middleName, name.prefix, name.suffix
        };

        int bestGuess = FullNameStyle.UNDEFINED;
        for (String component : components) {
            final int guess = guessFullNameStyle(component);
            if (guess == FullNameStyle.UNDEFINED) {
                continue;
            }
            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
                name.fullNameStyle = guess;
                return;
            }
            bestGuess = guess;
        }

        name.fullNameStyle = bestGuess;
    }

    /**
     * Guesses the full name style from the letters of {@code name}.
     *
     * @return UNDEFINED for null or letter-free input; JAPANESE or KOREAN as soon as a
     *         kana or Hangul letter is found; for the first CJK ideograph, the result of
     *         scanning the rest of the name for kana/Hangul (CJK if none); otherwise
     *         WESTERN
     */
    public int guessFullNameStyle(String name) {
        if (name == null) {
            return FullNameStyle.UNDEFINED;
        }

        int nameStyle = FullNameStyle.UNDEFINED;
        int length = name.length();
        int offset = 0;
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);

                if (!isLatinUnicodeBlock(unicodeBlock)) {

                    if (isCJKUnicodeBlock(unicodeBlock)) {
                        // We don't know if this is Chinese, Japanese or Korean -
                        // trying to figure out by looking at other characters in the name
                        return guessCJKNameStyle(name, offset + Character.charCount(codePoint));
                    }

                    if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                        return FullNameStyle.JAPANESE;
                    }

                    if (isKoreanUnicodeBlock(unicodeBlock)) {
                        return FullNameStyle.KOREAN;
                    }
                }
                // Latin letters, and letters of any other unrecognized script.
                nameStyle = FullNameStyle.WESTERN;
            }
            offset += Character.charCount(codePoint);
        }
        return nameStyle;
    }

    /**
     * Scans {@code name} from {@code offset} for a kana or Hangul letter that would
     * disambiguate a CJK name; returns {@link FullNameStyle#CJK} if none is found.
     */
    private int guessCJKNameStyle(String name, int offset) {
        int length = name.length();
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                    return FullNameStyle.JAPANESE;
                }
                if (isKoreanUnicodeBlock(unicodeBlock)) {
                    return FullNameStyle.KOREAN;
                }
            }
            offset += Character.charCount(codePoint);
        }

        return FullNameStyle.CJK;
    }

    /**
     * Sets {@code name.phoneticNameStyle} to the first defined guess obtained from the
     * phonetic family, given and middle names, in that order. Does nothing if the style is
     * already defined; leaves it UNDEFINED if no component yields a guess.
     */
    private void guessPhoneticNameStyle(NameSplitter.Name name) {
        if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
            return;
        }

        // guessPhoneticNameStyle(String) only ever returns UNDEFINED, JAPANESE, KOREAN or
        // PINYIN, so any defined guess is definitive.
        int guess = guessPhoneticNameStyle(name.phoneticFamilyName);
        if (guess == PhoneticNameStyle.UNDEFINED) {
            guess = guessPhoneticNameStyle(name.phoneticGivenName);
        }
        if (guess == PhoneticNameStyle.UNDEFINED) {
            guess = guessPhoneticNameStyle(name.phoneticMiddleName);
        }

        name.phoneticNameStyle = guess;
    }

    /**
     * Guesses the phonetic name style from the first letter of {@code name} that is kana
     * (JAPANESE), Hangul (KOREAN) or Latin (PINYIN).
     *
     * @return the guessed style, or UNDEFINED for null input or if no such letter exists
     */
    public int guessPhoneticNameStyle(String name) {
        if (name == null) {
            return PhoneticNameStyle.UNDEFINED;
        }

        int nameStyle = PhoneticNameStyle.UNDEFINED;
        int length = name.length();
        int offset = 0;
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.JAPANESE;
                }
                if (isKoreanUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.KOREAN;
                }
                if (isLatinUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.PINYIN;
                }
            }
            offset += Character.charCount(codePoint);
        }

        return nameStyle;
    }

    private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.BASIC_LATIN ||
                unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL;
    }

    private static boolean isCJKUnicodeBlock(UnicodeBlock block) {
        return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT
                || block == UnicodeBlock.CJK_COMPATIBILITY
                || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS
                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
    }

    private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES ||
                unicodeBlock == UnicodeBlock.HANGUL_JAMO ||
                unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
    }

    private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.KATAKANA ||
                unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS ||
                unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS ||
                unicodeBlock == UnicodeBlock.HIRAGANA;
    }
}