// NameSplitter.java revision cdd03b2ba03718a7fa85663a2438136284a1557c
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */
package com.android.providers.contacts;

import com.android.internal.util.HanziToPinyin;
import com.android.internal.util.HanziToPinyin.Token;

import android.content.ContentValues;
import android.provider.ContactsContract.FullNameStyle;
import android.provider.ContactsContract.PhoneticNameStyle;
import android.provider.ContactsContract.CommonDataKinds.StructuredName;
import android.text.TextUtils;

import java.lang.Character.UnicodeBlock;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Locale;
import java.util.StringTokenizer;

/**
 * The purpose of this class is to split a full name into given names and last
 * name. The logic only supports having a single last name. If the full name has
 * multiple last names the output will be incorrect.
 * <p>
 * Core algorithm:
 * <ol>
 * <li>Remove the suffixes (III, Ph.D., M.D.).</li>
 * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li>
 * <li>Assign the last remaining token as the last name.</li>
 * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use
 * this word also as the last name.</li>
 * <li>Assign the rest of the words as the "given names".</li>
 * </ol>
 */
public class NameSplitter {

    /** Maximum number of tokens a full name is split into; extra tokens are ignored. */
    public static final int MAX_TOKENS = 10;

    private static final String JAPANESE_LANGUAGE =
            Locale.JAPANESE.getLanguage().toLowerCase(Locale.ROOT);
    private static final String KOREAN_LANGUAGE =
            Locale.KOREAN.getLanguage().toLowerCase(Locale.ROOT);

    // This includes simplified and traditional Chinese
    private static final String CHINESE_LANGUAGE =
            Locale.CHINESE.getLanguage().toLowerCase(Locale.ROOT);

    // All four sets hold upper-cased entries (see convertToSet/normalize).
    private final HashSet<String> mPrefixesSet;
    private final HashSet<String> mSuffixesSet;
    /** Length of the longest entry in {@link #mSuffixesSet}; bounds the suffix search. */
    private final int mMaxSuffixLength;
    private final HashSet<String> mLastNamePrefixesSet;
    private final HashSet<String> mConjunctions;
    private final Locale mLocale;
    /** Lower-cased language code of {@link #mLocale}. */
    private final String mLanguage;

    /**
     * Mutable holder for the structured components of a name, its phonetic
     * counterpart, and the detected full/phonetic name styles.
     */
    public static class Name {
        public String prefix;
        public String givenNames;
        public String middleName;
        public String familyName;
        public String suffix;

        public int fullNameStyle;

        public String phoneticFamilyName;
        public String phoneticMiddleName;
        public String phoneticGivenName;

        public int phoneticNameStyle;

        public Name() {
        }

        public Name(String prefix, String givenNames, String middleName, String familyName,
                String suffix) {
            this.prefix = prefix;
            this.givenNames = givenNames;
            this.middleName = middleName;
            this.familyName = familyName;
            this.suffix = suffix;
        }

        public String getPrefix() {
            return prefix;
        }

        public String getGivenNames() {
            return givenNames;
        }

        public String getMiddleName() {
            return middleName;
        }

        public String getFamilyName() {
            return familyName;
        }

        public String getSuffix() {
            return suffix;
        }

        public int getFullNameStyle() {
            return fullNameStyle;
        }

        public String getPhoneticFamilyName() {
            return phoneticFamilyName;
        }

        public String getPhoneticMiddleName() {
            return phoneticMiddleName;
        }

        public String getPhoneticGivenName() {
            return phoneticGivenName;
        }

        public int getPhoneticNameStyle() {
            return phoneticNameStyle;
        }

        /**
         * Overwrites every field of this object from the {@link StructuredName} columns
         * found in {@code values}. A missing style column becomes {@code UNDEFINED}.
         */
        public void fromValues(ContentValues values) {
            prefix = values.getAsString(StructuredName.PREFIX);
            givenNames = values.getAsString(StructuredName.GIVEN_NAME);
            middleName = values.getAsString(StructuredName.MIDDLE_NAME);
            familyName = values.getAsString(StructuredName.FAMILY_NAME);
            suffix = values.getAsString(StructuredName.SUFFIX);

            Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE);
            fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer;

            phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME);
            phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME);
            phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME);

            integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE);
            phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer;
        }

        /**
         * Writes this name into {@code values}. String components are written only when
         * non-null; the two style columns are always written.
         */
        public void toValues(ContentValues values) {
            putValueIfPresent(values, StructuredName.PREFIX, prefix);
            putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames);
            putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName);
            putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName);
            putValueIfPresent(values, StructuredName.SUFFIX, suffix);
            values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle);
            putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName);
            putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName);
            putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName);
            values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle);
        }

        private void putValueIfPresent(ContentValues values, String name, String value) {
            if (value != null) {
                values.put(name, value);
            }
        }

        /** Resets all string components to null and both styles to {@code UNDEFINED}. */
        public void clear() {
            prefix = null;
            givenNames = null;
            middleName = null;
            familyName = null;
            suffix = null;
            fullNameStyle = FullNameStyle.UNDEFINED;
            phoneticFamilyName = null;
            phoneticMiddleName = null;
            phoneticGivenName = null;
            phoneticNameStyle = PhoneticNameStyle.UNDEFINED;
        }

        /**
         * Returns true if all name and phonetic name components are null or empty.
         * Note that {@link #prefix} is not taken into account.
         */
        public boolean isEmpty() {
            return TextUtils.isEmpty(givenNames)
                    && TextUtils.isEmpty(middleName)
                    && TextUtils.isEmpty(familyName)
                    && TextUtils.isEmpty(suffix)
                    && TextUtils.isEmpty(phoneticFamilyName)
                    && TextUtils.isEmpty(phoneticMiddleName)
                    && TextUtils.isEmpty(phoneticGivenName);
        }

        @Override
        public String toString() {
            return "[given: " + givenNames + " middle: " + middleName + " family: " + familyName
                    + " ph/given: " + phoneticGivenName + " ph/middle: " + phoneticMiddleName
                    + " ph/family: " + phoneticFamilyName + "]";
        }

    }

    /**
     * Splits a full name on spaces, dots and commas and keeps at most
     * {@link #MAX_TOKENS} word tokens. For each stored token it remembers, in a bitmask,
     * whether the token was followed by a dot or a comma. The window
     * {@code [mStartPointer, mEndPointer)} marks the tokens that have not yet been
     * consumed by the parse* methods of the enclosing class.
     */
    private static class NameTokenizer extends StringTokenizer {
        private final String[] mTokens;
        private int mDotBitmask;
        private int mCommaBitmask;
        private int mStartPointer;
        private int mEndPointer;

        public NameTokenizer(String fullName) {
            // Delimiters are returned as tokens so that dots and commas can be recorded.
            super(fullName, " .,", true);

            mTokens = new String[MAX_TOKENS];

            // Iterate over tokens, skipping over spaces and marking tokens that
            // are followed by dots or commas.
            while (hasMoreTokens() && mEndPointer < MAX_TOKENS) {
                final String token = nextToken();
                // StringTokenizer never returns an empty token, so charAt(0) is safe.
                final char first = token.charAt(0);
                if (first == ' ') {
                    continue;
                }

                if (mEndPointer > 0 && first == '.') {
                    mDotBitmask |= (1 << (mEndPointer - 1));
                } else if (mEndPointer > 0 && first == ',') {
                    mCommaBitmask |= (1 << (mEndPointer - 1));
                } else {
                    // Also reached for a dot or comma that precedes the first word:
                    // there is no previous token to mark, so it is stored as a token.
                    mTokens[mEndPointer] = token;
                    mEndPointer++;
                }
            }
        }

        /**
         * Returns true if the token is followed by a dot in the original full name.
         */
        public boolean hasDot(int index) {
            return (mDotBitmask & (1 << index)) != 0;
        }

        /**
         * Returns true if the token is followed by a comma in the original full name.
         */
        public boolean hasComma(int index) {
            return (mCommaBitmask & (1 << index)) != 0;
        }
    }

    /**
     * Constructor.
     *
     * @param commonPrefixes comma-separated list of common prefixes,
     *            e.g. "Mr, Ms, Mrs"
     * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
     *            e.g. "d', st, st., von"
     * @param commonSuffixes comma-separated list of common suffixes,
     *            e.g. "Jr, M.D., MD, D.D.S."
     * @param commonConjunctions comma-separated list of common conjunctions,
     *            e.g. "AND, Or"
     * @param locale locale used to pick a default name style; the default locale is
     *            used when this is null
     */
    public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
            String commonSuffixes, String commonConjunctions, Locale locale) {
        // TODO: refactor this to use <string-array> resources
        mPrefixesSet = convertToSet(commonPrefixes);
        mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
        mSuffixesSet = convertToSet(commonSuffixes);
        mConjunctions = convertToSet(commonConjunctions);
        mLocale = locale != null ? locale : Locale.getDefault();
        mLanguage = mLocale.getLanguage().toLowerCase(Locale.ROOT);

        int maxLength = 0;
        for (String suffix : mSuffixesSet) {
            if (suffix.length() > maxLength) {
                maxLength = suffix.length();
            }
        }

        mMaxSuffixLength = maxLength;
    }

    /**
     * Upper-cases a word for lookup in the prefix/suffix/conjunction sets. The root
     * locale is used so that set membership does not depend on the default locale
     * (e.g. the Turkish dotted/dotless "i" rules).
     */
    private static String normalize(String word) {
        return word.toUpperCase(Locale.ROOT);
    }

    /**
     * Converts a comma-separated list of Strings to a set of Strings. Trims strings
     * and converts them to upper case. A null list yields an empty set.
     */
    private static HashSet<String> convertToSet(String strings) {
        HashSet<String> set = new HashSet<String>();
        if (strings != null) {
            String[] split = strings.split(",");
            for (int i = 0; i < split.length; i++) {
                set.add(normalize(split[i].trim()));
            }
        }
        return set;
    }

    /**
     * Parses a full name and returns components as a list of tokens. A leading token
     * that is a known prefix is dropped.
     *
     * @param tokens output array; at most {@link #MAX_TOKENS} entries are written, so
     *            it must be at least that long
     * @param fullName the name to tokenize, may be null
     * @return the number of tokens written to {@code tokens}
     */
    public int tokenize(String[] tokens, String fullName) {
        if (fullName == null) {
            return 0;
        }

        NameTokenizer tokenizer = new NameTokenizer(fullName);

        if (tokenizer.mStartPointer == tokenizer.mEndPointer) {
            return 0;
        }

        String firstToken = tokenizer.mTokens[tokenizer.mStartPointer];
        if (mPrefixesSet.contains(normalize(firstToken))) {
            tokenizer.mStartPointer++;
        }
        int count = 0;
        for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) {
            tokens[count++] = tokenizer.mTokens[i];
        }

        return count;
    }


    /**
     * Parses a full name and returns parsed components in the Name object. The name
     * style is guessed from the characters of {@code fullName}; an ambiguous CJK guess
     * is resolved using the locale of this splitter. Does nothing if {@code fullName}
     * is null.
     */
    public void split(Name name, String fullName) {
        if (fullName == null) {
            return;
        }

        int fullNameStyle = guessFullNameStyle(fullName);
        if (fullNameStyle == FullNameStyle.CJK) {
            fullNameStyle = getAdjustedFullNameStyle(fullNameStyle);
        }

        name.fullNameStyle = fullNameStyle;

        switch (fullNameStyle) {
            case FullNameStyle.CHINESE:
                splitChineseName(name, fullName);
                break;

            case FullNameStyle.JAPANESE:
            case FullNameStyle.KOREAN:
                splitJapaneseOrKoreanName(name, fullName);
                break;

            default:
                splitWesternName(name, fullName);
        }
    }

    /**
     * Splits a full name composed according to the Western tradition:
     * <pre>
     *   [prefix] given name(s) [[middle name] family name] [, suffix]
     *   [prefix] family name, given name [middle name] [,suffix]
     * </pre>
     */
    private void splitWesternName(Name name, String fullName) {
        NameTokenizer tokens = new NameTokenizer(fullName);
        parsePrefix(name, tokens);

        // If the name consists of just one or two tokens, treat them as first/last name,
        // not as suffix. Example: John Ma; Ma is last name, not "M.A.".
        if (tokens.mEndPointer > 2) {
            parseSuffix(name, tokens);
        }

        if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) {
            name.givenNames = tokens.mTokens[tokens.mStartPointer];
        } else {
            parseLastName(name, tokens);
            parseMiddleName(name, tokens);
            parseGivenNames(name, tokens);
        }
    }

    /**
     * Splits a full name composed according to the Chinese tradition:
     * <pre>
     *   [family name [middle name]] given name
     * </pre>
     * A name written as a single word of two to four characters is split by position.
     */
    private void splitChineseName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (name.givenNames == null) {
                name.givenNames = token;
            } else if (name.familyName == null) {
                name.familyName = name.givenNames;
                name.givenNames = token;
            } else if (name.middleName == null) {
                name.middleName = name.givenNames;
                name.givenNames = token;
            } else {
                name.middleName = name.middleName + name.givenNames;
                name.givenNames = token;
            }
        }

        // If a single word parse that word up.
        if (name.givenNames != null && name.familyName == null && name.middleName == null) {
            // Work on the trimmed word so surrounding whitespace is neither counted nor
            // assigned as a name part, and count code points rather than chars so that
            // a supplementary-plane ideograph is never cut through its surrogate pair.
            final String word = fullName.trim();
            final int length = word.codePointCount(0, word.length());
            if (length == 2) {
                final int familyEnd = word.offsetByCodePoints(0, 1);
                name.familyName = word.substring(0, familyEnd);
                name.givenNames = word.substring(familyEnd);
            } else if (length == 3) {
                final int familyEnd = word.offsetByCodePoints(0, 1);
                final int middleEnd = word.offsetByCodePoints(familyEnd, 1);
                name.familyName = word.substring(0, familyEnd);
                name.middleName = word.substring(familyEnd, middleEnd);
                name.givenNames = word.substring(middleEnd);
            } else if (length == 4) {
                final int familyEnd = word.offsetByCodePoints(0, 2);
                final int middleEnd = word.offsetByCodePoints(familyEnd, 1);
                name.familyName = word.substring(0, familyEnd);
                name.middleName = word.substring(familyEnd, middleEnd);
                name.givenNames = word.substring(middleEnd);
            }

        }
    }

    /**
     * Splits a full name composed according to the Japanese tradition:
     * <pre>
     *   [family name] given name(s)
     * </pre>
     */
    private void splitJapaneseOrKoreanName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (name.givenNames == null) {
                name.givenNames = token;
            } else if (name.familyName == null) {
                name.familyName = name.givenNames;
                name.givenNames = token;
            } else {
                name.givenNames += " " + token;
            }
        }
    }

    /**
     * Concatenates components of a name according to the rules dictated by the name style.
     *
     * @param givenNameFirst is ignored for CJK display name styles
     */
    public String join(Name name, boolean givenNameFirst) {
        switch (name.fullNameStyle) {
            case FullNameStyle.CJK:
            case FullNameStyle.CHINESE:
            case FullNameStyle.KOREAN:
                return join(name.familyName, name.middleName, name.givenNames, name.suffix,
                        false, false, false);

            case FullNameStyle.JAPANESE:
                return join(name.familyName, name.middleName, name.givenNames, name.suffix,
                        true, false, false);

            default:
                if (givenNameFirst) {
                    return join(name.givenNames, name.middleName, name.familyName, name.suffix,
                            true, false, true);
                } else {
                    return join(name.familyName, name.givenNames, name.middleName, name.suffix,
                            true, true, true);
                }
        }
    }

    /**
     * Concatenates components of the phonetic name following the CJK tradition:
     * family name + middle name + given name(s).
     */
    public String joinPhoneticName(Name name) {
        return join(name.phoneticFamilyName, name.phoneticMiddleName,
                name.phoneticGivenName, null, true, false, false);
    }

    /**
     * Given a name in Chinese, returns a Pinyin representation: the non-empty
     * {@code target} strings of the tokens produced by {@link HanziToPinyin}, separated
     * by single spaces. Returns null if no tokens are produced.
     */
    public String convertHanziToPinyin(String name) {

        // TODO: move this code to HanziToPinyin and optimize
        ArrayList<Token> tokens = HanziToPinyin.getInstance().get(name);
        if (tokens != null) {
            int size = tokens.size();
            if (size != 0) {
                StringBuilder sb = new StringBuilder();
                for (int i = 0; i < size; i++) {
                    String pinyin = tokens.get(i).target;
                    if (!TextUtils.isEmpty(pinyin)) {
                        if (sb.length() != 0) {
                            sb.append(' ');
                        }
                        sb.append(pinyin);
                    }
                }
                return sb.toString();
            }
        }
        return null;
    }

    /**
     * Concatenates parts of a full name inserting spaces and commas as specified.
     * Empty or null parts are skipped. If exactly one part is present it is returned
     * as is (the suffix in its normalized form); if none is present, null is returned.
     */
    private String join(String part1, String part2, String part3, String suffix,
            boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) {
        boolean hasPart1 = !TextUtils.isEmpty(part1);
        boolean hasPart2 = !TextUtils.isEmpty(part2);
        boolean hasPart3 = !TextUtils.isEmpty(part3);
        boolean hasSuffix = !TextUtils.isEmpty(suffix);

        // First pass: detect the "zero or one part" case, which needs no StringBuilder.
        boolean isSingleWord = true;
        String singleWord = null;
        if (hasPart1) {
            singleWord = part1;
        }

        if (hasPart2) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = part2;
            }
        }

        if (hasPart3) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = part3;
            }
        }

        if (hasSuffix) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = normalizedSuffix(suffix);
            }
        }

        if (isSingleWord) {
            return singleWord;
        }

        // Second pass: at least two parts are present; join them with separators.
        StringBuilder sb = new StringBuilder();
        if (hasPart1) {
            sb.append(part1);
        }

        if (hasPart2) {
            if (hasPart1) {
                if (useCommaAfterPart1) {
                    sb.append(',');
                }
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(part2);
        }

        if (hasPart3) {
            if (hasPart1 || hasPart2) {
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(part3);
        }

        if (hasSuffix) {
            if (hasPart1 || hasPart2 || hasPart3) {
                if (useCommaAfterPart3) {
                    sb.append(',');
                }
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(normalizedSuffix(suffix));
        }

        return sb.toString();
    }

    /**
     * Puts a dot after the supplied suffix if that is the accepted form of the suffix,
     * e.g. "Jr." and "Sr.", but not "I", "II" and "III".
     */
    private String normalizedSuffix(String suffix) {
        int length = suffix.length();
        if (length == 0 || suffix.charAt(length - 1) == '.') {
            return suffix;
        }

        String withDot = suffix + '.';
        if (mSuffixesSet.contains(normalize(withDot))) {
            return withDot;
        } else {
            return suffix;
        }
    }

    /**
     * If the supplied name style is undefined, returns a default based on the language,
     * otherwise returns the supplied name style itself. The ambiguous CJK style is
     * resolved to Japanese or Korean for those languages and to Chinese otherwise.
     *
     * @param nameStyle See {@link FullNameStyle}.
     */
    public int getAdjustedFullNameStyle(int nameStyle) {
        if (nameStyle == FullNameStyle.UNDEFINED) {
            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.JAPANESE;
            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.KOREAN;
            } else if (CHINESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.CHINESE;
            } else {
                return FullNameStyle.WESTERN;
            }
        } else if (nameStyle == FullNameStyle.CJK) {
            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.JAPANESE;
            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.KOREAN;
            } else {
                return FullNameStyle.CHINESE;
            }
        }
        return nameStyle;
    }

    /**
     * Parses the first word from the name if it is a prefix.
     */
    private void parsePrefix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        String firstToken = tokens.mTokens[tokens.mStartPointer];
        if (mPrefixesSet.contains(normalize(firstToken))) {
            name.prefix = firstToken;
            tokens.mStartPointer++;
        }
    }

    /**
     * Parses the last word(s) from the name if it is a suffix.
     */
    private void parseSuffix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        String lastToken = tokens.mTokens[tokens.mEndPointer - 1];
        if (lastToken.length() > mMaxSuffixLength) {
            return;
        }

        String normalized = normalize(lastToken);
        if (mSuffixesSet.contains(normalized)) {
            name.suffix = lastToken;
            tokens.mEndPointer--;
            return;
        }

        // lastToken keeps the punctuation as typed; normalized always gets a dot so
        // that it can match dotted entries of the suffix set.
        if (tokens.hasDot(tokens.mEndPointer - 1)) {
            lastToken += '.';
        }
        normalized += ".";

        // Take care of suffixes like M.D. and D.D.S.
        int pos = tokens.mEndPointer - 1;
        while (normalized.length() <= mMaxSuffixLength) {

            if (mSuffixesSet.contains(normalized)) {
                name.suffix = lastToken;
                tokens.mEndPointer = pos;
                return;
            }

            if (pos == tokens.mStartPointer) {
                break;
            }

            // Prepend the previous token and try again with the longer candidate.
            pos--;
            if (tokens.hasDot(pos)) {
                lastToken = tokens.mTokens[pos] + "." + lastToken;
            } else {
                lastToken = tokens.mTokens[pos] + " " + lastToken;
            }

            normalized = normalize(tokens.mTokens[pos]) + "." + normalized;
        }
    }

    /**
     * Parses the family name: either the leading word(s) when followed by a comma,
     * or the last word, optionally preceded by a last name prefix.
     */
    private void parseLastName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        // If the first word is followed by a comma, assume that it's the family name
        if (tokens.hasComma(tokens.mStartPointer)) {
           name.familyName = tokens.mTokens[tokens.mStartPointer];
           tokens.mStartPointer++;
           return;
        }

        // If the second word is followed by a comma and the first word
        // is a last name prefix as in "de Sade" and "von Cliburn", treat
        // the first two words as the family name.
        if (tokens.mStartPointer + 1 < tokens.mEndPointer
                && tokens.hasComma(tokens.mStartPointer + 1)
                && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) {
            String familyNamePrefix = tokens.mTokens[tokens.mStartPointer];
            if (tokens.hasDot(tokens.mStartPointer)) {
                familyNamePrefix += '.';
            }
            name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1];
            tokens.mStartPointer += 2;
            return;
        }

        // Finally, assume that the last word is the last name
        name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
        tokens.mEndPointer--;

        // Take care of last names like "de Sade" and "von Cliburn"
        if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
            String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1];
            if (isFamilyNamePrefix(lastNamePrefix)) {
                if (tokens.hasDot(tokens.mEndPointer - 1)) {
                    lastNamePrefix += '.';
                }
                name.familyName = lastNamePrefix + " " + name.familyName;
                tokens.mEndPointer--;
            }
        }
    }

    /**
     * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de"
     */
    private boolean isFamilyNamePrefix(String word) {
        final String normalized = normalize(word);

        return mLastNamePrefixesSet.contains(normalized)
                || mLastNamePrefixesSet.contains(normalized + ".");
    }


    /**
     * Takes the last remaining word as the middle name when more than one word is
     * left, unless three or more words are left and the word before it is a
     * conjunction.
     */
    private void parseMiddleName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        if ((tokens.mEndPointer - tokens.mStartPointer) > 1) {
            if ((tokens.mEndPointer - tokens.mStartPointer) == 2
                    || !mConjunctions.contains(
                            normalize(tokens.mTokens[tokens.mEndPointer - 2]))) {
                name.middleName = tokens.mTokens[tokens.mEndPointer - 1];
                if (tokens.hasDot(tokens.mEndPointer - 1)) {
                    name.middleName += '.';
                }
                tokens.mEndPointer--;
            }
        }
    }

    /**
     * Assigns all remaining words as the given names. Several words are joined with
     * spaces, each keeping the dot that followed it; a single remaining word is
     * assigned without its dot.
     */
    private void parseGivenNames(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        if ((tokens.mEndPointer - tokens.mStartPointer) == 1) {
            name.givenNames = tokens.mTokens[tokens.mStartPointer];
        } else {
            StringBuilder sb = new StringBuilder();
            for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
                if (i != tokens.mStartPointer) {
                    sb.append(' ');
                }
                sb.append(tokens.mTokens[i]);
                if (tokens.hasDot(i)) {
                    sb.append('.');
                }
            }
            name.givenNames = sb.toString();
        }
    }

    /**
     * Makes the best guess at the expected full name style based on the character set
     * used in the supplied name.  If the phonetic name is also supplied, tries to
     * differentiate between Chinese, Japanese and Korean based on the alphabet used
     * for the phonetic name.
     */
    public void guessNameStyle(Name name) {
        guessFullNameStyle(name);
        if (FullNameStyle.CJK == name.fullNameStyle) {
            name.fullNameStyle = getAdjustedFullNameStyle(name.fullNameStyle);
        }
        guessPhoneticNameStyle(name);
        name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle,
                name.phoneticNameStyle);
    }

    /**
     * Updates the display name style according to the phonetic name style if we
     * were unsure about display name style based on the name components, but
     * phonetic name makes it more definitive.
     */
    public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) {
        if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
            if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) {
                if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) {
                    return FullNameStyle.JAPANESE;
                } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) {
                    return FullNameStyle.KOREAN;
                }
                if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) {
                    return FullNameStyle.CHINESE;
                }
            }
        }
        return nameStyle;
    }

    /**
     * Makes the best guess at the expected full name style based on the character set
     * used in the supplied name. Does nothing if the style is already defined.
     */
    private void guessFullNameStyle(NameSplitter.Name name) {
        if (name.fullNameStyle != FullNameStyle.UNDEFINED) {
            return;
        }

        int bestGuess = guessFullNameStyle(name.givenNames);
        // A mix of Hanzi and latin chars are common in China, so we have to go through all names
        // if the name is not JAPANESE or KOREAN.
        if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK
                && bestGuess != FullNameStyle.WESTERN) {
            name.fullNameStyle = bestGuess;
            return;
        }

        int guess = guessFullNameStyle(name.familyName);
        if (guess != FullNameStyle.UNDEFINED) {
            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
                name.fullNameStyle = guess;
                return;
            }
            bestGuess = guess;
        }

        guess = guessFullNameStyle(name.middleName);
        if (guess != FullNameStyle.UNDEFINED) {
            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
                name.fullNameStyle = guess;
                return;
            }
            bestGuess = guess;
        }

        name.fullNameStyle = bestGuess;
    }

    /**
     * Guesses the full name style from the letters of a single string.
     *
     * @param name the text to inspect, may be null
     * @return {@link FullNameStyle#UNDEFINED} if the text is null or contains no letters;
     *         JAPANESE or KOREAN as soon as a kana or Hangul letter is seen; the result of
     *         scanning the rest of the text for kana/Hangul (CJK if none is found) as soon
     *         as a CJK ideograph is seen; otherwise WESTERN
     */
    public int guessFullNameStyle(String name) {
        if (name == null) {
            return FullNameStyle.UNDEFINED;
        }

        int nameStyle = FullNameStyle.UNDEFINED;
        int length = name.length();
        int offset = 0;
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);

                if (!isLatinUnicodeBlock(unicodeBlock)) {

                    if (isCJKUnicodeBlock(unicodeBlock)) {
                        // We don't know if this is Chinese, Japanese or Korean -
                        // trying to figure out by looking at other characters in the name
                        return guessCJKNameStyle(name, offset + Character.charCount(codePoint));
                    }

                    if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                        return FullNameStyle.JAPANESE;
                    }

                    if (isKoreanUnicodeBlock(unicodeBlock)) {
                        return FullNameStyle.KOREAN;
                    }
                }
                // Latin letters, and letters of any other unrecognized script.
                nameStyle = FullNameStyle.WESTERN;
            }
            offset += Character.charCount(codePoint);
        }
        return nameStyle;
    }

    /**
     * Scans {@code name} from {@code offset} for a kana or Hangul letter that would
     * identify the name as Japanese or Korean; returns CJK if none is found.
     */
    private int guessCJKNameStyle(String name, int offset) {
        int length = name.length();
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                    return FullNameStyle.JAPANESE;
                }
                if (isKoreanUnicodeBlock(unicodeBlock)) {
                    return FullNameStyle.KOREAN;
                }
            }
            offset += Character.charCount(codePoint);
        }

        return FullNameStyle.CJK;
    }

    /**
     * Sets the phonetic name style to the first defined guess obtained from the
     * phonetic family, given and middle names, in that order. Does nothing if the
     * style is already defined; leaves it undefined if no component gives a guess.
     */
    private void guessPhoneticNameStyle(NameSplitter.Name name) {
        if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
            return;
        }

        // Every defined phonetic guess (PINYIN, JAPANESE, KOREAN) is decisive; there is
        // no ambiguous phonetic style comparable to FullNameStyle.CJK.
        int guess = guessPhoneticNameStyle(name.phoneticFamilyName);
        if (guess == PhoneticNameStyle.UNDEFINED) {
            guess = guessPhoneticNameStyle(name.phoneticGivenName);
        }
        if (guess == PhoneticNameStyle.UNDEFINED) {
            guess = guessPhoneticNameStyle(name.phoneticMiddleName);
        }

        name.phoneticNameStyle = guess;
    }

    /**
     * Guesses the phonetic name style from the first letter of {@code name} that
     * belongs to a kana, Hangul or Latin block.
     *
     * @param name the phonetic text to inspect, may be null
     * @return JAPANESE, KOREAN or PINYIN accordingly, or
     *         {@link PhoneticNameStyle#UNDEFINED} if the text is null or has no such letter
     */
    public int guessPhoneticNameStyle(String name) {
        if (name == null) {
            return PhoneticNameStyle.UNDEFINED;
        }

        int nameStyle = PhoneticNameStyle.UNDEFINED;
        int length = name.length();
        int offset = 0;
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.JAPANESE;
                }
                if (isKoreanUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.KOREAN;
                }
                if (isLatinUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.PINYIN;
                }
            }
            offset += Character.charCount(codePoint);
        }

        return nameStyle;
    }

    private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.BASIC_LATIN ||
                unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL;
    }

    private static boolean isCJKUnicodeBlock(UnicodeBlock block) {
        return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT
                || block == UnicodeBlock.CJK_COMPATIBILITY
                || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS
                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
    }

    private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES ||
                unicodeBlock == UnicodeBlock.HANGUL_JAMO ||
                unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
    }

    private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.KATAKANA ||
                unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS ||
                unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS ||
                unicodeBlock == UnicodeBlock.HIRAGANA;
    }
}