1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License
15 */
16package com.android.providers.contacts;
17
18import android.content.ContentValues;
19import android.provider.ContactsContract.CommonDataKinds.StructuredName;
20import android.provider.ContactsContract.FullNameStyle;
21import android.provider.ContactsContract.PhoneticNameStyle;
22import android.text.TextUtils;
23
24import com.android.providers.contacts.util.NeededForTesting;
25
26import java.lang.Character.UnicodeBlock;
27import java.util.HashSet;
28import java.util.Locale;
29import java.util.StringTokenizer;
30
31/**
32 * The purpose of this class is to split a full name into given names and last
33 * name. The logic only supports having a single last name. If the full name has
34 * multiple last names the output will be incorrect.
35 * <p>
36 * Core algorithm:
37 * <ol>
38 * <li>Remove the suffixes (III, Ph.D., M.D.).</li>
39 * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li>
40 * <li>Assign the last remaining token as the last name.</li>
41 * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use
42 * this word also as the last name.</li>
43 * <li>Assign the rest of the words as the "given names".</li>
44 * </ol>
45 */
46public class NameSplitter {
47
48    public static final int MAX_TOKENS = 10;
49
50    private static final String JAPANESE_LANGUAGE = Locale.JAPANESE.getLanguage().toLowerCase();
51    private static final String KOREAN_LANGUAGE = Locale.KOREAN.getLanguage().toLowerCase();
52
53    // This includes simplified and traditional Chinese
54    private static final String CHINESE_LANGUAGE = Locale.CHINESE.getLanguage().toLowerCase();
55
56    private final HashSet<String> mPrefixesSet;
57    private final HashSet<String> mSuffixesSet;
58    private final int mMaxSuffixLength;
59    private final HashSet<String> mLastNamePrefixesSet;
60    private final HashSet<String> mConjuctions;
61    private final Locale mLocale;
62    private final String mLanguage;
63
64    /**
65     * Two-Chracter long Korean family names.
66     * http://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1
67     */
68    private static final String[] KOREAN_TWO_CHARCTER_FAMILY_NAMES = {
69        "\uAC15\uC804", // Gang Jeon
70        "\uB0A8\uAD81", // Nam Goong
71        "\uB3C5\uACE0", // Dok Go
72        "\uB3D9\uBC29", // Dong Bang
73        "\uB9DD\uC808", // Mang Jeol
74        "\uC0AC\uACF5", // Sa Gong
75        "\uC11C\uBB38", // Seo Moon
76        "\uC120\uC6B0", // Seon Woo
77        "\uC18C\uBD09", // So Bong
78        "\uC5B4\uAE08", // Uh Geum
79        "\uC7A5\uACE1", // Jang Gok
80        "\uC81C\uAC08", // Je Gal
81        "\uD669\uBCF4"  // Hwang Bo
82    };
83
84    public static class Name {
85        public String prefix;
86        public String givenNames;
87        public String middleName;
88        public String familyName;
89        public String suffix;
90
91        public int fullNameStyle;
92
93        public String phoneticFamilyName;
94        public String phoneticMiddleName;
95        public String phoneticGivenName;
96
97        public int phoneticNameStyle;
98
99        public Name() {
100        }
101
102        public Name(String prefix, String givenNames, String middleName, String familyName,
103                String suffix) {
104            this.prefix = prefix;
105            this.givenNames = givenNames;
106            this.middleName = middleName;
107            this.familyName = familyName;
108            this.suffix = suffix;
109        }
110
111        @NeededForTesting
112        public String getPrefix() {
113            return prefix;
114        }
115
116        public String getGivenNames() {
117            return givenNames;
118        }
119
120        public String getMiddleName() {
121            return middleName;
122        }
123
124        public String getFamilyName() {
125            return familyName;
126        }
127
128        @NeededForTesting
129        public String getSuffix() {
130            return suffix;
131        }
132
133        public int getFullNameStyle() {
134            return fullNameStyle;
135        }
136
137        public String getPhoneticFamilyName() {
138            return phoneticFamilyName;
139        }
140
141        public String getPhoneticMiddleName() {
142            return phoneticMiddleName;
143        }
144
145        public String getPhoneticGivenName() {
146            return phoneticGivenName;
147        }
148
149        public int getPhoneticNameStyle() {
150            return phoneticNameStyle;
151        }
152
153        public void fromValues(ContentValues values) {
154            prefix = values.getAsString(StructuredName.PREFIX);
155            givenNames = values.getAsString(StructuredName.GIVEN_NAME);
156            middleName = values.getAsString(StructuredName.MIDDLE_NAME);
157            familyName = values.getAsString(StructuredName.FAMILY_NAME);
158            suffix = values.getAsString(StructuredName.SUFFIX);
159
160            Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE);
161            fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer;
162
163            phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME);
164            phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME);
165            phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME);
166
167            integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE);
168            phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer;
169        }
170
171        public void toValues(ContentValues values) {
172            putValueIfPresent(values, StructuredName.PREFIX, prefix);
173            putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames);
174            putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName);
175            putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName);
176            putValueIfPresent(values, StructuredName.SUFFIX, suffix);
177            values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle);
178            putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName);
179            putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName);
180            putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName);
181            values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle);
182        }
183
184        private void putValueIfPresent(ContentValues values, String name, String value) {
185            if (value != null) {
186                values.put(name, value);
187            }
188        }
189
190        public void clear() {
191            prefix = null;
192            givenNames = null;
193            middleName = null;
194            familyName = null;
195            suffix = null;
196            fullNameStyle = FullNameStyle.UNDEFINED;
197            phoneticFamilyName = null;
198            phoneticMiddleName = null;
199            phoneticGivenName = null;
200            phoneticNameStyle = PhoneticNameStyle.UNDEFINED;
201        }
202
203        public boolean isEmpty() {
204            return TextUtils.isEmpty(givenNames)
205                    && TextUtils.isEmpty(middleName)
206                    && TextUtils.isEmpty(familyName)
207                    && TextUtils.isEmpty(suffix)
208                    && TextUtils.isEmpty(phoneticFamilyName)
209                    && TextUtils.isEmpty(phoneticMiddleName)
210                    && TextUtils.isEmpty(phoneticGivenName);
211        }
212
213        @Override
214        public String toString() {
215            return "[prefix: " + prefix + " given: " + givenNames + " middle: " + middleName
216                    + " family: " + familyName + " suffix: " + suffix + " ph/given: "
217                    + phoneticGivenName + " ph/middle: " + phoneticMiddleName + " ph/family: "
218                    + phoneticFamilyName + "]";
219        }
220    }
221
222    private static class NameTokenizer extends StringTokenizer {
223        private final String[] mTokens;
224        private int mDotBitmask;
225        private int mCommaBitmask;
226        private int mStartPointer;
227        private int mEndPointer;
228
229        public NameTokenizer(String fullName) {
230            super(fullName, " .,", true);
231
232            mTokens = new String[MAX_TOKENS];
233
234            // Iterate over tokens, skipping over empty ones and marking tokens that
235            // are followed by dots.
236            while (hasMoreTokens() && mEndPointer < MAX_TOKENS) {
237                final String token = nextToken();
238                if (token.length() > 0) {
239                    final char c = token.charAt(0);
240                    if (c == ' ') {
241                        continue;
242                    }
243                }
244
245                if (mEndPointer > 0 && token.charAt(0) == '.') {
246                    mDotBitmask |= (1 << (mEndPointer - 1));
247                } else if (mEndPointer > 0 && token.charAt(0) == ',') {
248                    mCommaBitmask |= (1 << (mEndPointer - 1));
249                } else {
250                    mTokens[mEndPointer] = token;
251                    mEndPointer++;
252                }
253            }
254        }
255
256        /**
257         * Returns true if the token is followed by a dot in the original full name.
258         */
259        public boolean hasDot(int index) {
260            return (mDotBitmask & (1 << index)) != 0;
261        }
262
263        /**
264         * Returns true if the token is followed by a comma in the original full name.
265         */
266        public boolean hasComma(int index) {
267            return (mCommaBitmask & (1 << index)) != 0;
268        }
269    }
270
271    /**
272     * Constructor.
273     *
274     * @param commonPrefixes comma-separated list of common prefixes,
275     *            e.g. "Mr, Ms, Mrs"
276     * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
277     *            e.g. "d', st, st., von"
278     * @param commonSuffixes comma-separated list of common suffixes,
279     *            e.g. "Jr, M.D., MD, D.D.S."
280     * @param commonConjunctions comma-separated list of common conjuctions,
281     *            e.g. "AND, Or"
282     */
283    public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
284            String commonSuffixes, String commonConjunctions, Locale locale) {
285        // TODO: refactor this to use <string-array> resources
286        mPrefixesSet = convertToSet(commonPrefixes);
287        mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
288        mSuffixesSet = convertToSet(commonSuffixes);
289        mConjuctions = convertToSet(commonConjunctions);
290        mLocale = locale != null ? locale : Locale.getDefault();
291        mLanguage = mLocale.getLanguage().toLowerCase();
292
293        int maxLength = 0;
294        for (String suffix : mSuffixesSet) {
295            if (suffix.length() > maxLength) {
296                maxLength = suffix.length();
297            }
298        }
299
300        mMaxSuffixLength = maxLength;
301    }
302
303    /**
304     * Converts a comma-separated list of Strings to a set of Strings. Trims strings
305     * and converts them to upper case.
306     */
307    private static HashSet<String> convertToSet(String strings) {
308        HashSet<String> set = new HashSet<String>();
309        if (strings != null) {
310            String[] split = strings.split(",");
311            for (int i = 0; i < split.length; i++) {
312                set.add(split[i].trim().toUpperCase());
313            }
314        }
315        return set;
316    }
317
318    /**
319     * Parses a full name and returns components as a list of tokens.
320     */
321    public int tokenize(String[] tokens, String fullName) {
322        if (fullName == null) {
323            return 0;
324        }
325
326        NameTokenizer tokenizer = new NameTokenizer(fullName);
327
328        if (tokenizer.mStartPointer == tokenizer.mEndPointer) {
329            return 0;
330        }
331
332        String firstToken = tokenizer.mTokens[tokenizer.mStartPointer];
333        if (mPrefixesSet.contains(firstToken.toUpperCase())) {
334           tokenizer.mStartPointer++;
335        }
336        int count = 0;
337        for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) {
338            tokens[count++] = tokenizer.mTokens[i];
339        }
340
341        return count;
342    }
343
344
345    /**
346     * Parses a full name and returns parsed components in the Name object.
347     */
348    public void split(Name name, String fullName) {
349        if (fullName == null) {
350            return;
351        }
352
353        int fullNameStyle = guessFullNameStyle(fullName);
354        if (fullNameStyle == FullNameStyle.CJK) {
355            fullNameStyle = getAdjustedFullNameStyle(fullNameStyle);
356        }
357
358        split(name, fullName, fullNameStyle);
359    }
360
361    /**
362     * Parses a full name and returns parsed components in the Name object
363     * with a given fullNameStyle.
364     */
365    public void split(Name name, String fullName, int fullNameStyle) {
366        if (fullName == null) {
367            return;
368        }
369
370        name.fullNameStyle = fullNameStyle;
371
372        switch (fullNameStyle) {
373            case FullNameStyle.CHINESE:
374                splitChineseName(name, fullName);
375                break;
376
377            case FullNameStyle.JAPANESE:
378                splitJapaneseName(name, fullName);
379                break;
380
381            case FullNameStyle.KOREAN:
382                splitKoreanName(name, fullName);
383                break;
384
385            default:
386                splitWesternName(name, fullName);
387        }
388    }
389
390    /**
391     * Splits a full name composed according to the Western tradition:
392     * <pre>
393     *   [prefix] given name(s) [[middle name] family name] [, suffix]
394     *   [prefix] family name, given name [middle name] [,suffix]
395     * </pre>
396     */
397    private void splitWesternName(Name name, String fullName) {
398        NameTokenizer tokens = new NameTokenizer(fullName);
399        parsePrefix(name, tokens);
400
401        // If the name consists of just one or two tokens, treat them as first/last name,
402        // not as suffix.  Example: John Ma; Ma is last name, not "M.A.".
403        if (tokens.mEndPointer > 2) {
404            parseSuffix(name, tokens);
405        }
406
407        if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) {
408            name.givenNames = tokens.mTokens[tokens.mStartPointer];
409        } else {
410            parseLastName(name, tokens);
411            parseMiddleName(name, tokens);
412            parseGivenNames(name, tokens);
413        }
414    }
415
416    /**
417     * Splits a full name composed according to the Chinese tradition:
418     * <pre>
419     *   [family name [middle name]] given name
420     * </pre>
421     */
422    private void splitChineseName(Name name, String fullName) {
423        StringTokenizer tokenizer = new StringTokenizer(fullName);
424        while (tokenizer.hasMoreTokens()) {
425            String token = tokenizer.nextToken();
426            if (name.givenNames == null) {
427                name.givenNames = token;
428            } else if (name.familyName == null) {
429                name.familyName = name.givenNames;
430                name.givenNames = token;
431            } else if (name.middleName == null) {
432                name.middleName = name.givenNames;
433                name.givenNames = token;
434            } else {
435                name.middleName = name.middleName + name.givenNames;
436                name.givenNames = token;
437            }
438        }
439
440        // If a single word parse that word up.
441        if (name.givenNames != null && name.familyName == null && name.middleName == null) {
442            int length = fullName.length();
443            if (length == 2) {
444                name.familyName = fullName.substring(0, 1);
445                name.givenNames = fullName.substring(1);
446            } else if (length == 3) {
447                name.familyName = fullName.substring(0, 1);
448                name.middleName = fullName.substring(1, 2);
449                name.givenNames = fullName.substring(2);
450            } else if (length == 4) {
451                name.familyName = fullName.substring(0, 2);
452                name.middleName = fullName.substring(2, 3);
453                name.givenNames = fullName.substring(3);
454            }
455
456        }
457    }
458
459    /**
460     * Splits a full name composed according to the Japanese tradition:
461     * <pre>
462     *   [family name] given name(s)
463     * </pre>
464     */
465    private void splitJapaneseName(Name name, String fullName) {
466        StringTokenizer tokenizer = new StringTokenizer(fullName);
467        while (tokenizer.hasMoreTokens()) {
468            String token = tokenizer.nextToken();
469            if (name.givenNames == null) {
470                name.givenNames = token;
471            } else if (name.familyName == null) {
472                name.familyName = name.givenNames;
473                name.givenNames = token;
474            } else {
475                name.givenNames += " " + token;
476            }
477        }
478    }
479
480    /**
481     * Splits a full name composed according to the Korean tradition:
482     * <pre>
483     *   [family name] given name(s)
484     * </pre>
485     */
486    private void splitKoreanName(Name name, String fullName) {
487        StringTokenizer tokenizer = new StringTokenizer(fullName);
488        if (tokenizer.countTokens() > 1) {
489            // Each name can be identified by separators.
490            while (tokenizer.hasMoreTokens()) {
491                String token = tokenizer.nextToken();
492                if (name.givenNames == null) {
493                    name.givenNames = token;
494                } else if (name.familyName == null) {
495                    name.familyName = name.givenNames;
496                    name.givenNames = token;
497                } else {
498                    name.givenNames += " " + token;
499                }
500            }
501        } else {
502            // There is no separator. Try to guess family name.
503            // The length of most family names is 1.
504            int familyNameLength = 1;
505
506            // Compare with 2-length family names.
507            for (String twoLengthFamilyName : KOREAN_TWO_CHARCTER_FAMILY_NAMES) {
508                if (fullName.startsWith(twoLengthFamilyName)) {
509                    familyNameLength = 2;
510                    break;
511                }
512            }
513
514            name.familyName = fullName.substring(0, familyNameLength);
515            if (fullName.length() > familyNameLength) {
516                name.givenNames = fullName.substring(familyNameLength);
517            }
518        }
519    }
520
521    /**
522     * Concatenates components of a name according to the rules dictated by the name style.
523     *
524     * @param givenNameFirst is ignored for CJK display name styles
525     */
526    public String join(Name name, boolean givenNameFirst, boolean includePrefix) {
527        String prefix = includePrefix ? name.prefix : null;
528        switch (name.fullNameStyle) {
529            case FullNameStyle.CJK:
530            case FullNameStyle.CHINESE:
531            case FullNameStyle.KOREAN:
532                return join(prefix, name.familyName, name.middleName, name.givenNames,
533                        name.suffix, false, false, false);
534
535            case FullNameStyle.JAPANESE:
536                return join(prefix, name.familyName, name.middleName, name.givenNames,
537                        name.suffix, true, false, false);
538
539            default:
540                if (givenNameFirst) {
541                    return join(prefix, name.givenNames, name.middleName, name.familyName,
542                            name.suffix, true, false, true);
543                } else {
544                    return join(prefix, name.familyName, name.givenNames, name.middleName,
545                            name.suffix, true, true, true);
546                }
547        }
548    }
549
550    /**
551     * Concatenates components of the phonetic name following the CJK tradition:
552     * family name + middle name + given name(s).
553     */
554    public String joinPhoneticName(Name name) {
555        return join(null, name.phoneticFamilyName,
556                name.phoneticMiddleName, name.phoneticGivenName, null, true, false, false);
557    }
558
559    /**
560     * Concatenates parts of a full name inserting spaces and commas as specified.
561     */
562    private String join(String prefix, String part1, String part2, String part3, String suffix,
563            boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) {
564        prefix = prefix == null ? null: prefix.trim();
565        part1 = part1 == null ? null: part1.trim();
566        part2 = part2 == null ? null: part2.trim();
567        part3 = part3 == null ? null: part3.trim();
568        suffix = suffix == null ? null: suffix.trim();
569
570        boolean hasPrefix = !TextUtils.isEmpty(prefix);
571        boolean hasPart1 = !TextUtils.isEmpty(part1);
572        boolean hasPart2 = !TextUtils.isEmpty(part2);
573        boolean hasPart3 = !TextUtils.isEmpty(part3);
574        boolean hasSuffix = !TextUtils.isEmpty(suffix);
575
576        boolean isSingleWord = true;
577        String singleWord = null;
578
579        if (hasPrefix) {
580            singleWord = prefix;
581        }
582
583        if (hasPart1) {
584            if (singleWord != null) {
585                isSingleWord = false;
586            } else {
587                singleWord = part1;
588            }
589        }
590
591        if (hasPart2) {
592            if (singleWord != null) {
593                isSingleWord = false;
594            } else {
595                singleWord = part2;
596            }
597        }
598
599        if (hasPart3) {
600            if (singleWord != null) {
601                isSingleWord = false;
602            } else {
603                singleWord = part3;
604            }
605        }
606
607        if (hasSuffix) {
608            if (singleWord != null) {
609                isSingleWord = false;
610            } else {
611                singleWord = normalizedSuffix(suffix);
612            }
613        }
614
615        if (isSingleWord) {
616            return singleWord;
617        }
618
619        StringBuilder sb = new StringBuilder();
620
621        if (hasPrefix) {
622            sb.append(prefix);
623        }
624
625        if (hasPart1) {
626            if (hasPrefix) {
627                sb.append(' ');
628            }
629            sb.append(part1);
630        }
631
632        if (hasPart2) {
633            if (hasPrefix || hasPart1) {
634                if (useCommaAfterPart1) {
635                    sb.append(',');
636                }
637                if (useSpace) {
638                    sb.append(' ');
639                }
640            }
641            sb.append(part2);
642        }
643
644        if (hasPart3) {
645            if (hasPrefix || hasPart1 || hasPart2) {
646                if (useSpace) {
647                    sb.append(' ');
648                }
649            }
650            sb.append(part3);
651        }
652
653        if (hasSuffix) {
654            if (hasPrefix || hasPart1 || hasPart2 || hasPart3) {
655                if (useCommaAfterPart3) {
656                    sb.append(',');
657                }
658                if (useSpace) {
659                    sb.append(' ');
660                }
661            }
662            sb.append(normalizedSuffix(suffix));
663        }
664
665        return sb.toString();
666    }
667
668    /**
669     * Puts a dot after the supplied suffix if that is the accepted form of the suffix,
670     * e.g. "Jr." and "Sr.", but not "I", "II" and "III".
671     */
672    private String normalizedSuffix(String suffix) {
673        int length = suffix.length();
674        if (length == 0 || suffix.charAt(length - 1) == '.') {
675            return suffix;
676        }
677
678        String withDot = suffix + '.';
679        if (mSuffixesSet.contains(withDot.toUpperCase())) {
680            return withDot;
681        } else {
682            return suffix;
683        }
684    }
685
686    /**
687     * If the supplied name style is undefined, returns a default based on the language,
688     * otherwise returns the supplied name style itself.
689     *
690     * @param nameStyle See {@link FullNameStyle}.
691     */
692    public int getAdjustedFullNameStyle(int nameStyle) {
693        if (nameStyle == FullNameStyle.UNDEFINED) {
694            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
695                return FullNameStyle.JAPANESE;
696            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
697                return FullNameStyle.KOREAN;
698            } else if (CHINESE_LANGUAGE.equals(mLanguage)) {
699                return FullNameStyle.CHINESE;
700            } else {
701                return FullNameStyle.WESTERN;
702            }
703        } else if (nameStyle == FullNameStyle.CJK) {
704            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
705                return FullNameStyle.JAPANESE;
706            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
707                return FullNameStyle.KOREAN;
708            } else {
709                return FullNameStyle.CHINESE;
710            }
711        }
712        return nameStyle;
713    }
714
715    /**
716     * Parses the first word from the name if it is a prefix.
717     */
718    private void parsePrefix(Name name, NameTokenizer tokens) {
719        if (tokens.mStartPointer == tokens.mEndPointer) {
720            return;
721        }
722
723        String firstToken = tokens.mTokens[tokens.mStartPointer];
724        if (mPrefixesSet.contains(firstToken.toUpperCase())) {
725            if (tokens.hasDot(tokens.mStartPointer)) {
726                firstToken += '.';
727            }
728            name.prefix = firstToken;
729            tokens.mStartPointer++;
730        }
731    }
732
733    /**
734     * Parses the last word(s) from the name if it is a suffix.
735     */
736    private void parseSuffix(Name name, NameTokenizer tokens) {
737        if (tokens.mStartPointer == tokens.mEndPointer) {
738            return;
739        }
740
741        String lastToken = tokens.mTokens[tokens.mEndPointer - 1];
742
743        // Take care of an explicit comma-separated suffix
744        if (tokens.mEndPointer - tokens.mStartPointer > 2
745                && tokens.hasComma(tokens.mEndPointer - 2)) {
746            if (tokens.hasDot(tokens.mEndPointer - 1)) {
747                lastToken += '.';
748            }
749            name.suffix = lastToken;
750            tokens.mEndPointer--;
751            return;
752        }
753
754        if (lastToken.length() > mMaxSuffixLength) {
755            return;
756        }
757
758        String normalized = lastToken.toUpperCase();
759        if (mSuffixesSet.contains(normalized)) {
760            name.suffix = lastToken;
761            tokens.mEndPointer--;
762            return;
763        }
764
765        if (tokens.hasDot(tokens.mEndPointer - 1)) {
766            lastToken += '.';
767        }
768        normalized += ".";
769
770        // Take care of suffixes like M.D. and D.D.S.
771        int pos = tokens.mEndPointer - 1;
772        while (normalized.length() <= mMaxSuffixLength) {
773
774            if (mSuffixesSet.contains(normalized)) {
775                name.suffix = lastToken;
776                tokens.mEndPointer = pos;
777                return;
778            }
779
780            if (pos == tokens.mStartPointer) {
781                break;
782            }
783
784            pos--;
785            if (tokens.hasDot(pos)) {
786                lastToken = tokens.mTokens[pos] + "." + lastToken;
787            } else {
788                lastToken = tokens.mTokens[pos] + " " + lastToken;
789            }
790
791            normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized;
792        }
793    }
794
795    private void parseLastName(Name name, NameTokenizer tokens) {
796        if (tokens.mStartPointer == tokens.mEndPointer) {
797            return;
798        }
799
800        // If the first word is followed by a comma, assume that it's the family name
801        if (tokens.hasComma(tokens.mStartPointer)) {
802           name.familyName = tokens.mTokens[tokens.mStartPointer];
803           tokens.mStartPointer++;
804           return;
805        }
806
807        // If the second word is followed by a comma and the first word
808        // is a last name prefix as in "de Sade" and "von Cliburn", treat
809        // the first two words as the family name.
810        if (tokens.mStartPointer + 1 < tokens.mEndPointer
811                && tokens.hasComma(tokens.mStartPointer + 1)
812                && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) {
813            String familyNamePrefix = tokens.mTokens[tokens.mStartPointer];
814            if (tokens.hasDot(tokens.mStartPointer)) {
815                familyNamePrefix += '.';
816            }
817            name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1];
818            tokens.mStartPointer += 2;
819            return;
820        }
821
822        // Finally, assume that the last word is the last name
823        name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
824        tokens.mEndPointer--;
825
826        // Take care of last names like "de Sade" and "von Cliburn"
827        if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
828            String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1];
829            if (isFamilyNamePrefix(lastNamePrefix)) {
830                if (tokens.hasDot(tokens.mEndPointer - 1)) {
831                    lastNamePrefix += '.';
832                }
833                name.familyName = lastNamePrefix + " " + name.familyName;
834                tokens.mEndPointer--;
835            }
836        }
837    }
838
839    /**
840     * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de"
841     */
842    private boolean isFamilyNamePrefix(String word) {
843        final String normalized = word.toUpperCase();
844
845        return mLastNamePrefixesSet.contains(normalized)
846                || mLastNamePrefixesSet.contains(normalized + ".");
847    }
848
849
850    private void parseMiddleName(Name name, NameTokenizer tokens) {
851        if (tokens.mStartPointer == tokens.mEndPointer) {
852            return;
853        }
854
855        if ((tokens.mEndPointer - tokens.mStartPointer) > 1) {
856            if ((tokens.mEndPointer - tokens.mStartPointer) == 2
857                    || !mConjuctions.contains(tokens.mTokens[tokens.mEndPointer - 2].
858                            toUpperCase())) {
859                name.middleName = tokens.mTokens[tokens.mEndPointer - 1];
860                if (tokens.hasDot(tokens.mEndPointer - 1)) {
861                    name.middleName += '.';
862                }
863                tokens.mEndPointer--;
864            }
865        }
866    }
867
868    private void parseGivenNames(Name name, NameTokenizer tokens) {
869        if (tokens.mStartPointer == tokens.mEndPointer) {
870            return;
871        }
872
873        if ((tokens.mEndPointer - tokens.mStartPointer) == 1) {
874            name.givenNames = tokens.mTokens[tokens.mStartPointer];
875        } else {
876            StringBuilder sb = new StringBuilder();
877            for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
878                if (i != tokens.mStartPointer) {
879                    sb.append(' ');
880                }
881                sb.append(tokens.mTokens[i]);
882                if (tokens.hasDot(i)) {
883                    sb.append('.');
884                }
885            }
886            name.givenNames = sb.toString();
887        }
888    }
889
890    /**
891     * Makes the best guess at the expected full name style based on the character set
892     * used in the supplied name.  If the phonetic name is also supplied, tries to
893     * differentiate between Chinese, Japanese and Korean based on the alphabet used
894     * for the phonetic name.
895     */
896    public void guessNameStyle(Name name) {
897        guessFullNameStyle(name);
898        guessPhoneticNameStyle(name);
899        name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle,
900                name.phoneticNameStyle);
901    }
902
903    /**
904     * Updates the display name style according to the phonetic name style if we
905     * were unsure about display name style based on the name components, but
906     * phonetic name makes it more definitive.
907     */
908    public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) {
909        if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
910            if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) {
911                if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) {
912                    return FullNameStyle.JAPANESE;
913                } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) {
914                    return FullNameStyle.KOREAN;
915                }
916                if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) {
917                    return FullNameStyle.CHINESE;
918                }
919            }
920        }
921        return nameStyle;
922    }
923
924    /**
925     * Makes the best guess at the expected full name style based on the character set
926     * used in the supplied name.
927     */
928    private void guessFullNameStyle(NameSplitter.Name name) {
929        if (name.fullNameStyle != FullNameStyle.UNDEFINED) {
930            return;
931        }
932
933        int bestGuess = guessFullNameStyle(name.givenNames);
934        // A mix of Hanzi and latin chars are common in China, so we have to go through all names
935        // if the name is not JANPANESE or KOREAN.
936        if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK
937                && bestGuess != FullNameStyle.WESTERN) {
938            name.fullNameStyle = bestGuess;
939            return;
940        }
941
942        int guess = guessFullNameStyle(name.familyName);
943        if (guess != FullNameStyle.UNDEFINED) {
944            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
945                name.fullNameStyle = guess;
946                return;
947            }
948            bestGuess = guess;
949        }
950
951        guess = guessFullNameStyle(name.middleName);
952        if (guess != FullNameStyle.UNDEFINED) {
953            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
954                name.fullNameStyle = guess;
955                return;
956            }
957            bestGuess = guess;
958        }
959
960        guess = guessFullNameStyle(name.prefix);
961        if (guess != FullNameStyle.UNDEFINED) {
962            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
963                name.fullNameStyle = guess;
964                return;
965            }
966            bestGuess = guess;
967        }
968
969        guess = guessFullNameStyle(name.suffix);
970        if (guess != FullNameStyle.UNDEFINED) {
971            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
972                name.fullNameStyle = guess;
973                return;
974            }
975            bestGuess = guess;
976        }
977
978        name.fullNameStyle = bestGuess;
979    }
980
981    public int guessFullNameStyle(String name) {
982        if (name == null) {
983            return FullNameStyle.UNDEFINED;
984        }
985
986        int nameStyle = FullNameStyle.UNDEFINED;
987        int length = name.length();
988        int offset = 0;
989        while (offset < length) {
990            int codePoint = Character.codePointAt(name, offset);
991            if (Character.isLetter(codePoint)) {
992                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
993
994                if (!isLatinUnicodeBlock(unicodeBlock)) {
995
996                    if (isCJKUnicodeBlock(unicodeBlock)) {
997                        // We don't know if this is Chinese, Japanese or Korean -
998                        // trying to figure out by looking at other characters in the name
999                        return guessCJKNameStyle(name, offset + Character.charCount(codePoint));
1000                    }
1001
1002                    if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
1003                        return FullNameStyle.JAPANESE;
1004                    }
1005
1006                    if (isKoreanUnicodeBlock(unicodeBlock)) {
1007                        return FullNameStyle.KOREAN;
1008                    }
1009                }
1010                nameStyle = FullNameStyle.WESTERN;
1011            }
1012            offset += Character.charCount(codePoint);
1013        }
1014        return nameStyle;
1015    }
1016
1017    private int guessCJKNameStyle(String name, int offset) {
1018        int length = name.length();
1019        while (offset < length) {
1020            int codePoint = Character.codePointAt(name, offset);
1021            if (Character.isLetter(codePoint)) {
1022                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
1023                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
1024                    return FullNameStyle.JAPANESE;
1025                }
1026                if (isKoreanUnicodeBlock(unicodeBlock)) {
1027                    return FullNameStyle.KOREAN;
1028                }
1029            }
1030            offset += Character.charCount(codePoint);
1031        }
1032
1033        return FullNameStyle.CJK;
1034    }
1035
1036    private void guessPhoneticNameStyle(NameSplitter.Name name) {
1037        if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
1038            return;
1039        }
1040
1041        int bestGuess = guessPhoneticNameStyle(name.phoneticFamilyName);
1042        if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK) {
1043            name.phoneticNameStyle = bestGuess;
1044            return;
1045        }
1046
1047        int guess = guessPhoneticNameStyle(name.phoneticGivenName);
1048        if (guess != FullNameStyle.UNDEFINED) {
1049            if (guess != FullNameStyle.CJK) {
1050                name.phoneticNameStyle = guess;
1051                return;
1052            }
1053            bestGuess = guess;
1054        }
1055
1056        guess = guessPhoneticNameStyle(name.phoneticMiddleName);
1057        if (guess != FullNameStyle.UNDEFINED) {
1058            if (guess != FullNameStyle.CJK) {
1059                name.phoneticNameStyle = guess;
1060                return;
1061            }
1062            bestGuess = guess;
1063        }
1064    }
1065
1066    public int guessPhoneticNameStyle(String name) {
1067        if (name == null) {
1068            return PhoneticNameStyle.UNDEFINED;
1069        }
1070
1071        int nameStyle = PhoneticNameStyle.UNDEFINED;
1072        int length = name.length();
1073        int offset = 0;
1074        while (offset < length) {
1075            int codePoint = Character.codePointAt(name, offset);
1076            if (Character.isLetter(codePoint)) {
1077                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
1078                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
1079                    return PhoneticNameStyle.JAPANESE;
1080                }
1081                if (isKoreanUnicodeBlock(unicodeBlock)) {
1082                    return PhoneticNameStyle.KOREAN;
1083                }
1084                if (isLatinUnicodeBlock(unicodeBlock)) {
1085                    return PhoneticNameStyle.PINYIN;
1086                }
1087            }
1088            offset += Character.charCount(codePoint);
1089        }
1090
1091        return nameStyle;
1092    }
1093
1094    private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) {
1095        return unicodeBlock == UnicodeBlock.BASIC_LATIN ||
1096                unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT ||
1097                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A ||
1098                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B ||
1099                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL;
1100    }
1101
1102    private static boolean isCJKUnicodeBlock(UnicodeBlock block) {
1103        return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
1104                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
1105                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
1106                || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
1107                || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT
1108                || block == UnicodeBlock.CJK_COMPATIBILITY
1109                || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS
1110                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
1111                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
1112    }
1113
1114    private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) {
1115        return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES ||
1116                unicodeBlock == UnicodeBlock.HANGUL_JAMO ||
1117                unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
1118    }
1119
1120    private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) {
1121        return unicodeBlock == UnicodeBlock.KATAKANA ||
1122                unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS ||
1123                unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS ||
1124                unicodeBlock == UnicodeBlock.HIRAGANA;
1125    }
1126}
1127