1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License
15 */
16package com.android.providers.contacts;
17
18import android.content.ContentValues;
19import android.provider.ContactsContract.CommonDataKinds.StructuredName;
20import android.provider.ContactsContract.FullNameStyle;
21import android.provider.ContactsContract.PhoneticNameStyle;
22import android.text.TextUtils;
23
24import com.android.providers.contacts.util.NeededForTesting;
25
26import java.lang.Character.UnicodeBlock;
27import java.util.HashSet;
28import java.util.Locale;
29import java.util.StringTokenizer;
30
31/**
32 * The purpose of this class is to split a full name into given names and last
33 * name. The logic only supports having a single last name. If the full name has
34 * multiple last names the output will be incorrect.
35 * <p>
36 * Core algorithm:
37 * <ol>
38 * <li>Remove the suffixes (III, Ph.D., M.D.).</li>
39 * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li>
40 * <li>Assign the last remaining token as the last name.</li>
41 * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use
42 * this word also as the last name.</li>
43 * <li>Assign the rest of the words as the "given names".</li>
44 * </ol>
45 */
46public class NameSplitter {
47
48    public static final int MAX_TOKENS = 10;
49
50    private static final String JAPANESE_LANGUAGE = Locale.JAPANESE.getLanguage().toLowerCase();
51    private static final String KOREAN_LANGUAGE = Locale.KOREAN.getLanguage().toLowerCase();
52
53    // This includes simplified and traditional Chinese
54    private static final String CHINESE_LANGUAGE = Locale.CHINESE.getLanguage().toLowerCase();
55
56    private final HashSet<String> mPrefixesSet;
57    private final HashSet<String> mSuffixesSet;
58    private final int mMaxSuffixLength;
59    private final HashSet<String> mLastNamePrefixesSet;
60    private final HashSet<String> mConjuctions;
61    private final Locale mLocale;
62    private final String mLanguage;
63
64    /**
65     * Two-Chracter long Korean family names.
66     * http://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1
67     */
68    private static final String[] KOREAN_TWO_CHARCTER_FAMILY_NAMES = {
69        "\uAC15\uC804", // Gang Jeon
70        "\uB0A8\uAD81", // Nam Goong
71        "\uB3C5\uACE0", // Dok Go
72        "\uB3D9\uBC29", // Dong Bang
73        "\uB9DD\uC808", // Mang Jeol
74        "\uC0AC\uACF5", // Sa Gong
75        "\uC11C\uBB38", // Seo Moon
76        "\uC120\uC6B0", // Seon Woo
77        "\uC18C\uBD09", // So Bong
78        "\uC5B4\uAE08", // Uh Geum
79        "\uC7A5\uACE1", // Jang Gok
80        "\uC81C\uAC08", // Je Gal
81        "\uD669\uBCF4"  // Hwang Bo
82    };
83
84    public static class Name {
85        public String prefix;
86        public String givenNames;
87        public String middleName;
88        public String familyName;
89        public String suffix;
90
91        public int fullNameStyle;
92
93        public String phoneticFamilyName;
94        public String phoneticMiddleName;
95        public String phoneticGivenName;
96
97        public int phoneticNameStyle;
98
99        public Name() {
100        }
101
102        public Name(String prefix, String givenNames, String middleName, String familyName,
103                String suffix) {
104            this.prefix = prefix;
105            this.givenNames = givenNames;
106            this.middleName = middleName;
107            this.familyName = familyName;
108            this.suffix = suffix;
109        }
110
111        @NeededForTesting
112        public String getPrefix() {
113            return prefix;
114        }
115
116        public String getGivenNames() {
117            return givenNames;
118        }
119
120        public String getMiddleName() {
121            return middleName;
122        }
123
124        public String getFamilyName() {
125            return familyName;
126        }
127
128        @NeededForTesting
129        public String getSuffix() {
130            return suffix;
131        }
132
133        public int getFullNameStyle() {
134            return fullNameStyle;
135        }
136
137        public String getPhoneticFamilyName() {
138            return phoneticFamilyName;
139        }
140
141        public String getPhoneticMiddleName() {
142            return phoneticMiddleName;
143        }
144
145        public String getPhoneticGivenName() {
146            return phoneticGivenName;
147        }
148
149        public int getPhoneticNameStyle() {
150            return phoneticNameStyle;
151        }
152
153        public void fromValues(ContentValues values) {
154            prefix = values.getAsString(StructuredName.PREFIX);
155            givenNames = values.getAsString(StructuredName.GIVEN_NAME);
156            middleName = values.getAsString(StructuredName.MIDDLE_NAME);
157            familyName = values.getAsString(StructuredName.FAMILY_NAME);
158            suffix = values.getAsString(StructuredName.SUFFIX);
159
160            Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE);
161            fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer;
162
163            phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME);
164            phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME);
165            phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME);
166
167            integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE);
168            phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer;
169        }
170
171        public void toValues(ContentValues values) {
172            putValueIfPresent(values, StructuredName.PREFIX, prefix);
173            putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames);
174            putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName);
175            putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName);
176            putValueIfPresent(values, StructuredName.SUFFIX, suffix);
177            values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle);
178            putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName);
179            putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName);
180            putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName);
181            values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle);
182        }
183
184        private void putValueIfPresent(ContentValues values, String name, String value) {
185            if (value != null) {
186                values.put(name, value);
187            }
188        }
189
190        public void clear() {
191            prefix = null;
192            givenNames = null;
193            middleName = null;
194            familyName = null;
195            suffix = null;
196            fullNameStyle = FullNameStyle.UNDEFINED;
197            phoneticFamilyName = null;
198            phoneticMiddleName = null;
199            phoneticGivenName = null;
200            phoneticNameStyle = PhoneticNameStyle.UNDEFINED;
201        }
202
203        public boolean isEmpty() {
204            return TextUtils.isEmpty(givenNames)
205                    && TextUtils.isEmpty(middleName)
206                    && TextUtils.isEmpty(familyName)
207                    && TextUtils.isEmpty(suffix)
208                    && TextUtils.isEmpty(phoneticFamilyName)
209                    && TextUtils.isEmpty(phoneticMiddleName)
210                    && TextUtils.isEmpty(phoneticGivenName);
211        }
212
213        @Override
214        public String toString() {
215            return "[prefix: " + prefix + " given: " + givenNames + " middle: " + middleName
216                    + " family: " + familyName + " suffix: " + suffix + " ph/given: "
217                    + phoneticGivenName + " ph/middle: " + phoneticMiddleName + " ph/family: "
218                    + phoneticFamilyName + "]";
219        }
220    }
221
222    private static class NameTokenizer extends StringTokenizer {
223        private final String[] mTokens;
224        private int mDotBitmask;
225        private int mCommaBitmask;
226        private int mStartPointer;
227        private int mEndPointer;
228
229        public NameTokenizer(String fullName) {
230            super(fullName, " .,", true);
231
232            mTokens = new String[MAX_TOKENS];
233
234            // Iterate over tokens, skipping over empty ones and marking tokens that
235            // are followed by dots.
236            while (hasMoreTokens() && mEndPointer < MAX_TOKENS) {
237                final String token = nextToken();
238                if (token.length() > 0) {
239                    final char c = token.charAt(0);
240                    if (c == ' ') {
241                        continue;
242                    }
243                }
244
245                if (mEndPointer > 0 && token.charAt(0) == '.') {
246                    mDotBitmask |= (1 << (mEndPointer - 1));
247                } else if (mEndPointer > 0 && token.charAt(0) == ',') {
248                    mCommaBitmask |= (1 << (mEndPointer - 1));
249                } else {
250                    mTokens[mEndPointer] = token;
251                    mEndPointer++;
252                }
253            }
254        }
255
256        /**
257         * Returns true if the token is followed by a dot in the original full name.
258         */
259        public boolean hasDot(int index) {
260            return (mDotBitmask & (1 << index)) != 0;
261        }
262
263        /**
264         * Returns true if the token is followed by a comma in the original full name.
265         */
266        public boolean hasComma(int index) {
267            return (mCommaBitmask & (1 << index)) != 0;
268        }
269    }
270
271    /**
272     * Constructor.
273     *
274     * @param commonPrefixes comma-separated list of common prefixes,
275     *            e.g. "Mr, Ms, Mrs"
276     * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
277     *            e.g. "d', st, st., von"
278     * @param commonSuffixes comma-separated list of common suffixes,
279     *            e.g. "Jr, M.D., MD, D.D.S."
280     * @param commonConjunctions comma-separated list of common conjuctions,
281     *            e.g. "AND, Or"
282     */
283    public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
284            String commonSuffixes, String commonConjunctions, Locale locale) {
285        // TODO: refactor this to use <string-array> resources
286        mPrefixesSet = convertToSet(commonPrefixes);
287        mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
288        mSuffixesSet = convertToSet(commonSuffixes);
289        mConjuctions = convertToSet(commonConjunctions);
290        mLocale = locale != null ? locale : Locale.getDefault();
291        mLanguage = mLocale.getLanguage().toLowerCase();
292
293        int maxLength = 0;
294        for (String suffix : mSuffixesSet) {
295            if (suffix.length() > maxLength) {
296                maxLength = suffix.length();
297            }
298        }
299
300        mMaxSuffixLength = maxLength;
301    }
302
303    /**
304     * Converts a comma-separated list of Strings to a set of Strings. Trims strings
305     * and converts them to upper case.
306     */
307    private static HashSet<String> convertToSet(String strings) {
308        HashSet<String> set = new HashSet<String>();
309        if (strings != null) {
310            String[] split = strings.split(",");
311            for (int i = 0; i < split.length; i++) {
312                set.add(split[i].trim().toUpperCase());
313            }
314        }
315        return set;
316    }
317
318    /**
319     * Parses a full name and returns components as a list of tokens.
320     */
321    public int tokenize(String[] tokens, String fullName) {
322        if (fullName == null) {
323            return 0;
324        }
325
326        NameTokenizer tokenizer = new NameTokenizer(fullName);
327
328        if (tokenizer.mStartPointer == tokenizer.mEndPointer) {
329            return 0;
330        }
331
332        String firstToken = tokenizer.mTokens[tokenizer.mStartPointer];
333        int count = 0;
334        for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) {
335            tokens[count++] = tokenizer.mTokens[i];
336        }
337
338        return count;
339    }
340
341
342    /**
343     * Parses a full name and returns parsed components in the Name object.
344     */
345    public void split(Name name, String fullName) {
346        if (fullName == null) {
347            return;
348        }
349
350        int fullNameStyle = guessFullNameStyle(fullName);
351        if (fullNameStyle == FullNameStyle.CJK) {
352            fullNameStyle = getAdjustedFullNameStyle(fullNameStyle);
353        }
354
355        split(name, fullName, fullNameStyle);
356    }
357
358    /**
359     * Parses a full name and returns parsed components in the Name object
360     * with a given fullNameStyle.
361     */
362    public void split(Name name, String fullName, int fullNameStyle) {
363        if (fullName == null) {
364            return;
365        }
366
367        name.fullNameStyle = fullNameStyle;
368
369        switch (fullNameStyle) {
370            case FullNameStyle.CHINESE:
371                splitChineseName(name, fullName);
372                break;
373
374            case FullNameStyle.JAPANESE:
375                splitJapaneseName(name, fullName);
376                break;
377
378            case FullNameStyle.KOREAN:
379                splitKoreanName(name, fullName);
380                break;
381
382            default:
383                splitWesternName(name, fullName);
384        }
385    }
386
387    /**
388     * Splits a full name composed according to the Western tradition:
389     * <pre>
390     *   [prefix] given name(s) [[middle name] family name] [, suffix]
391     *   [prefix] family name, given name [middle name] [,suffix]
392     * </pre>
393     */
394    private void splitWesternName(Name name, String fullName) {
395        NameTokenizer tokens = new NameTokenizer(fullName);
396        parsePrefix(name, tokens);
397
398        // If the name consists of just one or two tokens, treat them as first/last name,
399        // not as suffix.  Example: John Ma; Ma is last name, not "M.A.".
400        if (tokens.mEndPointer > 2) {
401            parseSuffix(name, tokens);
402        }
403
404        if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) {
405            name.givenNames = tokens.mTokens[tokens.mStartPointer];
406        } else {
407            parseLastName(name, tokens);
408            parseMiddleName(name, tokens);
409            parseGivenNames(name, tokens);
410        }
411    }
412
413    /**
414     * Splits a full name composed according to the Chinese tradition:
415     * <pre>
416     *   [family name [middle name]] given name
417     * </pre>
418     */
419    private void splitChineseName(Name name, String fullName) {
420        StringTokenizer tokenizer = new StringTokenizer(fullName);
421        while (tokenizer.hasMoreTokens()) {
422            String token = tokenizer.nextToken();
423            if (name.givenNames == null) {
424                name.givenNames = token;
425            } else if (name.familyName == null) {
426                name.familyName = name.givenNames;
427                name.givenNames = token;
428            } else if (name.middleName == null) {
429                name.middleName = name.givenNames;
430                name.givenNames = token;
431            } else {
432                name.middleName = name.middleName + name.givenNames;
433                name.givenNames = token;
434            }
435        }
436
437        // If a single word parse that word up.
438        if (name.givenNames != null && name.familyName == null && name.middleName == null) {
439            int length = fullName.length();
440            if (length == 2) {
441                name.familyName = fullName.substring(0, 1);
442                name.givenNames = fullName.substring(1);
443            } else if (length == 3) {
444                name.familyName = fullName.substring(0, 1);
445                name.middleName = fullName.substring(1, 2);
446                name.givenNames = fullName.substring(2);
447            } else if (length == 4) {
448                name.familyName = fullName.substring(0, 2);
449                name.middleName = fullName.substring(2, 3);
450                name.givenNames = fullName.substring(3);
451            }
452
453        }
454    }
455
456    /**
457     * Splits a full name composed according to the Japanese tradition:
458     * <pre>
459     *   [family name] given name(s)
460     * </pre>
461     */
462    private void splitJapaneseName(Name name, String fullName) {
463        StringTokenizer tokenizer = new StringTokenizer(fullName);
464        while (tokenizer.hasMoreTokens()) {
465            String token = tokenizer.nextToken();
466            if (name.givenNames == null) {
467                name.givenNames = token;
468            } else if (name.familyName == null) {
469                name.familyName = name.givenNames;
470                name.givenNames = token;
471            } else {
472                name.givenNames += " " + token;
473            }
474        }
475    }
476
477    /**
478     * Splits a full name composed according to the Korean tradition:
479     * <pre>
480     *   [family name] given name(s)
481     * </pre>
482     */
483    private void splitKoreanName(Name name, String fullName) {
484        StringTokenizer tokenizer = new StringTokenizer(fullName);
485        if (tokenizer.countTokens() > 1) {
486            // Each name can be identified by separators.
487            while (tokenizer.hasMoreTokens()) {
488                String token = tokenizer.nextToken();
489                if (name.givenNames == null) {
490                    name.givenNames = token;
491                } else if (name.familyName == null) {
492                    name.familyName = name.givenNames;
493                    name.givenNames = token;
494                } else {
495                    name.givenNames += " " + token;
496                }
497            }
498        } else {
499            // There is no separator. Try to guess family name.
500            // The length of most family names is 1.
501            int familyNameLength = 1;
502
503            // Compare with 2-length family names.
504            for (String twoLengthFamilyName : KOREAN_TWO_CHARCTER_FAMILY_NAMES) {
505                if (fullName.startsWith(twoLengthFamilyName)) {
506                    familyNameLength = 2;
507                    break;
508                }
509            }
510
511            name.familyName = fullName.substring(0, familyNameLength);
512            if (fullName.length() > familyNameLength) {
513                name.givenNames = fullName.substring(familyNameLength);
514            }
515        }
516    }
517
518    /**
519     * Concatenates components of a name according to the rules dictated by the name style.
520     *
521     * @param givenNameFirst is ignored for CJK display name styles
522     */
523    public String join(Name name, boolean givenNameFirst, boolean includePrefix) {
524        String prefix = includePrefix ? name.prefix : null;
525        switch (name.fullNameStyle) {
526            case FullNameStyle.CJK:
527            case FullNameStyle.CHINESE:
528            case FullNameStyle.KOREAN:
529                return join(prefix, name.familyName, name.middleName, name.givenNames,
530                        name.suffix, false, false, false);
531
532            case FullNameStyle.JAPANESE:
533                return join(prefix, name.familyName, name.middleName, name.givenNames,
534                        name.suffix, true, false, false);
535
536            default:
537                if (givenNameFirst) {
538                    return join(prefix, name.givenNames, name.middleName, name.familyName,
539                            name.suffix, true, false, true);
540                } else {
541                    return join(prefix, name.familyName, name.givenNames, name.middleName,
542                            name.suffix, true, true, true);
543                }
544        }
545    }
546
547    /**
548     * Concatenates components of the phonetic name following the CJK tradition:
549     * family name + middle name + given name(s).
550     */
551    public String joinPhoneticName(Name name) {
552        return join(null, name.phoneticFamilyName,
553                name.phoneticMiddleName, name.phoneticGivenName, null, true, false, false);
554    }
555
556    /**
557     * Concatenates parts of a full name inserting spaces and commas as specified.
558     */
559    private String join(String prefix, String part1, String part2, String part3, String suffix,
560            boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) {
561        prefix = prefix == null ? null: prefix.trim();
562        part1 = part1 == null ? null: part1.trim();
563        part2 = part2 == null ? null: part2.trim();
564        part3 = part3 == null ? null: part3.trim();
565        suffix = suffix == null ? null: suffix.trim();
566
567        boolean hasPrefix = !TextUtils.isEmpty(prefix);
568        boolean hasPart1 = !TextUtils.isEmpty(part1);
569        boolean hasPart2 = !TextUtils.isEmpty(part2);
570        boolean hasPart3 = !TextUtils.isEmpty(part3);
571        boolean hasSuffix = !TextUtils.isEmpty(suffix);
572
573        boolean isSingleWord = true;
574        String singleWord = null;
575
576        if (hasPrefix) {
577            singleWord = prefix;
578        }
579
580        if (hasPart1) {
581            if (singleWord != null) {
582                isSingleWord = false;
583            } else {
584                singleWord = part1;
585            }
586        }
587
588        if (hasPart2) {
589            if (singleWord != null) {
590                isSingleWord = false;
591            } else {
592                singleWord = part2;
593            }
594        }
595
596        if (hasPart3) {
597            if (singleWord != null) {
598                isSingleWord = false;
599            } else {
600                singleWord = part3;
601            }
602        }
603
604        if (hasSuffix) {
605            if (singleWord != null) {
606                isSingleWord = false;
607            } else {
608                singleWord = normalizedSuffix(suffix);
609            }
610        }
611
612        if (isSingleWord) {
613            return singleWord;
614        }
615
616        StringBuilder sb = new StringBuilder();
617
618        if (hasPrefix) {
619            sb.append(prefix);
620        }
621
622        if (hasPart1) {
623            if (hasPrefix) {
624                sb.append(' ');
625            }
626            sb.append(part1);
627        }
628
629        if (hasPart2) {
630            if (hasPrefix || hasPart1) {
631                if (useCommaAfterPart1) {
632                    sb.append(',');
633                }
634                if (useSpace) {
635                    sb.append(' ');
636                }
637            }
638            sb.append(part2);
639        }
640
641        if (hasPart3) {
642            if (hasPrefix || hasPart1 || hasPart2) {
643                if (useSpace) {
644                    sb.append(' ');
645                }
646            }
647            sb.append(part3);
648        }
649
650        if (hasSuffix) {
651            if (hasPrefix || hasPart1 || hasPart2 || hasPart3) {
652                if (useCommaAfterPart3) {
653                    sb.append(',');
654                }
655                if (useSpace) {
656                    sb.append(' ');
657                }
658            }
659            sb.append(normalizedSuffix(suffix));
660        }
661
662        return sb.toString();
663    }
664
665    /**
666     * Puts a dot after the supplied suffix if that is the accepted form of the suffix,
667     * e.g. "Jr." and "Sr.", but not "I", "II" and "III".
668     */
669    private String normalizedSuffix(String suffix) {
670        int length = suffix.length();
671        if (length == 0 || suffix.charAt(length - 1) == '.') {
672            return suffix;
673        }
674
675        String withDot = suffix + '.';
676        if (mSuffixesSet.contains(withDot.toUpperCase())) {
677            return withDot;
678        } else {
679            return suffix;
680        }
681    }
682
683    /**
684     * If the supplied name style is undefined, returns a default based on the language,
685     * otherwise returns the supplied name style itself.
686     *
687     * @param nameStyle See {@link FullNameStyle}.
688     */
689    public int getAdjustedFullNameStyle(int nameStyle) {
690        if (nameStyle == FullNameStyle.UNDEFINED) {
691            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
692                return FullNameStyle.JAPANESE;
693            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
694                return FullNameStyle.KOREAN;
695            } else if (CHINESE_LANGUAGE.equals(mLanguage)) {
696                return FullNameStyle.CHINESE;
697            } else {
698                return FullNameStyle.WESTERN;
699            }
700        } else if (nameStyle == FullNameStyle.CJK) {
701            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
702                return FullNameStyle.JAPANESE;
703            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
704                return FullNameStyle.KOREAN;
705            } else {
706                return FullNameStyle.CHINESE;
707            }
708        }
709        return nameStyle;
710    }
711
712    /**
713     * Parses the first word from the name if it is a prefix.
714     */
715    private void parsePrefix(Name name, NameTokenizer tokens) {
716        if (tokens.mStartPointer == tokens.mEndPointer) {
717            return;
718        }
719
720        String firstToken = tokens.mTokens[tokens.mStartPointer];
721        if (mPrefixesSet.contains(firstToken.toUpperCase())) {
722            if (tokens.hasDot(tokens.mStartPointer)) {
723                firstToken += '.';
724            }
725            name.prefix = firstToken;
726            tokens.mStartPointer++;
727        }
728    }
729
730    /**
731     * Parses the last word(s) from the name if it is a suffix.
732     */
733    private void parseSuffix(Name name, NameTokenizer tokens) {
734        if (tokens.mStartPointer == tokens.mEndPointer) {
735            return;
736        }
737
738        String lastToken = tokens.mTokens[tokens.mEndPointer - 1];
739
740        // Take care of an explicit comma-separated suffix
741        if (tokens.mEndPointer - tokens.mStartPointer > 2
742                && tokens.hasComma(tokens.mEndPointer - 2)) {
743            if (tokens.hasDot(tokens.mEndPointer - 1)) {
744                lastToken += '.';
745            }
746            name.suffix = lastToken;
747            tokens.mEndPointer--;
748            return;
749        }
750
751        if (lastToken.length() > mMaxSuffixLength) {
752            return;
753        }
754
755        String normalized = lastToken.toUpperCase();
756        if (mSuffixesSet.contains(normalized)) {
757            name.suffix = lastToken;
758            tokens.mEndPointer--;
759            return;
760        }
761
762        if (tokens.hasDot(tokens.mEndPointer - 1)) {
763            lastToken += '.';
764        }
765        normalized += ".";
766
767        // Take care of suffixes like M.D. and D.D.S.
768        int pos = tokens.mEndPointer - 1;
769        while (normalized.length() <= mMaxSuffixLength) {
770
771            if (mSuffixesSet.contains(normalized)) {
772                name.suffix = lastToken;
773                tokens.mEndPointer = pos;
774                return;
775            }
776
777            if (pos == tokens.mStartPointer) {
778                break;
779            }
780
781            pos--;
782            if (tokens.hasDot(pos)) {
783                lastToken = tokens.mTokens[pos] + "." + lastToken;
784            } else {
785                lastToken = tokens.mTokens[pos] + " " + lastToken;
786            }
787
788            normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized;
789        }
790    }
791
792    private void parseLastName(Name name, NameTokenizer tokens) {
793        if (tokens.mStartPointer == tokens.mEndPointer) {
794            return;
795        }
796
797        // If the first word is followed by a comma, assume that it's the family name
798        if (tokens.hasComma(tokens.mStartPointer)) {
799           name.familyName = tokens.mTokens[tokens.mStartPointer];
800           tokens.mStartPointer++;
801           return;
802        }
803
804        // If the second word is followed by a comma and the first word
805        // is a last name prefix as in "de Sade" and "von Cliburn", treat
806        // the first two words as the family name.
807        if (tokens.mStartPointer + 1 < tokens.mEndPointer
808                && tokens.hasComma(tokens.mStartPointer + 1)
809                && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) {
810            String familyNamePrefix = tokens.mTokens[tokens.mStartPointer];
811            if (tokens.hasDot(tokens.mStartPointer)) {
812                familyNamePrefix += '.';
813            }
814            name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1];
815            tokens.mStartPointer += 2;
816            return;
817        }
818
819        // Finally, assume that the last word is the last name
820        name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
821        tokens.mEndPointer--;
822
823        // Take care of last names like "de Sade" and "von Cliburn"
824        if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
825            String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1];
826            if (isFamilyNamePrefix(lastNamePrefix)) {
827                if (tokens.hasDot(tokens.mEndPointer - 1)) {
828                    lastNamePrefix += '.';
829                }
830                name.familyName = lastNamePrefix + " " + name.familyName;
831                tokens.mEndPointer--;
832            }
833        }
834    }
835
836    /**
837     * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de"
838     */
839    private boolean isFamilyNamePrefix(String word) {
840        final String normalized = word.toUpperCase();
841
842        return mLastNamePrefixesSet.contains(normalized)
843                || mLastNamePrefixesSet.contains(normalized + ".");
844    }
845
846
847    private void parseMiddleName(Name name, NameTokenizer tokens) {
848        if (tokens.mStartPointer == tokens.mEndPointer) {
849            return;
850        }
851
852        if ((tokens.mEndPointer - tokens.mStartPointer) > 1) {
853            if ((tokens.mEndPointer - tokens.mStartPointer) == 2
854                    || !mConjuctions.contains(tokens.mTokens[tokens.mEndPointer - 2].
855                            toUpperCase())) {
856                name.middleName = tokens.mTokens[tokens.mEndPointer - 1];
857                if (tokens.hasDot(tokens.mEndPointer - 1)) {
858                    name.middleName += '.';
859                }
860                tokens.mEndPointer--;
861            }
862        }
863    }
864
865    private void parseGivenNames(Name name, NameTokenizer tokens) {
866        if (tokens.mStartPointer == tokens.mEndPointer) {
867            return;
868        }
869
870        if ((tokens.mEndPointer - tokens.mStartPointer) == 1) {
871            name.givenNames = tokens.mTokens[tokens.mStartPointer];
872        } else {
873            StringBuilder sb = new StringBuilder();
874            for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
875                if (i != tokens.mStartPointer) {
876                    sb.append(' ');
877                }
878                sb.append(tokens.mTokens[i]);
879                if (tokens.hasDot(i)) {
880                    sb.append('.');
881                }
882            }
883            name.givenNames = sb.toString();
884        }
885    }
886
887    /**
888     * Makes the best guess at the expected full name style based on the character set
889     * used in the supplied name.  If the phonetic name is also supplied, tries to
890     * differentiate between Chinese, Japanese and Korean based on the alphabet used
891     * for the phonetic name.
892     */
893    public void guessNameStyle(Name name) {
894        guessFullNameStyle(name);
895        guessPhoneticNameStyle(name);
896        name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle,
897                name.phoneticNameStyle);
898    }
899
900    /**
901     * Updates the display name style according to the phonetic name style if we
902     * were unsure about display name style based on the name components, but
903     * phonetic name makes it more definitive.
904     */
905    public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) {
906        if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
907            if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) {
908                if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) {
909                    return FullNameStyle.JAPANESE;
910                } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) {
911                    return FullNameStyle.KOREAN;
912                }
913                if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) {
914                    return FullNameStyle.CHINESE;
915                }
916            }
917        }
918        return nameStyle;
919    }
920
921    /**
922     * Makes the best guess at the expected full name style based on the character set
923     * used in the supplied name.
924     */
925    private void guessFullNameStyle(NameSplitter.Name name) {
926        if (name.fullNameStyle != FullNameStyle.UNDEFINED) {
927            return;
928        }
929
930        int bestGuess = guessFullNameStyle(name.givenNames);
931        // A mix of Hanzi and latin chars are common in China, so we have to go through all names
932        // if the name is not JANPANESE or KOREAN.
933        if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK
934                && bestGuess != FullNameStyle.WESTERN) {
935            name.fullNameStyle = bestGuess;
936            return;
937        }
938
939        int guess = guessFullNameStyle(name.familyName);
940        if (guess != FullNameStyle.UNDEFINED) {
941            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
942                name.fullNameStyle = guess;
943                return;
944            }
945            bestGuess = guess;
946        }
947
948        guess = guessFullNameStyle(name.middleName);
949        if (guess != FullNameStyle.UNDEFINED) {
950            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
951                name.fullNameStyle = guess;
952                return;
953            }
954            bestGuess = guess;
955        }
956
957        guess = guessFullNameStyle(name.prefix);
958        if (guess != FullNameStyle.UNDEFINED) {
959            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
960                name.fullNameStyle = guess;
961                return;
962            }
963            bestGuess = guess;
964        }
965
966        guess = guessFullNameStyle(name.suffix);
967        if (guess != FullNameStyle.UNDEFINED) {
968            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
969                name.fullNameStyle = guess;
970                return;
971            }
972            bestGuess = guess;
973        }
974
975        name.fullNameStyle = bestGuess;
976    }
977
978    public int guessFullNameStyle(String name) {
979        if (name == null) {
980            return FullNameStyle.UNDEFINED;
981        }
982
983        int nameStyle = FullNameStyle.UNDEFINED;
984        int length = name.length();
985        int offset = 0;
986        while (offset < length) {
987            int codePoint = Character.codePointAt(name, offset);
988            if (Character.isLetter(codePoint)) {
989                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
990
991                if (!isLatinUnicodeBlock(unicodeBlock)) {
992
993                    if (isCJKUnicodeBlock(unicodeBlock)) {
994                        // We don't know if this is Chinese, Japanese or Korean -
995                        // trying to figure out by looking at other characters in the name
996                        return guessCJKNameStyle(name, offset + Character.charCount(codePoint));
997                    }
998
999                    if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
1000                        return FullNameStyle.JAPANESE;
1001                    }
1002
1003                    if (isKoreanUnicodeBlock(unicodeBlock)) {
1004                        return FullNameStyle.KOREAN;
1005                    }
1006                }
1007                nameStyle = FullNameStyle.WESTERN;
1008            }
1009            offset += Character.charCount(codePoint);
1010        }
1011        return nameStyle;
1012    }
1013
1014    private int guessCJKNameStyle(String name, int offset) {
1015        int length = name.length();
1016        while (offset < length) {
1017            int codePoint = Character.codePointAt(name, offset);
1018            if (Character.isLetter(codePoint)) {
1019                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
1020                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
1021                    return FullNameStyle.JAPANESE;
1022                }
1023                if (isKoreanUnicodeBlock(unicodeBlock)) {
1024                    return FullNameStyle.KOREAN;
1025                }
1026            }
1027            offset += Character.charCount(codePoint);
1028        }
1029
1030        return FullNameStyle.CJK;
1031    }
1032
1033    private void guessPhoneticNameStyle(NameSplitter.Name name) {
1034        if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
1035            return;
1036        }
1037
1038        int bestGuess = guessPhoneticNameStyle(name.phoneticFamilyName);
1039        if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK) {
1040            name.phoneticNameStyle = bestGuess;
1041            return;
1042        }
1043
1044        int guess = guessPhoneticNameStyle(name.phoneticGivenName);
1045        if (guess != FullNameStyle.UNDEFINED) {
1046            if (guess != FullNameStyle.CJK) {
1047                name.phoneticNameStyle = guess;
1048                return;
1049            }
1050            bestGuess = guess;
1051        }
1052
1053        guess = guessPhoneticNameStyle(name.phoneticMiddleName);
1054        if (guess != FullNameStyle.UNDEFINED) {
1055            if (guess != FullNameStyle.CJK) {
1056                name.phoneticNameStyle = guess;
1057                return;
1058            }
1059            bestGuess = guess;
1060        }
1061    }
1062
1063    public int guessPhoneticNameStyle(String name) {
1064        if (name == null) {
1065            return PhoneticNameStyle.UNDEFINED;
1066        }
1067
1068        int nameStyle = PhoneticNameStyle.UNDEFINED;
1069        int length = name.length();
1070        int offset = 0;
1071        while (offset < length) {
1072            int codePoint = Character.codePointAt(name, offset);
1073            if (Character.isLetter(codePoint)) {
1074                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
1075                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
1076                    return PhoneticNameStyle.JAPANESE;
1077                }
1078                if (isKoreanUnicodeBlock(unicodeBlock)) {
1079                    return PhoneticNameStyle.KOREAN;
1080                }
1081                if (isLatinUnicodeBlock(unicodeBlock)) {
1082                    return PhoneticNameStyle.PINYIN;
1083                }
1084            }
1085            offset += Character.charCount(codePoint);
1086        }
1087
1088        return nameStyle;
1089    }
1090
1091    private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) {
1092        return unicodeBlock == UnicodeBlock.BASIC_LATIN ||
1093                unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT ||
1094                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A ||
1095                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B ||
1096                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL;
1097    }
1098
1099    private static boolean isCJKUnicodeBlock(UnicodeBlock block) {
1100        return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
1101                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
1102                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
1103                || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
1104                || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT
1105                || block == UnicodeBlock.CJK_COMPATIBILITY
1106                || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS
1107                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
1108                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
1109    }
1110
1111    private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) {
1112        return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES ||
1113                unicodeBlock == UnicodeBlock.HANGUL_JAMO ||
1114                unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
1115    }
1116
1117    private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) {
1118        return unicodeBlock == UnicodeBlock.KATAKANA ||
1119                unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS ||
1120                unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS ||
1121                unicodeBlock == UnicodeBlock.HIRAGANA;
1122    }
1123}
1124