providers/contacts/NameSplitter.java

/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */
package com.android.providers.contacts;

import android.content.ContentValues;
import android.provider.ContactsContract.CommonDataKinds.StructuredName;
import android.provider.ContactsContract.FullNameStyle;
import android.provider.ContactsContract.PhoneticNameStyle;
import android.text.TextUtils;

import com.android.providers.contacts.util.NeededForTesting;

import java.lang.Character.UnicodeBlock;
import java.util.HashSet;
import java.util.Locale;
import java.util.StringTokenizer;

/**
 * The purpose of this class is to split a full name into given names and last
 * name. The logic only supports having a single last name. If the full name has
 * multiple last names the output will be incorrect.
 * <p>
 * Core algorithm:
 * <ol>
 * <li>Remove the suffixes (III, Ph.D., M.D.).</li>
 * <li>Remove the prefixes (Mr., Pastor, Reverend, Sir).</li>
 * <li>Assign the last remaining token as the last name.</li>
 * <li>If the previous word to the last name is one from LASTNAME_PREFIXES, use
 * this word also as the last name.</li>
 * <li>Assign the rest of the words as the "given names".</li>
 * </ol>
 */
public class NameSplitter {

    public static final int MAX_TOKENS = 10;

    private static final String JAPANESE_LANGUAGE = Locale.JAPANESE.getLanguage().toLowerCase();
    private static final String KOREAN_LANGUAGE = Locale.KOREAN.getLanguage().toLowerCase();

    // This includes simplified and traditional Chinese
    private static final String CHINESE_LANGUAGE = Locale.CHINESE.getLanguage().toLowerCase();

    private final HashSet<String> mPrefixesSet;
    private final HashSet<String> mSuffixesSet;
    private final int mMaxSuffixLength;
    private final HashSet<String> mLastNamePrefixesSet;
    private final HashSet<String> mConjuctions;
    private final Locale mLocale;
    private final String mLanguage;

    /**
     * Two-Chracter long Korean family names.
     * http://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EB%B3%B5%EC%84%B1
     */
    private static final String[] KOREAN_TWO_CHARCTER_FAMILY_NAMES = {
        "\uAC15\uC804", // Gang Jeon
        "\uB0A8\uAD81", // Nam Goong
        "\uB3C5\uACE0", // Dok Go
        "\uB3D9\uBC29", // Dong Bang
        "\uB9DD\uC808", // Mang Jeol
        "\uC0AC\uACF5", // Sa Gong
        "\uC11C\uBB38", // Seo Moon
        "\uC120\uC6B0", // Seon Woo
        "\uC18C\uBD09", // So Bong
        "\uC5B4\uAE08", // Uh Geum
        "\uC7A5\uACE1", // Jang Gok
        "\uC81C\uAC08", // Je Gal
        "\uD669\uBCF4"  // Hwang Bo
    };

    public static class Name {
        public String prefix;
        public String givenNames;
        public String middleName;
        public String familyName;
        public String suffix;

        public int fullNameStyle;

        public String phoneticFamilyName;
        public String phoneticMiddleName;
        public String phoneticGivenName;

        public int phoneticNameStyle;

        public Name() {
        }

        public Name(String prefix, String givenNames, String middleName, String familyName,
                String suffix) {
            this.prefix = prefix;
            this.givenNames = givenNames;
            this.middleName = middleName;
            this.familyName = familyName;
            this.suffix = suffix;
        }

        @NeededForTesting
        public String getPrefix() {
            return prefix;
        }

        public String getGivenNames() {
            return givenNames;
        }

        public String getMiddleName() {
            return middleName;
        }

        public String getFamilyName() {
            return familyName;
        }

        @NeededForTesting
        public String getSuffix() {
            return suffix;
        }

        public int getFullNameStyle() {
            return fullNameStyle;
        }

        public String getPhoneticFamilyName() {
            return phoneticFamilyName;
        }

        public String getPhoneticMiddleName() {
            return phoneticMiddleName;
        }

        public String getPhoneticGivenName() {
            return phoneticGivenName;
        }

        public int getPhoneticNameStyle() {
            return phoneticNameStyle;
        }

        public void fromValues(ContentValues values) {
            prefix = values.getAsString(StructuredName.PREFIX);
            givenNames = values.getAsString(StructuredName.GIVEN_NAME);
            middleName = values.getAsString(StructuredName.MIDDLE_NAME);
            familyName = values.getAsString(StructuredName.FAMILY_NAME);
            suffix = values.getAsString(StructuredName.SUFFIX);

            Integer integer = values.getAsInteger(StructuredName.FULL_NAME_STYLE);
            fullNameStyle = integer == null ? FullNameStyle.UNDEFINED : integer;

            phoneticFamilyName = values.getAsString(StructuredName.PHONETIC_FAMILY_NAME);
            phoneticMiddleName = values.getAsString(StructuredName.PHONETIC_MIDDLE_NAME);
            phoneticGivenName = values.getAsString(StructuredName.PHONETIC_GIVEN_NAME);

            integer = values.getAsInteger(StructuredName.PHONETIC_NAME_STYLE);
            phoneticNameStyle = integer == null ? PhoneticNameStyle.UNDEFINED : integer;
        }

        public void toValues(ContentValues values) {
            putValueIfPresent(values, StructuredName.PREFIX, prefix);
            putValueIfPresent(values, StructuredName.GIVEN_NAME, givenNames);
            putValueIfPresent(values, StructuredName.MIDDLE_NAME, middleName);
            putValueIfPresent(values, StructuredName.FAMILY_NAME, familyName);
            putValueIfPresent(values, StructuredName.SUFFIX, suffix);
            values.put(StructuredName.FULL_NAME_STYLE, fullNameStyle);
            putValueIfPresent(values, StructuredName.PHONETIC_FAMILY_NAME, phoneticFamilyName);
            putValueIfPresent(values, StructuredName.PHONETIC_MIDDLE_NAME, phoneticMiddleName);
            putValueIfPresent(values, StructuredName.PHONETIC_GIVEN_NAME, phoneticGivenName);
            values.put(StructuredName.PHONETIC_NAME_STYLE, phoneticNameStyle);
        }

        private void putValueIfPresent(ContentValues values, String name, String value) {
            if (value != null) {
                values.put(name, value);
            }
        }

        public void clear() {
            prefix = null;
            givenNames = null;
            middleName = null;
            familyName = null;
            suffix = null;
            fullNameStyle = FullNameStyle.UNDEFINED;
            phoneticFamilyName = null;
            phoneticMiddleName = null;
            phoneticGivenName = null;
            phoneticNameStyle = PhoneticNameStyle.UNDEFINED;
        }

        public boolean isEmpty() {
            return TextUtils.isEmpty(givenNames)
                    && TextUtils.isEmpty(middleName)
                    && TextUtils.isEmpty(familyName)
                    && TextUtils.isEmpty(suffix)
                    && TextUtils.isEmpty(phoneticFamilyName)
                    && TextUtils.isEmpty(phoneticMiddleName)
                    && TextUtils.isEmpty(phoneticGivenName);
        }

        @Override
        public String toString() {
            return "[prefix: " + prefix + " given: " + givenNames + " middle: " + middleName
                    + " family: " + familyName + " suffix: " + suffix + " ph/given: "
                    + phoneticGivenName + " ph/middle: " + phoneticMiddleName + " ph/family: "
                    + phoneticFamilyName + "]";
        }
    }

    private static class NameTokenizer extends StringTokenizer {
        private final String[] mTokens;
        private int mDotBitmask;
        private int mCommaBitmask;
        private int mStartPointer;
        private int mEndPointer;

        public NameTokenizer(String fullName) {
            super(fullName, " .,", true);

            mTokens = new String[MAX_TOKENS];

            // Iterate over tokens, skipping over empty ones and marking tokens that
            // are followed by dots.
            while (hasMoreTokens() && mEndPointer < MAX_TOKENS) {
                final String token = nextToken();
                if (token.length() > 0) {
                    final char c = token.charAt(0);
                    if (c == ' ') {
                        continue;
                    }
                }

                if (mEndPointer > 0 && token.charAt(0) == '.') {
                    mDotBitmask |= (1 << (mEndPointer - 1));
                } else if (mEndPointer > 0 && token.charAt(0) == ',') {
                    mCommaBitmask |= (1 << (mEndPointer - 1));
                } else {
                    mTokens[mEndPointer] = token;
                    mEndPointer++;
                }
            }
        }

        /**
         * Returns true if the token is followed by a dot in the original full name.
         */
        public boolean hasDot(int index) {
            return (mDotBitmask & (1 << index)) != 0;
        }

        /**
         * Returns true if the token is followed by a comma in the original full name.
         */
        public boolean hasComma(int index) {
            return (mCommaBitmask & (1 << index)) != 0;
        }
    }

    /**
     * Constructor.
     *
     * @param commonPrefixes comma-separated list of common prefixes,
     *            e.g. "Mr, Ms, Mrs"
     * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
     *            e.g. "d', st, st., von"
     * @param commonSuffixes comma-separated list of common suffixes,
     *            e.g. "Jr, M.D., MD, D.D.S."
     * @param commonConjunctions comma-separated list of common conjuctions,
     *            e.g. "AND, Or"
     */
    public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
            String commonSuffixes, String commonConjunctions, Locale locale) {
        // TODO: refactor this to use <string-array> resources
        mPrefixesSet = convertToSet(commonPrefixes);
        mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
        mSuffixesSet = convertToSet(commonSuffixes);
        mConjuctions = convertToSet(commonConjunctions);
        mLocale = locale != null ? locale : Locale.getDefault();
        mLanguage = mLocale.getLanguage().toLowerCase();

        int maxLength = 0;
        for (String suffix : mSuffixesSet) {
            if (suffix.length() > maxLength) {
                maxLength = suffix.length();
            }
        }

        mMaxSuffixLength = maxLength;
    }

    /**
     * Converts a comma-separated list of Strings to a set of Strings. Trims strings
     * and converts them to upper case.
     */
    private static HashSet<String> convertToSet(String strings) {
        HashSet<String> set = new HashSet<String>();
        if (strings != null) {
            String[] split = strings.split(",");
            for (int i = 0; i < split.length; i++) {
                set.add(split[i].trim().toUpperCase());
            }
        }
        return set;
    }

    /**
     * Parses a full name and returns components as a list of tokens.
     */
    public int tokenize(String[] tokens, String fullName) {
        if (fullName == null) {
            return 0;
        }

        NameTokenizer tokenizer = new NameTokenizer(fullName);

        if (tokenizer.mStartPointer == tokenizer.mEndPointer) {
            return 0;
        }

        String firstToken = tokenizer.mTokens[tokenizer.mStartPointer];
        if (mPrefixesSet.contains(firstToken.toUpperCase())) {
           tokenizer.mStartPointer++;
        }
        int count = 0;
        for (int i = tokenizer.mStartPointer; i < tokenizer.mEndPointer; i++) {
            tokens[count++] = tokenizer.mTokens[i];
        }

        return count;
    }


    /**
     * Parses a full name and returns parsed components in the Name object.
     */
    public void split(Name name, String fullName) {
        if (fullName == null) {
            return;
        }

        int fullNameStyle = guessFullNameStyle(fullName);
        if (fullNameStyle == FullNameStyle.CJK) {
            fullNameStyle = getAdjustedFullNameStyle(fullNameStyle);
        }

        split(name, fullName, fullNameStyle);
    }

    /**
     * Parses a full name and returns parsed components in the Name object
     * with a given fullNameStyle.
     */
    public void split(Name name, String fullName, int fullNameStyle) {
        if (fullName == null) {
            return;
        }

        name.fullNameStyle = fullNameStyle;

        switch (fullNameStyle) {
            case FullNameStyle.CHINESE:
                splitChineseName(name, fullName);
                break;

            case FullNameStyle.JAPANESE:
                splitJapaneseName(name, fullName);
                break;

            case FullNameStyle.KOREAN:
                splitKoreanName(name, fullName);
                break;

            default:
                splitWesternName(name, fullName);
        }
    }

    /**
     * Splits a full name composed according to the Western tradition:
     * <pre>
     *   [prefix] given name(s) [[middle name] family name] [, suffix]
     *   [prefix] family name, given name [middle name] [,suffix]
     * </pre>
     */
    private void splitWesternName(Name name, String fullName) {
        NameTokenizer tokens = new NameTokenizer(fullName);
        parsePrefix(name, tokens);

        // If the name consists of just one or two tokens, treat them as first/last name,
        // not as suffix.  Example: John Ma; Ma is last name, not "M.A.".
        if (tokens.mEndPointer > 2) {
            parseSuffix(name, tokens);
        }

        if (name.prefix == null && tokens.mEndPointer - tokens.mStartPointer == 1) {
            name.givenNames = tokens.mTokens[tokens.mStartPointer];
        } else {
            parseLastName(name, tokens);
            parseMiddleName(name, tokens);
            parseGivenNames(name, tokens);
        }
    }

    /**
     * Splits a full name composed according to the Chinese tradition:
     * <pre>
     *   [family name [middle name]] given name
     * </pre>
     */
    private void splitChineseName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (name.givenNames == null) {
                name.givenNames = token;
            } else if (name.familyName == null) {
                name.familyName = name.givenNames;
                name.givenNames = token;
            } else if (name.middleName == null) {
                name.middleName = name.givenNames;
                name.givenNames = token;
            } else {
                name.middleName = name.middleName + name.givenNames;
                name.givenNames = token;
            }
        }

        // If a single word parse that word up.
        if (name.givenNames != null && name.familyName == null && name.middleName == null) {
            int length = fullName.length();
            if (length == 2) {
                name.familyName = fullName.substring(0, 1);
                name.givenNames = fullName.substring(1);
            } else if (length == 3) {
                name.familyName = fullName.substring(0, 1);
                name.middleName = fullName.substring(1, 2);
                name.givenNames = fullName.substring(2);
            } else if (length == 4) {
                name.familyName = fullName.substring(0, 2);
                name.middleName = fullName.substring(2, 3);
                name.givenNames = fullName.substring(3);
            }

        }
    }

    /**
     * Splits a full name composed according to the Japanese tradition:
     * <pre>
     *   [family name] given name(s)
     * </pre>
     */
    private void splitJapaneseName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (name.givenNames == null) {
                name.givenNames = token;
            } else if (name.familyName == null) {
                name.familyName = name.givenNames;
                name.givenNames = token;
            } else {
                name.givenNames += " " + token;
            }
        }
    }

    /**
     * Splits a full name composed according to the Korean tradition:
     * <pre>
     *   [family name] given name(s)
     * </pre>
     */
    private void splitKoreanName(Name name, String fullName) {
        StringTokenizer tokenizer = new StringTokenizer(fullName);
        if (tokenizer.countTokens() > 1) {
            // Each name can be identified by separators.
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                if (name.givenNames == null) {
                    name.givenNames = token;
                } else if (name.familyName == null) {
                    name.familyName = name.givenNames;
                    name.givenNames = token;
                } else {
                    name.givenNames += " " + token;
                }
            }
        } else {
            // There is no separator. Try to guess family name.
            // The length of most family names is 1.
            int familyNameLength = 1;

            // Compare with 2-length family names.
            for (String twoLengthFamilyName : KOREAN_TWO_CHARCTER_FAMILY_NAMES) {
                if (fullName.startsWith(twoLengthFamilyName)) {
                    familyNameLength = 2;
                    break;
                }
            }

            name.familyName = fullName.substring(0, familyNameLength);
            if (fullName.length() > familyNameLength) {
                name.givenNames = fullName.substring(familyNameLength);
            }
        }
    }

    /**
     * Concatenates components of a name according to the rules dictated by the name style.
     *
     * @param givenNameFirst is ignored for CJK display name styles
     */
    public String join(Name name, boolean givenNameFirst, boolean includePrefix) {
        String prefix = includePrefix ? name.prefix : null;
        switch (name.fullNameStyle) {
            case FullNameStyle.CJK:
            case FullNameStyle.CHINESE:
            case FullNameStyle.KOREAN:
                return join(prefix, name.familyName, name.middleName, name.givenNames,
                        name.suffix, false, false, false);

            case FullNameStyle.JAPANESE:
                return join(prefix, name.familyName, name.middleName, name.givenNames,
                        name.suffix, true, false, false);

            default:
                if (givenNameFirst) {
                    return join(prefix, name.givenNames, name.middleName, name.familyName,
                            name.suffix, true, false, true);
                } else {
                    return join(prefix, name.familyName, name.givenNames, name.middleName,
                            name.suffix, true, true, true);
                }
        }
    }

    /**
     * Concatenates components of the phonetic name following the CJK tradition:
     * family name + middle name + given name(s).
     */
    public String joinPhoneticName(Name name) {
        return join(null, name.phoneticFamilyName,
                name.phoneticMiddleName, name.phoneticGivenName, null, true, false, false);
    }

    /**
     * Concatenates parts of a full name inserting spaces and commas as specified.
     */
    private String join(String prefix, String part1, String part2, String part3, String suffix,
            boolean useSpace, boolean useCommaAfterPart1, boolean useCommaAfterPart3) {
        prefix = prefix == null ? null: prefix.trim();
        part1 = part1 == null ? null: part1.trim();
        part2 = part2 == null ? null: part2.trim();
        part3 = part3 == null ? null: part3.trim();
        suffix = suffix == null ? null: suffix.trim();

        boolean hasPrefix = !TextUtils.isEmpty(prefix);
        boolean hasPart1 = !TextUtils.isEmpty(part1);
        boolean hasPart2 = !TextUtils.isEmpty(part2);
        boolean hasPart3 = !TextUtils.isEmpty(part3);
        boolean hasSuffix = !TextUtils.isEmpty(suffix);

        boolean isSingleWord = true;
        String singleWord = null;

        if (hasPrefix) {
            singleWord = prefix;
        }

        if (hasPart1) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = part1;
            }
        }

        if (hasPart2) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = part2;
            }
        }

        if (hasPart3) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = part3;
            }
        }

        if (hasSuffix) {
            if (singleWord != null) {
                isSingleWord = false;
            } else {
                singleWord = normalizedSuffix(suffix);
            }
        }

        if (isSingleWord) {
            return singleWord;
        }

        StringBuilder sb = new StringBuilder();

        if (hasPrefix) {
            sb.append(prefix);
        }

        if (hasPart1) {
            if (hasPrefix) {
                sb.append(' ');
            }
            sb.append(part1);
        }

        if (hasPart2) {
            if (hasPrefix || hasPart1) {
                if (useCommaAfterPart1) {
                    sb.append(',');
                }
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(part2);
        }

        if (hasPart3) {
            if (hasPrefix || hasPart1 || hasPart2) {
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(part3);
        }

        if (hasSuffix) {
            if (hasPrefix || hasPart1 || hasPart2 || hasPart3) {
                if (useCommaAfterPart3) {
                    sb.append(',');
                }
                if (useSpace) {
                    sb.append(' ');
                }
            }
            sb.append(normalizedSuffix(suffix));
        }

        return sb.toString();
    }

    /**
     * Puts a dot after the supplied suffix if that is the accepted form of the suffix,
     * e.g. "Jr." and "Sr.", but not "I", "II" and "III".
     */
    private String normalizedSuffix(String suffix) {
        int length = suffix.length();
        if (length == 0 || suffix.charAt(length - 1) == '.') {
            return suffix;
        }

        String withDot = suffix + '.';
        if (mSuffixesSet.contains(withDot.toUpperCase())) {
            return withDot;
        } else {
            return suffix;
        }
    }

    /**
     * If the supplied name style is undefined, returns a default based on the language,
     * otherwise returns the supplied name style itself.
     *
     * @param nameStyle See {@link FullNameStyle}.
     */
    public int getAdjustedFullNameStyle(int nameStyle) {
        if (nameStyle == FullNameStyle.UNDEFINED) {
            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.JAPANESE;
            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.KOREAN;
            } else if (CHINESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.CHINESE;
            } else {
                return FullNameStyle.WESTERN;
            }
        } else if (nameStyle == FullNameStyle.CJK) {
            if (JAPANESE_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.JAPANESE;
            } else if (KOREAN_LANGUAGE.equals(mLanguage)) {
                return FullNameStyle.KOREAN;
            } else {
                return FullNameStyle.CHINESE;
            }
        }
        return nameStyle;
    }

    /**
     * Parses the first word from the name if it is a prefix.
     */
    private void parsePrefix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        String firstToken = tokens.mTokens[tokens.mStartPointer];
        if (mPrefixesSet.contains(firstToken.toUpperCase())) {
            if (tokens.hasDot(tokens.mStartPointer)) {
                firstToken += '.';
            }
            name.prefix = firstToken;
            tokens.mStartPointer++;
        }
    }

    /**
     * Parses the last word(s) from the name if it is a suffix.
     */
    private void parseSuffix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        String lastToken = tokens.mTokens[tokens.mEndPointer - 1];

        // Take care of an explicit comma-separated suffix
        if (tokens.mEndPointer - tokens.mStartPointer > 2
                && tokens.hasComma(tokens.mEndPointer - 2)) {
            if (tokens.hasDot(tokens.mEndPointer - 1)) {
                lastToken += '.';
            }
            name.suffix = lastToken;
            tokens.mEndPointer--;
            return;
        }

        if (lastToken.length() > mMaxSuffixLength) {
            return;
        }

        String normalized = lastToken.toUpperCase();
        if (mSuffixesSet.contains(normalized)) {
            name.suffix = lastToken;
            tokens.mEndPointer--;
            return;
        }

        if (tokens.hasDot(tokens.mEndPointer - 1)) {
            lastToken += '.';
        }
        normalized += ".";

        // Take care of suffixes like M.D. and D.D.S.
        int pos = tokens.mEndPointer - 1;
        while (normalized.length() <= mMaxSuffixLength) {

            if (mSuffixesSet.contains(normalized)) {
                name.suffix = lastToken;
                tokens.mEndPointer = pos;
                return;
            }

            if (pos == tokens.mStartPointer) {
                break;
            }

            pos--;
            if (tokens.hasDot(pos)) {
                lastToken = tokens.mTokens[pos] + "." + lastToken;
            } else {
                lastToken = tokens.mTokens[pos] + " " + lastToken;
            }

            normalized = tokens.mTokens[pos].toUpperCase() + "." + normalized;
        }
    }

    private void parseLastName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        // If the first word is followed by a comma, assume that it's the family name
        if (tokens.hasComma(tokens.mStartPointer)) {
           name.familyName = tokens.mTokens[tokens.mStartPointer];
           tokens.mStartPointer++;
           return;
        }

        // If the second word is followed by a comma and the first word
        // is a last name prefix as in "de Sade" and "von Cliburn", treat
        // the first two words as the family name.
        if (tokens.mStartPointer + 1 < tokens.mEndPointer
                && tokens.hasComma(tokens.mStartPointer + 1)
                && isFamilyNamePrefix(tokens.mTokens[tokens.mStartPointer])) {
            String familyNamePrefix = tokens.mTokens[tokens.mStartPointer];
            if (tokens.hasDot(tokens.mStartPointer)) {
                familyNamePrefix += '.';
            }
            name.familyName = familyNamePrefix + " " + tokens.mTokens[tokens.mStartPointer + 1];
            tokens.mStartPointer += 2;
            return;
        }

        // Finally, assume that the last word is the last name
        name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
        tokens.mEndPointer--;

        // Take care of last names like "de Sade" and "von Cliburn"
        if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
            String lastNamePrefix = tokens.mTokens[tokens.mEndPointer - 1];
            if (isFamilyNamePrefix(lastNamePrefix)) {
                if (tokens.hasDot(tokens.mEndPointer - 1)) {
                    lastNamePrefix += '.';
                }
                name.familyName = lastNamePrefix + " " + name.familyName;
                tokens.mEndPointer--;
            }
        }
    }

    /**
     * Returns true if the supplied word is an accepted last name prefix, e.g. "von", "de"
     */
    private boolean isFamilyNamePrefix(String word) {
        final String normalized = word.toUpperCase();

        return mLastNamePrefixesSet.contains(normalized)
                || mLastNamePrefixesSet.contains(normalized + ".");
    }


    private void parseMiddleName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        if ((tokens.mEndPointer - tokens.mStartPointer) > 1) {
            if ((tokens.mEndPointer - tokens.mStartPointer) == 2
                    || !mConjuctions.contains(tokens.mTokens[tokens.mEndPointer - 2].
                            toUpperCase())) {
                name.middleName = tokens.mTokens[tokens.mEndPointer - 1];
                if (tokens.hasDot(tokens.mEndPointer - 1)) {
                    name.middleName += '.';
                }
                tokens.mEndPointer--;
            }
        }
    }

    private void parseGivenNames(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        if ((tokens.mEndPointer - tokens.mStartPointer) == 1) {
            name.givenNames = tokens.mTokens[tokens.mStartPointer];
        } else {
            StringBuilder sb = new StringBuilder();
            for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
                if (i != tokens.mStartPointer) {
                    sb.append(' ');
                }
                sb.append(tokens.mTokens[i]);
                if (tokens.hasDot(i)) {
                    sb.append('.');
                }
            }
            name.givenNames = sb.toString();
        }
    }

    /**
     * Makes the best guess at the expected full name style based on the character set
     * used in the supplied name.  If the phonetic name is also supplied, tries to
     * differentiate between Chinese, Japanese and Korean based on the alphabet used
     * for the phonetic name.
     */
    public void guessNameStyle(Name name) {
        guessFullNameStyle(name);
        guessPhoneticNameStyle(name);
        name.fullNameStyle = getAdjustedNameStyleBasedOnPhoneticNameStyle(name.fullNameStyle,
                name.phoneticNameStyle);
    }

    /**
     * Updates the display name style according to the phonetic name style if we
     * were unsure about display name style based on the name components, but
     * phonetic name makes it more definitive.
     */
    public int getAdjustedNameStyleBasedOnPhoneticNameStyle(int nameStyle, int phoneticNameStyle) {
        if (phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
            if (nameStyle == FullNameStyle.UNDEFINED || nameStyle == FullNameStyle.CJK) {
                if (phoneticNameStyle == PhoneticNameStyle.JAPANESE) {
                    return FullNameStyle.JAPANESE;
                } else if (phoneticNameStyle == PhoneticNameStyle.KOREAN) {
                    return FullNameStyle.KOREAN;
                }
                if (nameStyle == FullNameStyle.CJK && phoneticNameStyle == PhoneticNameStyle.PINYIN) {
                    return FullNameStyle.CHINESE;
                }
            }
        }
        return nameStyle;
    }

    /**
     * Makes the best guess at the expected full name style based on the character set
     * used in the supplied name.
     */
    private void guessFullNameStyle(NameSplitter.Name name) {
        if (name.fullNameStyle != FullNameStyle.UNDEFINED) {
            return;
        }

        int bestGuess = guessFullNameStyle(name.givenNames);
        // A mix of Hanzi and latin chars are common in China, so we have to go through all names
        // if the name is not JANPANESE or KOREAN.
        if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK
                && bestGuess != FullNameStyle.WESTERN) {
            name.fullNameStyle = bestGuess;
            return;
        }

        int guess = guessFullNameStyle(name.familyName);
        if (guess != FullNameStyle.UNDEFINED) {
            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
                name.fullNameStyle = guess;
                return;
            }
            bestGuess = guess;
        }

        guess = guessFullNameStyle(name.middleName);
        if (guess != FullNameStyle.UNDEFINED) {
            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
                name.fullNameStyle = guess;
                return;
            }
            bestGuess = guess;
        }

        guess = guessFullNameStyle(name.prefix);
        if (guess != FullNameStyle.UNDEFINED) {
            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
                name.fullNameStyle = guess;
                return;
            }
            bestGuess = guess;
        }

        guess = guessFullNameStyle(name.suffix);
        if (guess != FullNameStyle.UNDEFINED) {
            if (guess != FullNameStyle.CJK && guess != FullNameStyle.WESTERN) {
                name.fullNameStyle = guess;
                return;
            }
            bestGuess = guess;
        }

        name.fullNameStyle = bestGuess;
    }

    public int guessFullNameStyle(String name) {
        if (name == null) {
            return FullNameStyle.UNDEFINED;
        }

        int nameStyle = FullNameStyle.UNDEFINED;
        int length = name.length();
        int offset = 0;
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);

                if (!isLatinUnicodeBlock(unicodeBlock)) {

                    if (isCJKUnicodeBlock(unicodeBlock)) {
                        // We don't know if this is Chinese, Japanese or Korean -
                        // trying to figure out by looking at other characters in the name
                        return guessCJKNameStyle(name, offset + Character.charCount(codePoint));
                    }

                    if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                        return FullNameStyle.JAPANESE;
                    }

                    if (isKoreanUnicodeBlock(unicodeBlock)) {
                        return FullNameStyle.KOREAN;
                    }
                }
                nameStyle = FullNameStyle.WESTERN;
            }
            offset += Character.charCount(codePoint);
        }
        return nameStyle;
    }

    private int guessCJKNameStyle(String name, int offset) {
        int length = name.length();
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                    return FullNameStyle.JAPANESE;
                }
                if (isKoreanUnicodeBlock(unicodeBlock)) {
                    return FullNameStyle.KOREAN;
                }
            }
            offset += Character.charCount(codePoint);
        }

        return FullNameStyle.CJK;
    }

    private void guessPhoneticNameStyle(NameSplitter.Name name) {
        if (name.phoneticNameStyle != PhoneticNameStyle.UNDEFINED) {
            return;
        }

        int bestGuess = guessPhoneticNameStyle(name.phoneticFamilyName);
        if (bestGuess != FullNameStyle.UNDEFINED && bestGuess != FullNameStyle.CJK) {
            name.phoneticNameStyle = bestGuess;
            return;
        }

        int guess = guessPhoneticNameStyle(name.phoneticGivenName);
        if (guess != FullNameStyle.UNDEFINED) {
            if (guess != FullNameStyle.CJK) {
                name.phoneticNameStyle = guess;
                return;
            }
            bestGuess = guess;
        }

        guess = guessPhoneticNameStyle(name.phoneticMiddleName);
        if (guess != FullNameStyle.UNDEFINED) {
            if (guess != FullNameStyle.CJK) {
                name.phoneticNameStyle = guess;
                return;
            }
            bestGuess = guess;
        }
    }

    public int guessPhoneticNameStyle(String name) {
        if (name == null) {
            return PhoneticNameStyle.UNDEFINED;
        }

        int nameStyle = PhoneticNameStyle.UNDEFINED;
        int length = name.length();
        int offset = 0;
        while (offset < length) {
            int codePoint = Character.codePointAt(name, offset);
            if (Character.isLetter(codePoint)) {
                UnicodeBlock unicodeBlock = UnicodeBlock.of(codePoint);
                if (isJapanesePhoneticUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.JAPANESE;
                }
                if (isKoreanUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.KOREAN;
                }
                if (isLatinUnicodeBlock(unicodeBlock)) {
                    return PhoneticNameStyle.PINYIN;
                }
            }
            offset += Character.charCount(codePoint);
        }

        return nameStyle;
    }

    private static boolean isLatinUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.BASIC_LATIN ||
                unicodeBlock == UnicodeBlock.LATIN_1_SUPPLEMENT ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_A ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_B ||
                unicodeBlock == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL;
    }

    private static boolean isCJKUnicodeBlock(UnicodeBlock block) {
        return block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || block == UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || block == UnicodeBlock.CJK_RADICALS_SUPPLEMENT
                || block == UnicodeBlock.CJK_COMPATIBILITY
                || block == UnicodeBlock.CJK_COMPATIBILITY_FORMS
                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT;
    }

    private static boolean isKoreanUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.HANGUL_SYLLABLES ||
                unicodeBlock == UnicodeBlock.HANGUL_JAMO ||
                unicodeBlock == UnicodeBlock.HANGUL_COMPATIBILITY_JAMO;
    }

    private static boolean isJapanesePhoneticUnicodeBlock(UnicodeBlock unicodeBlock) {
        return unicodeBlock == UnicodeBlock.KATAKANA ||
                unicodeBlock == UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS ||
                unicodeBlock == UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS ||
                unicodeBlock == UnicodeBlock.HIRAGANA;
    }
}