NameSplitter.java revision c19e02a37399c55b852d6570f73553e859b0139a
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16package com.android.providers.contacts;
17
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
20
/**
 * Splits a full name into prefix, given names, middle name, family name and
 * suffix. The logic only supports a single family name (optionally preceded by
 * one family-name prefix such as "von"); if the full name has several family
 * names the output will be incorrect.
 * <p>
 * Core algorithm:
 * <ol>
 * <li>Tokenize the name on spaces, commas and dots, remembering which tokens
 * were followed by a dot. At most 10 tokens are kept.</li>
 * <li>Remove the prefix (Mr., Pastor, Reverend, Sir) if the first token is
 * one.</li>
 * <li>Remove the suffix (III, Ph.D., M.D.) if the trailing token(s) form
 * one.</li>
 * <li>Assign the last remaining token as the family name.</li>
 * <li>If the token before the family name is a known family-name prefix, make
 * it part of the family name.</li>
 * <li>Assign the token before the family name as the middle name, unless it
 * directly follows a conjunction (as in "John and Mary Smith").</li>
 * <li>Assign the rest of the tokens as the given names.</li>
 * </ol>
 * All comparisons against the configured word lists are case-insensitive and
 * locale-independent.
 */
public class NameSplitter {

    private final Set<String> mPrefixesSet;
    private final Set<String> mSuffixesSet;
    /** Length of the longest configured suffix; longer candidates are rejected early. */
    private final int mMaxSuffixLength;
    private final Set<String> mLastNamePrefixesSet;
    private final Set<String> mConjunctions;

    /**
     * Holder for the components of a parsed name. Components that were not
     * found in the full name are {@code null}.
     */
    public static class Name {
        private String prefix;
        private String givenNames;
        private String middleName;
        private String familyName;
        private String suffix;

        public String getPrefix() {
            return prefix;
        }

        public String getGivenNames() {
            return givenNames;
        }

        public String getMiddleName() {
            return middleName;
        }

        public String getFamilyName() {
            return familyName;
        }

        public String getSuffix() {
            return suffix;
        }

        /**
         * Resets every component to {@code null} so the object can be reused.
         */
        public void clear() {
            prefix = null;
            givenNames = null;
            middleName = null;
            familyName = null;
            suffix = null;
        }
    }

    /**
     * Breaks a full name into at most {@link #MAX_TOKENS} word tokens. Spaces,
     * commas and dots are delimiters; a dot is additionally recorded against
     * the word token that precedes it. The window
     * {@code [mStartPointer, mEndPointer)} marks the tokens that have not yet
     * been assigned to a name component.
     */
    private static final class NameTokenizer {
        private static final int MAX_TOKENS = 10;
        private final String[] mTokens = new String[MAX_TOKENS];
        /** Bit i is set when token i was followed by a dot. */
        private int mDotBitmask;
        private int mStartPointer;
        private int mEndPointer;

        NameTokenizer(String fullName) {
            // Delimiters are returned as single-character tokens so that dots
            // can be attributed to the word in front of them.
            final StringTokenizer tokenizer = new StringTokenizer(fullName, " .,", true);

            while (tokenizer.hasMoreTokens() && mEndPointer < MAX_TOKENS) {
                final String token = tokenizer.nextToken();
                final char first = token.charAt(0);

                if (first == ' ' || first == ',') {
                    continue;
                }

                if (first == '.') {
                    // A dot is never a name token. A dot that appears before
                    // any word has nothing to attach to and is dropped.
                    if (mEndPointer > 0) {
                        mDotBitmask |= (1 << (mEndPointer - 1));
                    }
                    continue;
                }

                mTokens[mEndPointer] = token;
                mEndPointer++;
            }
        }

        /**
         * Returns true if the token is followed by a dot in the original full name.
         */
        boolean hasDot(int index) {
            return (mDotBitmask & (1 << index)) != 0;
        }
    }

    /**
     * Constructor.
     *
     * @param commonPrefixes comma-separated list of common prefixes,
     *            e.g. "Mr, Ms, Mrs"
     * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
     *           e.g. "d', st, st., von"
     * @param commonSuffixes comma-separated list of common suffixes,
     *            e.g. "Jr, M.D., MD, D.D.S."
     * @param commonConjunctions comma-separated list of common conjunctions,
     *            e.g. "AND, Or"
     */
    public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
            String commonSuffixes, String commonConjunctions) {
        mPrefixesSet = convertToSet(commonPrefixes);
        mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
        mSuffixesSet = convertToSet(commonSuffixes);
        mConjunctions = convertToSet(commonConjunctions);

        int maxLength = 0;
        for (String suffix : mSuffixesSet) {
            maxLength = Math.max(maxLength, suffix.length());
        }
        mMaxSuffixLength = maxLength;
    }

    /**
     * Upper-cases a word for lookup in the configured sets. Uses
     * {@link Locale#ROOT} so the result does not depend on the default locale
     * (e.g. Turkish dotted/dotless i).
     */
    private static String normalize(String word) {
        return word.toUpperCase(Locale.ROOT);
    }

    /**
     * Converts a comma-separated list of Strings to a set of Strings. Trims strings
     * and converts them to upper case. Blank entries are ignored; a {@code null}
     * list yields an empty set.
     */
    private static Set<String> convertToSet(String strings) {
        final Set<String> set = new HashSet<String>();
        if (strings != null) {
            for (String entry : strings.split(",")) {
                final String trimmed = entry.trim();
                if (trimmed.length() > 0) {
                    set.add(normalize(trimmed));
                }
            }
        }
        return set;
    }

    /**
     * Parses a full name and returns parsed components in the Name object.
     * Any components left in {@code name} by an earlier call are cleared first,
     * so components absent from {@code fullName} read as {@code null}.
     *
     * @param name receives the parsed components
     * @param fullName the name to parse; if {@code null} this method does
     *            nothing and {@code name} is left untouched
     */
    public void split(Name name, String fullName) {
        if (fullName == null) {
            return;
        }

        name.clear();

        NameTokenizer tokens = new NameTokenizer(fullName);
        parsePrefix(name, tokens);

        // If the name consists of just one or two tokens, treat them as first/last name,
        // not as suffix.  Example: John Ma; Ma is last name, not "M.A.".
        // Note that the count used here is the total number of tokens,
        // including a prefix token consumed above.
        if (tokens.mEndPointer > 2) {
            parseSuffix(name, tokens);
        }

        parseLastName(name, tokens);
        parseMiddleName(name, tokens);
        parseGivenNames(name, tokens);
    }

    /**
     * Parses the first word from the name if it is a prefix. The prefix is
     * stored as the bare token, without a trailing dot.
     */
    private void parsePrefix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        final String firstToken = tokens.mTokens[tokens.mStartPointer];
        if (mPrefixesSet.contains(normalize(firstToken))) {
            name.prefix = firstToken;
            tokens.mStartPointer++;
        }
    }

    /**
     * Parses the last word(s) from the name if it is a suffix. A single token
     * is matched first ("Jr", "MD"); otherwise trailing tokens are joined with
     * dots and matched as an abbreviation ("M.D.", "D.D.S.").
     */
    private void parseSuffix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        final int lastIndex = tokens.mEndPointer - 1;
        String candidate = tokens.mTokens[lastIndex];
        if (candidate.length() > mMaxSuffixLength) {
            return;
        }

        String normalized = normalize(candidate);
        if (mSuffixesSet.contains(normalized)) {
            name.suffix = candidate;
            tokens.mEndPointer--;
            return;
        }

        // 'candidate' keeps the spelling from the original name, while
        // 'normalized' is always dot-separated for lookup in the suffix set.
        if (tokens.hasDot(lastIndex)) {
            candidate += '.';
        }
        normalized += ".";

        // Take care of suffixes like M.D. and D.D.S. by prepending one token
        // at a time until the abbreviation is too long to be a known suffix.
        int pos = lastIndex;
        while (normalized.length() <= mMaxSuffixLength) {

            if (mSuffixesSet.contains(normalized)) {
                name.suffix = candidate;
                tokens.mEndPointer = pos;
                return;
            }

            if (pos == tokens.mStartPointer) {
                break;
            }

            pos--;
            final String previous = tokens.mTokens[pos];
            candidate = previous + (tokens.hasDot(pos) ? "." : " ") + candidate;
            normalized = normalize(previous) + "." + normalized;
        }
    }

    /**
     * Takes the last remaining token as the family name and, if the token
     * before it is a known family-name prefix, prepends that prefix.
     */
    private void parseLastName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
        tokens.mEndPointer--;

        // Take care of last names like "D'Onofrio" and "von Cliburn"
        if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
            final int prefixIndex = tokens.mEndPointer - 1;
            String lastNamePrefix = tokens.mTokens[prefixIndex];
            final String normalized = normalize(lastNamePrefix);
            if (mLastNamePrefixesSet.contains(normalized)
                    || mLastNamePrefixesSet.contains(normalized + ".")) {
                if (tokens.hasDot(prefixIndex)) {
                    lastNamePrefix += '.';
                }
                name.familyName = lastNamePrefix + " " + name.familyName;
                tokens.mEndPointer--;
            }
        }
    }

    /**
     * Takes the last remaining token as the middle name when at least two
     * tokens remain, unless three or more remain and the token before it is a
     * conjunction (so "John and Mary" stays together as given names).
     */
    private void parseMiddleName(Name name, NameTokenizer tokens) {
        final int remaining = tokens.mEndPointer - tokens.mStartPointer;
        if (remaining <= 1) {
            return;
        }

        if (remaining == 2
                || !mConjunctions.contains(normalize(tokens.mTokens[tokens.mEndPointer - 2]))) {
            name.middleName = tokens.mTokens[tokens.mEndPointer - 1];
            tokens.mEndPointer--;
        }
    }

    /**
     * Assigns all remaining tokens as the given names. Several tokens are
     * joined with single spaces and keep their trailing dots; a single token
     * is stored as-is, without a trailing dot.
     */
    private void parseGivenNames(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        if ((tokens.mEndPointer - tokens.mStartPointer) == 1) {
            name.givenNames = tokens.mTokens[tokens.mStartPointer];
            return;
        }

        final StringBuilder sb = new StringBuilder();
        for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
            if (i != tokens.mStartPointer) {
                sb.append(' ');
            }
            sb.append(tokens.mTokens[i]);
            if (tokens.hasDot(i)) {
                sb.append('.');
            }
        }
        name.givenNames = sb.toString();
    }
}
304