NameSplitter.java revision c19e02a37399c55b852d6570f73553e859b0139a
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */
package com.android.providers.contacts;

import java.util.HashSet;
import java.util.Locale;
import java.util.StringTokenizer;

/**
 * Splits a full name into prefix, given names, middle name, family name and
 * suffix. The logic only supports having a single last name. If the full name
 * has multiple last names the output will be incorrect.
 * <p>
 * Core algorithm:
 * <ol>
 * <li>Remove the prefix (Mr., Pastor, Reverend, Sir) if the first word is one.</li>
 * <li>Remove the suffix (III, Ph.D., M.D.), unless the name has only one or two
 * words.</li>
 * <li>Assign the last remaining token as the last name.</li>
 * <li>If the previous word to the last name is one from the last name prefixes,
 * use this word also as the last name.</li>
 * <li>Assign the next word from the end as the middle name, unless it is
 * preceded by a conjunction.</li>
 * <li>Assign the rest of the words as the "given names".</li>
 * </ol>
 * <p>
 * Dots that directly follow a word in the full name ("Mr.", "Q.") are kept in
 * the component that the word ends up in. Words are compared against the
 * configured lists case-insensitively, using locale-independent upper-casing.
 */
public class NameSplitter {

    private final HashSet<String> mPrefixesSet;
    private final HashSet<String> mSuffixesSet;
    private final int mMaxSuffixLength;
    private final HashSet<String> mLastNamePrefixesSet;
    private final HashSet<String> mConjunctions;

    /**
     * Holder for the components of a parsed name. Every component is
     * {@code null} until {@link NameSplitter#split} assigns it.
     */
    public static class Name {
        private String prefix;
        private String givenNames;
        private String middleName;
        private String familyName;
        private String suffix;

        /** Returns the name prefix, e.g. "Mr.", or null if none was found. */
        public String getPrefix() {
            return prefix;
        }

        /** Returns the space-separated given names, or null if none were found. */
        public String getGivenNames() {
            return givenNames;
        }

        /** Returns the middle name, or null if none was found. */
        public String getMiddleName() {
            return middleName;
        }

        /** Returns the family name, including any last name prefix, or null. */
        public String getFamilyName() {
            return familyName;
        }

        /** Returns the name suffix, e.g. "M.D.", or null if none was found. */
        public String getSuffix() {
            return suffix;
        }
    }

    /**
     * Breaks a full name into at most {@link #MAX_TOKENS} words and remembers
     * which of them were directly followed by a dot. The parse methods consume
     * words by moving {@code mStartPointer} forward and {@code mEndPointer}
     * backward; the words still available are
     * {@code mTokens[mStartPointer .. mEndPointer - 1]}.
     */
    private static class NameTokenizer {
        private static final int MAX_TOKENS = 10;
        private final String[] mTokens;
        // Bit i is set when mTokens[i] was followed by a dot. MAX_TOKENS is well
        // below 32, so an int is wide enough.
        private int mDotBitmask;
        private int mStartPointer;
        private int mEndPointer;

        public NameTokenizer(String fullName) {
            mTokens = new String[MAX_TOKENS];

            // Delimiters are returned as tokens too, so that dots can be detected.
            final StringTokenizer tokenizer = new StringTokenizer(fullName, " .,", true);
            while (tokenizer.hasMoreTokens() && mEndPointer < MAX_TOKENS) {
                final String token = tokenizer.nextToken();

                // StringTokenizer never returns an empty token, so charAt(0) is safe.
                final char first = token.charAt(0);
                if (first == ' ' || first == ',') {
                    continue;
                }

                if (first == '.' && mEndPointer > 0) {
                    mDotBitmask |= (1 << (mEndPointer - 1));
                } else {
                    // Note: a dot with no preceding word has nothing to attach to
                    // and is stored as a word of its own.
                    mTokens[mEndPointer] = token;
                    mEndPointer++;
                }
            }
        }

        /**
         * Returns true if the token is followed by a dot in the original full name.
         */
        public boolean hasDot(int index) {
            return (mDotBitmask & (1 << index)) != 0;
        }

        /**
         * Returns the token at the given index, with a trailing dot appended if
         * the token was followed by one in the original full name.
         */
        public String tokenWithDot(int index) {
            return hasDot(index) ? mTokens[index] + '.' : mTokens[index];
        }
    }

    /**
     * Constructor.
     *
     * @param commonPrefixes comma-separated list of common prefixes,
     *            e.g. "Mr, Ms, Mrs"
     * @param commonLastNamePrefixes comma-separated list of common last name prefixes,
     *            e.g. "d', st, st., von"
     * @param commonSuffixes comma-separated list of common suffixes,
     *            e.g. "Jr, M.D., MD, D.D.S."
     * @param commonConjunctions comma-separated list of common conjunctions,
     *            e.g. "AND, Or"
     */
    public NameSplitter(String commonPrefixes, String commonLastNamePrefixes,
            String commonSuffixes, String commonConjunctions) {
        mPrefixesSet = convertToSet(commonPrefixes);
        mLastNamePrefixesSet = convertToSet(commonLastNamePrefixes);
        mSuffixesSet = convertToSet(commonSuffixes);
        mConjunctions = convertToSet(commonConjunctions);

        // Remember the longest suffix so that parseSuffix can stop early.
        int maxLength = 0;
        for (String suffix : mSuffixesSet) {
            maxLength = Math.max(maxLength, suffix.length());
        }

        mMaxSuffixLength = maxLength;
    }

    /**
     * Upper-cases a word for lookup in one of the configured sets. A fixed
     * locale is used so that the result does not depend on the default locale
     * (e.g. under a Turkish locale "iii" would otherwise not become "III").
     */
    private static String normalize(String word) {
        return word.toUpperCase(Locale.US);
    }

    /**
     * Converts a comma-separated list of Strings to a set of Strings. Trims strings
     * and converts them to upper case. A null list yields an empty set.
     */
    private static HashSet<String> convertToSet(String strings) {
        HashSet<String> set = new HashSet<String>();
        if (strings != null) {
            for (String item : strings.split(",")) {
                set.add(normalize(item.trim()));
            }
        }
        return set;
    }

    /**
     * Parses a full name and stores the parsed components in the Name object.
     * Only the components that are found are assigned; all other fields of
     * {@code name} keep whatever value they had before the call.
     *
     * @param name receives the parsed components
     * @param fullName the name to parse; if null, {@code name} is left untouched
     */
    public void split(Name name, String fullName) {
        if (fullName == null) {
            return;
        }

        NameTokenizer tokens = new NameTokenizer(fullName);
        parsePrefix(name, tokens);

        // If the name consists of just one or two tokens, treat them as first/last name,
        // not as suffix. Example: John Ma; Ma is last name, not "M.A.".
        if (tokens.mEndPointer > 2) {
            parseSuffix(name, tokens);
        }

        parseLastName(name, tokens);
        parseMiddleName(name, tokens);
        parseGivenNames(name, tokens);
    }

    /**
     * Parses the first word from the name if it is a prefix.
     */
    private void parsePrefix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        final int first = tokens.mStartPointer;
        if (mPrefixesSet.contains(normalize(tokens.mTokens[first]))) {
            name.prefix = tokens.tokenWithDot(first);
            tokens.mStartPointer++;
        }
    }

    /**
     * Parses the last word(s) from the name if it is a suffix.
     */
    private void parseSuffix(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        final int last = tokens.mEndPointer - 1;
        if (tokens.mTokens[last].length() > mMaxSuffixLength) {
            return;
        }

        String normalized = normalize(tokens.mTokens[last]);
        String suffix = tokens.tokenWithDot(last);

        // Single-word suffix such as "Jr" or "III".
        if (mSuffixesSet.contains(normalized)) {
            name.suffix = suffix;
            tokens.mEndPointer--;
            return;
        }

        // Take care of suffixes like M.D. and D.D.S.: keep prepending the previous
        // word, joined with dots, until the candidate matches or becomes too long.
        // The lookup key always uses dots, so "M D" matches "M.D." as well.
        normalized += ".";
        int pos = last;
        while (normalized.length() <= mMaxSuffixLength) {

            if (mSuffixesSet.contains(normalized)) {
                name.suffix = suffix;
                tokens.mEndPointer = pos;
                return;
            }

            if (pos == tokens.mStartPointer) {
                break;
            }

            pos--;
            // The reported suffix keeps the separator that was actually typed.
            suffix = tokens.mTokens[pos] + (tokens.hasDot(pos) ? "." : " ") + suffix;
            normalized = normalize(tokens.mTokens[pos]) + "." + normalized;
        }
    }

    /**
     * Takes the last remaining word as the family name, together with the word
     * before it if that word is a known last name prefix.
     */
    private void parseLastName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        name.familyName = tokens.mTokens[tokens.mEndPointer - 1];
        tokens.mEndPointer--;

        // Take care of last names like "D'Onofrio" and "von Cliburn"
        if ((tokens.mEndPointer - tokens.mStartPointer) > 0) {
            final int prefixIndex = tokens.mEndPointer - 1;
            final String normalized = normalize(tokens.mTokens[prefixIndex]);
            if (mLastNamePrefixesSet.contains(normalized)
                    || mLastNamePrefixesSet.contains(normalized + ".")) {
                name.familyName = tokens.tokenWithDot(prefixIndex) + " " + name.familyName;
                tokens.mEndPointer--;
            }
        }
    }

    /**
     * Takes the last remaining word as the middle name, provided at least one
     * other word remains for the given names. When more than two words remain
     * and the word before the candidate is a conjunction (as in "John and
     * Jane"), no middle name is assigned and all words stay with the given names.
     */
    private void parseMiddleName(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        final int remaining = tokens.mEndPointer - tokens.mStartPointer;
        if (remaining > 1) {
            if (remaining == 2
                    || !mConjunctions.contains(normalize(tokens.mTokens[tokens.mEndPointer - 2]))) {
                name.middleName = tokens.tokenWithDot(tokens.mEndPointer - 1);
                tokens.mEndPointer--;
            }
        }
    }

    /**
     * Joins all remaining words with single spaces and stores them as the given
     * names.
     */
    private void parseGivenNames(Name name, NameTokenizer tokens) {
        if (tokens.mStartPointer == tokens.mEndPointer) {
            return;
        }

        StringBuilder sb = new StringBuilder();
        for (int i = tokens.mStartPointer; i < tokens.mEndPointer; i++) {
            if (i != tokens.mStartPointer) {
                sb.append(' ');
            }
            sb.append(tokens.tokenWithDot(i));
        }
        name.givenNames = sb.toString();
    }
}