StringUtils.java revision 2b39fab829ec93f7764d178167ac8d67e6e36491
1/* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.inputmethod.latin.utils; 18 19import android.text.TextUtils; 20 21import com.android.inputmethod.annotations.UsedForTesting; 22import com.android.inputmethod.latin.Constants; 23import com.android.inputmethod.latin.settings.SettingsValues; 24 25import java.util.ArrayList; 26import java.util.Locale; 27 28public final class StringUtils { 29 public static final int CAPITALIZE_NONE = 0; // No caps, or mixed case 30 public static final int CAPITALIZE_FIRST = 1; // First only 31 public static final int CAPITALIZE_ALL = 2; // All caps 32 33 private StringUtils() { 34 // This utility class is not publicly instantiable. 35 } 36 37 public static int codePointCount(final String text) { 38 if (TextUtils.isEmpty(text)) return 0; 39 return text.codePointCount(0, text.length()); 40 } 41 42 public static boolean containsInArray(final String text, final String[] array) { 43 for (final String element : array) { 44 if (text.equals(element)) return true; 45 } 46 return false; 47 } 48 49 /** 50 * Comma-Splittable Text is similar to Comma-Separated Values (CSV) but has much simpler syntax. 51 * Unlike CSV, Comma-Splittable Text has no escaping mechanism, so that the text can't contain 52 * a comma character in it. 53 */ 54 private static final String SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT = ","; 55 56 public static boolean containsInCommaSplittableText(final String text, 57 final String extraValues) { 58 if (TextUtils.isEmpty(extraValues)) { 59 return false; 60 } 61 return containsInArray(text, extraValues.split(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT)); 62 } 63 64 public static String appendToCommaSplittableTextIfNotExists(final String text, 65 final String extraValues) { 66 if (TextUtils.isEmpty(extraValues)) { 67 return text; 68 } 69 if (containsInCommaSplittableText(text, extraValues)) { 70 return extraValues; 71 } 72 return extraValues + SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT + text; 73 } 74 75 public static String removeFromCommaSplittableTextIfExists(final String text, 76 final String extraValues) { 77 if (TextUtils.isEmpty(extraValues)) { 78 return ""; 79 } 80 final String[] elements = extraValues.split(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT); 81 if (!containsInArray(text, elements)) { 82 return extraValues; 83 } 84 final ArrayList<String> result = CollectionUtils.newArrayList(elements.length - 1); 85 for (final String element : elements) { 86 if (!text.equals(element)) { 87 result.add(element); 88 } 89 } 90 return TextUtils.join(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT, result); 91 } 92 93 /** 94 * Remove duplicates from an array of strings. 95 * 96 * This method will always keep the first occurrence of all strings at their position 97 * in the array, removing the subsequent ones. 98 */ 99 public static void removeDupes(final ArrayList<String> suggestions) { 100 if (suggestions.size() < 2) return; 101 int i = 1; 102 // Don't cache suggestions.size(), since we may be removing items 103 while (i < suggestions.size()) { 104 final String cur = suggestions.get(i); 105 // Compare each suggestion with each previous suggestion 106 for (int j = 0; j < i; j++) { 107 final String previous = suggestions.get(j); 108 if (TextUtils.equals(cur, previous)) { 109 suggestions.remove(i); 110 i--; 111 break; 112 } 113 } 114 i++; 115 } 116 } 117 118 public static String capitalizeFirstCodePoint(final String s, final Locale locale) { 119 if (s.length() <= 1) { 120 return s.toUpperCase(locale); 121 } 122 // Please refer to the comment below in 123 // {@link #capitalizeFirstAndDowncaseRest(String,Locale)} as this has the same shortcomings 124 final int cutoff = s.offsetByCodePoints(0, 1); 125 return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff); 126 } 127 128 public static String capitalizeFirstAndDowncaseRest(final String s, final Locale locale) { 129 if (s.length() <= 1) { 130 return s.toUpperCase(locale); 131 } 132 // TODO: fix the bugs below 133 // - This does not work for Greek, because it returns upper case instead of title case. 134 // - It does not work for Serbian, because it fails to account for the "lj" character, 135 // which should be "Lj" in title case and "LJ" in upper case. 136 // - It does not work for Dutch, because it fails to account for the "ij" digraph when it's 137 // written as two separate code points. They are two different characters but both should 138 // be capitalized as "IJ" as if they were a single letter in most words (not all). If the 139 // unicode char for the ligature is used however, it works. 140 final int cutoff = s.offsetByCodePoints(0, 1); 141 return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff).toLowerCase(locale); 142 } 143 144 private static final int[] EMPTY_CODEPOINTS = {}; 145 146 public static int[] toCodePointArray(final String string) { 147 final int length = string.length(); 148 if (length <= 0) { 149 return EMPTY_CODEPOINTS; 150 } 151 final int[] codePoints = new int[string.codePointCount(0, length)]; 152 int destIndex = 0; 153 for (int index = 0; index < length; index = string.offsetByCodePoints(index, 1)) { 154 codePoints[destIndex] = string.codePointAt(index); 155 destIndex++; 156 } 157 return codePoints; 158 } 159 160 // This method assumes the text is not null. For the empty string, it returns CAPITALIZE_NONE. 161 public static int getCapitalizationType(final String text) { 162 // If the first char is not uppercase, then the word is either all lower case or 163 // camel case, and in either case we return CAPITALIZE_NONE. 164 final int len = text.length(); 165 int index = 0; 166 for (; index < len; index = text.offsetByCodePoints(index, 1)) { 167 if (Character.isLetter(text.codePointAt(index))) { 168 break; 169 } 170 } 171 if (index == len) return CAPITALIZE_NONE; 172 if (!Character.isUpperCase(text.codePointAt(index))) { 173 return CAPITALIZE_NONE; 174 } 175 int capsCount = 1; 176 int letterCount = 1; 177 for (index = text.offsetByCodePoints(index, 1); index < len; 178 index = text.offsetByCodePoints(index, 1)) { 179 if (1 != capsCount && letterCount != capsCount) break; 180 final int codePoint = text.codePointAt(index); 181 if (Character.isUpperCase(codePoint)) { 182 ++capsCount; 183 ++letterCount; 184 } else if (Character.isLetter(codePoint)) { 185 // We need to discount non-letters since they may not be upper-case, but may 186 // still be part of a word (e.g. single quote or dash, as in "IT'S" or "FULL-TIME") 187 ++letterCount; 188 } 189 } 190 // We know the first char is upper case. So we want to test if either every letter other 191 // than the first is lower case, or if they are all upper case. If the string is exactly 192 // one char long, then we will arrive here with letterCount 1, and this is correct, too. 193 if (1 == capsCount) return CAPITALIZE_FIRST; 194 return (letterCount == capsCount ? CAPITALIZE_ALL : CAPITALIZE_NONE); 195 } 196 197 public static boolean isIdenticalAfterUpcase(final String text) { 198 final int length = text.length(); 199 int i = 0; 200 while (i < length) { 201 final int codePoint = text.codePointAt(i); 202 if (Character.isLetter(codePoint) && !Character.isUpperCase(codePoint)) { 203 return false; 204 } 205 i += Character.charCount(codePoint); 206 } 207 return true; 208 } 209 210 public static boolean isIdenticalAfterDowncase(final String text) { 211 final int length = text.length(); 212 int i = 0; 213 while (i < length) { 214 final int codePoint = text.codePointAt(i); 215 if (Character.isLetter(codePoint) && !Character.isLowerCase(codePoint)) { 216 return false; 217 } 218 i += Character.charCount(codePoint); 219 } 220 return true; 221 } 222 223 @UsedForTesting 224 public static boolean looksValidForDictionaryInsertion(final CharSequence text, 225 final SettingsValues settings) { 226 if (TextUtils.isEmpty(text)) return false; 227 final int length = text.length(); 228 int i = 0; 229 int digitCount = 0; 230 while (i < length) { 231 final int codePoint = Character.codePointAt(text, i); 232 final int charCount = Character.charCount(codePoint); 233 i += charCount; 234 if (Character.isDigit(codePoint)) { 235 // Count digits: see below 236 digitCount += charCount; 237 continue; 238 } 239 if (!settings.isWordCodePoint(codePoint)) return false; 240 } 241 // We reject strings entirely comprised of digits to avoid using PIN codes or credit 242 // card numbers. It would come in handy for word prediction though; a good example is 243 // when writing one's address where the street number is usually quite discriminative, 244 // as well as the postal code. 245 return digitCount < length; 246 } 247 248 public static boolean isIdenticalAfterCapitalizeEachWord(final String text, 249 final String separators) { 250 boolean needCapsNext = true; 251 final int len = text.length(); 252 for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) { 253 final int codePoint = text.codePointAt(i); 254 if (Character.isLetter(codePoint)) { 255 if ((needCapsNext && !Character.isUpperCase(codePoint)) 256 || (!needCapsNext && !Character.isLowerCase(codePoint))) { 257 return false; 258 } 259 } 260 // We need a capital letter next if this is a separator. 261 needCapsNext = (-1 != separators.indexOf(codePoint)); 262 } 263 return true; 264 } 265 266 // TODO: like capitalizeFirst*, this does not work perfectly for Dutch because of the IJ digraph 267 // which should be capitalized together in *some* cases. 268 public static String capitalizeEachWord(final String text, final String separators, 269 final Locale locale) { 270 final StringBuilder builder = new StringBuilder(); 271 boolean needCapsNext = true; 272 final int len = text.length(); 273 for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) { 274 final String nextChar = text.substring(i, text.offsetByCodePoints(i, 1)); 275 if (needCapsNext) { 276 builder.append(nextChar.toUpperCase(locale)); 277 } else { 278 builder.append(nextChar.toLowerCase(locale)); 279 } 280 // We need a capital letter next if this is a separator. 281 needCapsNext = (-1 != separators.indexOf(nextChar.codePointAt(0))); 282 } 283 return builder.toString(); 284 } 285 286 /** 287 * Approximates whether the text before the cursor looks like a URL. 288 * 289 * This is not foolproof, but it should work well in the practice. 290 * Essentially it walks backward from the cursor until it finds something that's not a letter, 291 * digit, or common URL symbol like underscore. If it hasn't found a period yet, then it 292 * does not look like a URL. 293 * If the text: 294 * - starts with www and contains a period 295 * - starts with a slash preceded by either a slash, whitespace, or start-of-string 296 * Then it looks like a URL and we return true. Otherwise, we return false. 297 * 298 * Note: this method is called quite often, and should be fast. 299 * 300 * TODO: This will return that "abc./def" and ".abc/def" look like URLs to keep down the 301 * code complexity, but ideally it should not. It's acceptable for now. 302 */ 303 public static boolean lastPartLooksLikeURL(final CharSequence text) { 304 int i = text.length(); 305 if (0 == i) return false; 306 int wCount = 0; 307 int slashCount = 0; 308 boolean hasSlash = false; 309 boolean hasPeriod = false; 310 int codePoint = 0; 311 while (i > 0) { 312 codePoint = Character.codePointBefore(text, i); 313 if (codePoint < Constants.CODE_PERIOD || codePoint > 'z') { 314 // Handwavy heuristic to see if that's a URL character. Anything between period 315 // and z. This includes all lower- and upper-case ascii letters, period, 316 // underscore, arrobase, question mark, equal sign. It excludes spaces, exclamation 317 // marks, double quotes... 318 // Anything that's not a URL-like character causes us to break from here and 319 // evaluate normally. 320 break; 321 } 322 if (Constants.CODE_PERIOD == codePoint) { 323 hasPeriod = true; 324 } 325 if (Constants.CODE_SLASH == codePoint) { 326 hasSlash = true; 327 if (2 == ++slashCount) { 328 return true; 329 } 330 } else { 331 slashCount = 0; 332 } 333 if ('w' == codePoint) { 334 ++wCount; 335 } else { 336 wCount = 0; 337 } 338 i = Character.offsetByCodePoints(text, i, -1); 339 } 340 // End of the text run. 341 // If it starts with www and includes a period, then it looks like a URL. 342 if (wCount >= 3 && hasPeriod) return true; 343 // If it starts with a slash, and the code point before is whitespace, it looks like an URL. 344 if (1 == slashCount && (0 == i || Character.isWhitespace(codePoint))) return true; 345 // If it has both a period and a slash, it looks like an URL. 346 if (hasPeriod && hasSlash) return true; 347 // Otherwise, it doesn't look like an URL. 348 return false; 349 } 350 351 public static boolean isEmptyStringOrWhiteSpaces(String s) { 352 final int N = codePointCount(s); 353 for (int i = 0; i < N; ++i) { 354 if (!Character.isWhitespace(s.codePointAt(i))) { 355 return false; 356 } 357 } 358 return true; 359 } 360} 361