StringUtils.java revision 73065b778c30eecd5913cca5ac42746537dec495
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.utils;

import static com.android.inputmethod.latin.Constants.CODE_UNSPECIFIED;

import android.text.TextUtils;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Constants;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;

/**
 * Static helpers for code-point-aware string handling: capitalization, conversion between
 * strings and code point arrays, comma-splittable text, and a few text heuristics.
 */
public final class StringUtils {
    public static final int CAPITALIZE_NONE = 0;  // No caps, or mixed case
    public static final int CAPITALIZE_FIRST = 1; // First only
    public static final int CAPITALIZE_ALL = 2;   // All caps

    private static final String EMPTY_STRING = "";

    private StringUtils() {
        // This utility class is not publicly instantiable.
    }

    /**
     * Counts the Unicode code points in a string.
     *
     * @param text the string to measure; may be null or empty.
     * @return the number of code points, or 0 when {@code text} is null or empty.
     */
    public static int codePointCount(final String text) {
        if (TextUtils.isEmpty(text)) return 0;
        return text.codePointCount(0, text.length());
    }

    /**
     * Creates a string that contains exactly the given code point.
     *
     * @param codePoint the code point to convert.
     * @return a string of one or two java chars representing {@code codePoint}.
     */
    public static String newSingleCodePointString(int codePoint) {
        if (Character.charCount(codePoint) == 1) {
            // Optimization: avoid creating a temporary array for characters that are
            // represented by a single char value
            return String.valueOf((char) codePoint);
        }
        // For surrogate pair
        return new String(Character.toChars(codePoint));
    }

    /**
     * Tells whether an array contains a string equal to the given text.
     *
     * @param text the string to look for. Must not be null.
     * @param array the array to search. Must not be null; elements may be null.
     * @return true if at least one element equals {@code text}.
     */
    public static boolean containsInArray(final String text, final String[] array) {
        for (final String element : array) {
            if (text.equals(element)) return true;
        }
        return false;
    }

    /**
     * Comma-Splittable Text is similar to Comma-Separated Values (CSV) but has much simpler syntax.
     * Unlike CSV, Comma-Splittable Text has no escaping mechanism, so that the text can't contain
     * a comma character in it.
     */
    private static final String SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT = ",";

    /**
     * Tells whether a comma-splittable text contains the given element.
     *
     * @param text the element to look for. Must not be null.
     * @param extraValues the comma-splittable text; may be null or empty.
     * @return true if one of the comma-separated elements equals {@code text}; false when
     * {@code extraValues} is null or empty.
     */
    public static boolean containsInCommaSplittableText(final String text,
            final String extraValues) {
        if (TextUtils.isEmpty(extraValues)) {
            return false;
        }
        return containsInArray(text, extraValues.split(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT));
    }

    /**
     * Removes every occurrence of an element from a comma-splittable text.
     *
     * @param text the element to remove. Must not be null.
     * @param extraValues the comma-splittable text; may be null or empty.
     * @return the empty string when {@code extraValues} is null or empty, {@code extraValues}
     * itself when it does not contain {@code text}, and otherwise the remaining elements joined
     * with commas.
     */
    public static String removeFromCommaSplittableTextIfExists(final String text,
            final String extraValues) {
        if (TextUtils.isEmpty(extraValues)) {
            return EMPTY_STRING;
        }
        final String[] elements = extraValues.split(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT);
        if (!containsInArray(text, elements)) {
            return extraValues;
        }
        final ArrayList<String> result = new ArrayList<>(elements.length - 1);
        for (final String element : elements) {
            if (!text.equals(element)) {
                result.add(element);
            }
        }
        return TextUtils.join(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT, result);
    }

    /**
     * Remove duplicates from an array of strings.
     *
     * This method will always keep the first occurrence of all strings at their position
     * in the array, removing the subsequent ones.
     */
    public static void removeDupes(final ArrayList<String> suggestions) {
        if (suggestions.size() < 2) return;
        int i = 1;
        // Don't cache suggestions.size(), since we may be removing items
        while (i < suggestions.size()) {
            final String cur = suggestions.get(i);
            // Compare each suggestion with each previous suggestion
            for (int j = 0; j < i; j++) {
                final String previous = suggestions.get(j);
                if (TextUtils.equals(cur, previous)) {
                    suggestions.remove(i);
                    // Stay on the same index: the next item has shifted into position i.
                    i--;
                    break;
                }
            }
            i++;
        }
    }

    /**
     * Upper-cases the first code point of a string and leaves the rest untouched.
     *
     * @param s the string to capitalize. Must not be null.
     * @param locale the locale used for the case conversion.
     * @return the string with its first code point converted to upper case.
     */
    public static String capitalizeFirstCodePoint(final String s, final Locale locale) {
        if (s.length() <= 1) {
            return s.toUpperCase(locale);
        }
        // Please refer to the comment below in
        // {@link #capitalizeFirstAndDowncaseRest(String,Locale)} as this has the same shortcomings
        final int cutoff = s.offsetByCodePoints(0, 1);
        return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff);
    }

    /**
     * Upper-cases the first code point of a string and lower-cases everything after it.
     *
     * @param s the string to convert. Must not be null.
     * @param locale the locale used for the case conversions.
     * @return the converted string.
     */
    public static String capitalizeFirstAndDowncaseRest(final String s, final Locale locale) {
        if (s.length() <= 1) {
            return s.toUpperCase(locale);
        }
        // TODO: fix the bugs below
        // - This does not work for Greek, because it returns upper case instead of title case.
        // - It does not work for Serbian, because it fails to account for the "lj" character,
        // which should be "Lj" in title case and "LJ" in upper case.
        // - It does not work for Dutch, because it fails to account for the "ij" digraph when it's
        // written as two separate code points. They are two different characters but both should
        // be capitalized as "IJ" as if they were a single letter in most words (not all). If the
        // unicode char for the ligature is used however, it works.
        final int cutoff = s.offsetByCodePoints(0, 1);
        return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff).toLowerCase(locale);
    }

    private static final int[] EMPTY_CODEPOINTS = {};

    /**
     * Converts a whole char sequence to an array of code points.
     *
     * @param charSequence the source string. Must not be null.
     * @return a new array of code points, or a shared empty array for an empty sequence.
     */
    public static int[] toCodePointArray(final CharSequence charSequence) {
        return toCodePointArray(charSequence, 0, charSequence.length());
    }

    /**
     * Converts a range of a string to an array of code points.
     * @param charSequence the source string.
     * @param startIndex the start index inside the string in java chars, inclusive.
     * @param endIndex the end index inside the string in java chars, exclusive.
     * @return a new array of code points. At most endIndex - startIndex, but possibly less.
     */
    public static int[] toCodePointArray(final CharSequence charSequence,
            final int startIndex, final int endIndex) {
        final int length = charSequence.length();
        if (length <= 0) {
            return EMPTY_CODEPOINTS;
        }
        final int[] codePoints =
                new int[Character.codePointCount(charSequence, startIndex, endIndex)];
        copyCodePointsAndReturnCodePointCount(codePoints, charSequence, startIndex, endIndex,
                false /* downCase */);
        return codePoints;
    }

    /**
     * Copies the codepoints in a CharSequence to an int array.
     *
     * This method assumes there is enough space in the array to store the code points. The size
     * can be measured with Character#codePointCount(CharSequence, int, int) before passing to this
     * method. If the int array is too small, an ArrayIndexOutOfBoundsException will be thrown.
     * Also, this method makes no effort to be thread-safe. Do not modify the CharSequence while
     * this method is running, or the behavior is undefined.
     * This method can optionally downcase code points before copying them, but it pays no attention
     * to locale while doing so.
     *
     * @param destination the int array.
     * @param charSequence the CharSequence.
     * @param startIndex the start index inside the string in java chars, inclusive.
     * @param endIndex the end index inside the string in java chars, exclusive.
     * @param downCase if this is true, code points will be downcased before being copied.
     * @return the number of copied code points.
     */
    public static int copyCodePointsAndReturnCodePointCount(final int[] destination,
            final CharSequence charSequence, final int startIndex, final int endIndex,
            final boolean downCase) {
        int destIndex = 0;
        for (int index = startIndex; index < endIndex;
                index = Character.offsetByCodePoints(charSequence, index, 1)) {
            final int codePoint = Character.codePointAt(charSequence, index);
            // TODO: stop using this, as it's not aware of the locale and does not always do
            // the right thing.
            destination[destIndex] = downCase ? Character.toLowerCase(codePoint) : codePoint;
            destIndex++;
        }
        return destIndex;
    }

    /**
     * Converts a string to an array of its code points, sorted in ascending order.
     *
     * @param string the source string. Must not be null.
     * @return a sorted array of the code points of {@code string}.
     */
    public static int[] toSortedCodePointArray(final String string) {
        final int[] codePoints = toCodePointArray(string);
        Arrays.sort(codePoints);
        return codePoints;
    }

    /**
     * Construct a String from a code point array
     *
     * @param codePoints a code point array that is null terminated when its logical length is
     * shorter than the array length.
     * @return a string constructed from the code point array.
     */
    public static String getStringFromNullTerminatedCodePointArray(final int[] codePoints) {
        int stringLength = codePoints.length;
        for (int i = 0; i < codePoints.length; i++) {
            if (codePoints[i] == 0) {
                stringLength = i;
                break;
            }
        }
        return new String(codePoints, 0 /* offset */, stringLength);
    }

    /**
     * Classifies the capitalization of a word.
     *
     * This method assumes the text is not null. For the empty string, it returns CAPITALIZE_NONE.
     *
     * @param text the text to examine.
     * @return {@link #CAPITALIZE_FIRST} if only the first letter is upper case,
     * {@link #CAPITALIZE_ALL} if every letter is upper case, {@link #CAPITALIZE_NONE} otherwise.
     */
    public static int getCapitalizationType(final String text) {
        // If the first char is not uppercase, then the word is either all lower case or
        // camel case, and in either case we return CAPITALIZE_NONE.
        final int len = text.length();
        int index = 0;
        // Skip leading non-letters to find the first letter of the text.
        for (; index < len; index = text.offsetByCodePoints(index, 1)) {
            if (Character.isLetter(text.codePointAt(index))) {
                break;
            }
        }
        if (index == len) return CAPITALIZE_NONE;
        if (!Character.isUpperCase(text.codePointAt(index))) {
            return CAPITALIZE_NONE;
        }
        int capsCount = 1;
        int letterCount = 1;
        for (index = text.offsetByCodePoints(index, 1); index < len;
                index = text.offsetByCodePoints(index, 1)) {
            // Mixed case detected: more than one capital, but not all letters are capitals.
            if (1 != capsCount && letterCount != capsCount) break;
            final int codePoint = text.codePointAt(index);
            if (Character.isUpperCase(codePoint)) {
                ++capsCount;
                ++letterCount;
            } else if (Character.isLetter(codePoint)) {
                // We need to discount non-letters since they may not be upper-case, but may
                // still be part of a word (e.g. single quote or dash, as in "IT'S" or "FULL-TIME")
                ++letterCount;
            }
        }
        // We know the first char is upper case. So we want to test if either every letter other
        // than the first is lower case, or if they are all upper case. If the string is exactly
        // one char long, then we will arrive here with letterCount 1, and this is correct, too.
        if (1 == capsCount) return CAPITALIZE_FIRST;
        return (letterCount == capsCount ? CAPITALIZE_ALL : CAPITALIZE_NONE);
    }

    /**
     * Tells whether every letter of the text is already upper case.
     *
     * @param text the text to examine. Must not be null.
     * @return false if the text contains a letter that is not upper case, true otherwise.
     */
    public static boolean isIdenticalAfterUpcase(final String text) {
        final int length = text.length();
        int i = 0;
        while (i < length) {
            final int codePoint = text.codePointAt(i);
            if (Character.isLetter(codePoint) && !Character.isUpperCase(codePoint)) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Tells whether every letter of the text is already lower case.
     *
     * @param text the text to examine. Must not be null.
     * @return false if the text contains a letter that is not lower case, true otherwise.
     */
    public static boolean isIdenticalAfterDowncase(final String text) {
        final int length = text.length();
        int i = 0;
        while (i < length) {
            final int codePoint = text.codePointAt(i);
            if (Character.isLetter(codePoint) && !Character.isLowerCase(codePoint)) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Tells whether the text is already capitalized word by word: each letter at the start of
     * the text or right after a separator is upper case, and every other letter is lower case.
     *
     * @param text the text to examine. Must not be null.
     * @param sortedSeparators the code points that separate words, sorted in ascending order
     * as required by {@link Arrays#binarySearch(int[], int)}.
     * @return true if the capitalization of every letter matches the rule above.
     */
    public static boolean isIdenticalAfterCapitalizeEachWord(final String text,
            final int[] sortedSeparators) {
        boolean needsCapsNext = true;
        final int len = text.length();
        for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) {
            final int codePoint = text.codePointAt(i);
            if (Character.isLetter(codePoint)) {
                if ((needsCapsNext && !Character.isUpperCase(codePoint))
                        || (!needsCapsNext && !Character.isLowerCase(codePoint))) {
                    return false;
                }
            }
            // We need a capital letter next if this is a separator.
            needsCapsNext = (Arrays.binarySearch(sortedSeparators, codePoint) >= 0);
        }
        return true;
    }

    /**
     * Upper-cases the code point at the start of the text and after each separator, and
     * lower-cases every other code point.
     *
     * TODO: like capitalizeFirst*, this does not work perfectly for Dutch because of the IJ
     * digraph which should be capitalized together in *some* cases.
     *
     * @param text the text to convert. Must not be null.
     * @param sortedSeparators the code points that separate words, sorted in ascending order
     * as required by {@link Arrays#binarySearch(int[], int)}.
     * @param locale the locale used for the case conversions.
     * @return the converted text.
     */
    public static String capitalizeEachWord(final String text, final int[] sortedSeparators,
            final Locale locale) {
        final StringBuilder builder = new StringBuilder();
        boolean needsCapsNext = true;
        final int len = text.length();
        for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) {
            final String nextChar = text.substring(i, text.offsetByCodePoints(i, 1));
            if (needsCapsNext) {
                builder.append(nextChar.toUpperCase(locale));
            } else {
                builder.append(nextChar.toLowerCase(locale));
            }
            // We need a capital letter next if this is a separator.
            needsCapsNext = (Arrays.binarySearch(sortedSeparators, nextChar.codePointAt(0)) >= 0);
        }
        return builder.toString();
    }

    /**
     * Approximates whether the text before the cursor looks like a URL.
     *
     * This is not foolproof, but it should work well in the practice.
     * Essentially it walks backward from the cursor until it finds something that's not a letter,
     * digit, or common URL symbol like underscore. If it hasn't found a period yet, then it
     * does not look like a URL.
     * If the text:
     * - starts with www and contains a period
     * - starts with a slash preceded by either a slash, whitespace, or start-of-string
     * Then it looks like a URL and we return true. Otherwise, we return false.
     *
     * Note: this method is called quite often, and should be fast.
     *
     * TODO: This will return that "abc./def" and ".abc/def" look like URLs to keep down the
     * code complexity, but ideally it should not. It's acceptable for now.
     */
    public static boolean lastPartLooksLikeURL(final CharSequence text) {
        int i = text.length();
        if (0 == i) return false;
        int wCount = 0;
        int slashCount = 0;
        boolean hasSlash = false;
        boolean hasPeriod = false;
        int codePoint = 0;
        while (i > 0) {
            codePoint = Character.codePointBefore(text, i);
            if (codePoint < Constants.CODE_PERIOD || codePoint > 'z') {
                // Handwavy heuristic to see if that's a URL character. Anything between period
                // and z. This includes all lower- and upper-case ascii letters, period,
                // underscore, arrobase, question mark, equal sign. It excludes spaces, exclamation
                // marks, double quotes...
                // Anything that's not a URL-like character causes us to break from here and
                // evaluate normally.
                break;
            }
            if (Constants.CODE_PERIOD == codePoint) {
                hasPeriod = true;
            }
            if (Constants.CODE_SLASH == codePoint) {
                hasSlash = true;
                if (2 == ++slashCount) {
                    return true;
                }
            } else {
                slashCount = 0;
            }
            if ('w' == codePoint) {
                ++wCount;
            } else {
                wCount = 0;
            }
            i = Character.offsetByCodePoints(text, i, -1);
        }
        // End of the text run.
        // If it starts with www and includes a period, then it looks like a URL.
        if (wCount >= 3 && hasPeriod) return true;
        // If it starts with a slash, and the code point before is whitespace, it looks like an URL.
        if (1 == slashCount && (0 == i || Character.isWhitespace(codePoint))) return true;
        // If it has both a period and a slash, it looks like an URL.
        if (hasPeriod && hasSlash) return true;
        // Otherwise, it doesn't look like an URL.
        return false;
    }

    /**
     * Examines the string and returns whether we're inside a double quote.
     *
     * This is used to decide whether we should put an automatic space before or after a double
     * quote character. If we're inside a quotation, then we want to close it, so we want a space
     * after and not before. Otherwise, we want to open the quotation, so we want a space before
     * and not after. Exception: after a digit, we never want a space because the "inch" or
     * "minutes" use cases is dominant after digits.
     * In the practice, we determine whether we are in a quotation or not by finding the previous
     * double quote character, and looking at whether it's followed by whitespace. If so, that
     * was a closing quotation mark, so we're not inside a double quote. If it's not followed
     * by whitespace, then it was an opening quotation mark, and we're inside a quotation.
     *
     * @param text the text to examine.
     * @return whether we're inside a double quote.
     */
    public static boolean isInsideDoubleQuoteOrAfterDigit(final CharSequence text) {
        int i = text.length();
        if (0 == i) return false;
        int codePoint = Character.codePointBefore(text, i);
        if (Character.isDigit(codePoint)) return true;
        int prevCodePoint = 0;
        while (i > 0) {
            codePoint = Character.codePointBefore(text, i);
            if (Constants.CODE_DOUBLE_QUOTE == codePoint) {
                // If we see a double quote followed by whitespace, then that
                // was a closing quote.
                if (Character.isWhitespace(prevCodePoint)) return false;
            }
            if (Character.isWhitespace(codePoint) && Constants.CODE_DOUBLE_QUOTE == prevCodePoint) {
                // If we see a double quote preceded by whitespace, then that
                // was an opening quote. No need to continue seeking.
                return true;
            }
            i -= Character.charCount(codePoint);
            prevCodePoint = codePoint;
        }
        // We reached the start of text. If the first char is a double quote, then we're inside
        // a double quote. Otherwise we're not.
        return Constants.CODE_DOUBLE_QUOTE == codePoint;
    }

    /**
     * Tells whether a string is null, empty, or made only of whitespace.
     *
     * @param s the string to examine; may be null.
     * @return true if {@code s} is null or empty, or if every code point in it is whitespace
     * according to {@link Character#isWhitespace(int)}.
     */
    public static boolean isEmptyStringOrWhiteSpaces(final String s) {
        if (TextUtils.isEmpty(s)) return true;
        final int length = s.length();
        int i = 0;
        // Advance by java chars per code point, so the index passed to codePointAt is always
        // a char index at the start of a code point.
        while (i < length) {
            final int codePoint = s.codePointAt(i);
            if (!Character.isWhitespace(codePoint)) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Converts a byte array to a lower-case hex string, two digits per byte.
     *
     * @param bytes the bytes to convert; may be null or empty.
     * @return the hex representation, or the empty string when {@code bytes} is null or empty.
     */
    @UsedForTesting
    public static String byteArrayToHexString(final byte[] bytes) {
        if (bytes == null || bytes.length == 0) {
            return EMPTY_STRING;
        }
        // Each byte produces exactly two hex digits.
        final StringBuilder sb = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            sb.append(String.format("%02x", b & 0xff));
        }
        return sb.toString();
    }

    /**
     * Convert hex string to byte array. The string length must be an even number.
     *
     * @param hexString the hex string to convert; may be null or empty.
     * @return the decoded bytes, or null when {@code hexString} is null or empty.
     * @throws NumberFormatException if the length of {@code hexString} is odd, or if it contains
     * a character that is not a hexadecimal digit.
     */
    @UsedForTesting
    public static byte[] hexStringToByteArray(final String hexString) {
        if (TextUtils.isEmpty(hexString)) {
            return null;
        }
        final int N = hexString.length();
        if (N % 2 != 0) {
            throw new NumberFormatException("Input hex string length must be an even number."
                    + " Length = " + N);
        }
        final byte[] bytes = new byte[N / 2];
        for (int i = 0; i < N; i += 2) {
            final int high = Character.digit(hexString.charAt(i), 16);
            final int low = Character.digit(hexString.charAt(i + 1), 16);
            // Character.digit returns -1 for a char that is not a digit in the given radix.
            // Reject it rather than folding the -1 into a meaningless byte value.
            if (high < 0 || low < 0) {
                throw new NumberFormatException("Input hex string contains a non-hex digit."
                        + " Index = " + (high < 0 ? i : i + 1));
            }
            bytes[i / 2] = (byte) ((high << 4) + low);
        }
        return bytes;
    }

    /**
     * Upper-cases a string for a locale when requested.
     *
     * @param text the text to convert; may be null.
     * @param needsToUpperCase whether the conversion should be applied.
     * @param locale the locale used for the case conversion.
     * @return {@code text} unchanged when it is null or {@code needsToUpperCase} is false,
     * otherwise {@code text} converted to upper case.
     */
    public static String toUpperCaseOfStringForLocale(final String text,
            final boolean needsToUpperCase, final Locale locale) {
        if (text == null || !needsToUpperCase) return text;
        return text.toUpperCase(locale);
    }

    /**
     * Upper-cases a single code for a locale when requested.
     *
     * @param code the code to convert.
     * @param needsToUpperCase whether the conversion should be applied.
     * @param locale the locale used for the case conversion.
     * @return {@code code} unchanged when {@link Constants#isLetterCode(int)} rejects it or
     * {@code needsToUpperCase} is false. Otherwise the upper-cased code point when the
     * conversion yields exactly one code point, and {@code CODE_UNSPECIFIED} when it does not.
     */
    public static int toUpperCaseOfCodeForLocale(final int code, final boolean needsToUpperCase,
            final Locale locale) {
        if (!Constants.isLetterCode(code) || !needsToUpperCase) return code;
        final String text = newSingleCodePointString(code);
        final String casedText = toUpperCaseOfStringForLocale(
                text, needsToUpperCase, locale);
        return codePointCount(casedText) == 1
                ? casedText.codePointAt(0) : CODE_UNSPECIFIED;
    }

    /**
     * Counts the single quote characters at the end of a char sequence.
     *
     * @param charSequence the text to examine. Must not be null.
     * @return the number of consecutive single quotes that end {@code charSequence}.
     */
    public static int getTrailingSingleQuotesCount(final CharSequence charSequence) {
        final int lastIndex = charSequence.length() - 1;
        int i = lastIndex;
        while (i >= 0 && charSequence.charAt(i) == Constants.CODE_SINGLE_QUOTE) {
            --i;
        }
        return lastIndex - i;
    }

    /**
     * Turns elements and arrays of elements into strings. Subclasses can override
     * {@link #stringize(Object)} to customize how a single element is rendered.
     *
     * @param <E> the element type.
     */
    @UsedForTesting
    public static class Stringizer<E> {
        /**
         * Renders a single element.
         *
         * @param element the element to render; may be null.
         * @return {@code element.toString()}, or the string "null" for a null element.
         */
        public String stringize(final E element) {
            return element != null ? element.toString() : "null";
        }

        /**
         * Renders an array in the format of {@link Arrays#toString(Object[])}.
         *
         * @param array the array to render; may be null.
         * @return the rendered array, or the string "null" for a null array.
         */
        @UsedForTesting
        public final String join(final E[] array) {
            return joinStringArray(toStringArray(array), null /* delimiter */);
        }

        /**
         * Renders an array as its elements separated by a delimiter and enclosed in brackets.
         *
         * @param array the array to render; may be null.
         * @param delimiter the text put between elements. When null, the format of
         * {@link Arrays#toString(Object[])} is used.
         * @return the rendered array, or the string "null" for a null array.
         */
        @UsedForTesting
        public final String join(final E[] array, final String delimiter) {
            return joinStringArray(toStringArray(array), delimiter);
        }

        /**
         * Renders every element of an array with {@link #stringize(Object)}.
         *
         * @param array the array to convert; may be null.
         * @return an array of the rendered elements, or null for a null array.
         */
        protected String[] toStringArray(final E[] array) {
            if (array == null) {
                // Propagate null so that joinStringArray can render it as "null".
                return null;
            }
            final String[] stringArray = new String[array.length];
            for (int index = 0; index < array.length; index++) {
                stringArray[index] = stringize(array[index]);
            }
            return stringArray;
        }

        /**
         * Joins already-rendered elements into a bracketed list.
         *
         * @param stringArray the rendered elements; may be null.
         * @param delimiter the text put between elements. When null, the format of
         * {@link Arrays#toString(Object[])} is used.
         * @return "null" for a null array, otherwise the elements enclosed in "[" and "]".
         */
        protected String joinStringArray(final String[] stringArray, final String delimiter) {
            if (stringArray == null) {
                return "null";
            }
            if (delimiter == null) {
                return Arrays.toString(stringArray);
            }
            // Open the bracket unconditionally so that an empty array renders as "[]".
            final StringBuilder sb = new StringBuilder("[");
            for (int index = 0; index < stringArray.length; index++) {
                if (index > 0) {
                    sb.append(delimiter);
                }
                sb.append(stringArray[index]);
            }
            return sb.append("]").toString();
        }
    }
}