// StringUtils.java revision 837cdd738b7ddbeac04b15230f01e44d247bd50a
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.utils;

import android.text.TextUtils;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Constants;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;

/**
 * Static helpers for code-point-aware string handling: comma-splittable lists,
 * capitalization detection and conversion, URL heuristics and hex conversion.
 */
public final class StringUtils {
    private static final String TAG = StringUtils.class.getSimpleName();
    public static final int CAPITALIZE_NONE = 0;  // No caps, or mixed case
    public static final int CAPITALIZE_FIRST = 1; // First only
    public static final int CAPITALIZE_ALL = 2;   // All caps

    private static final String EMPTY_STRING = "";

    private StringUtils() {
        // This utility class is not publicly instantiable.
    }

    /**
     * Counts the Unicode code points in a string.
     *
     * @param text the string to measure; may be null
     * @return the number of code points, or 0 if the text is null or empty
     */
    public static int codePointCount(final String text) {
        if (TextUtils.isEmpty(text)) return 0;
        return text.codePointCount(0, text.length());
    }

    /**
     * Creates a string that holds exactly one code point.
     *
     * @param codePoint the code point to wrap
     * @return a string of one char, or of a surrogate pair for supplementary code points
     */
    public static String newSingleCodePointString(int codePoint) {
        if (Character.charCount(codePoint) == 1) {
            // Optimization: avoid creating a temporary array for characters that are
            // represented by a single char value
            return String.valueOf((char) codePoint);
        }
        // For surrogate pair
        return new String(Character.toChars(codePoint));
    }

    /**
     * Tells whether an array contains a string equal to the given text.
     *
     * @param text the string to look for; must not be null
     * @param array the array to search; must not be null
     * @return true if any element equals the text
     */
    public static boolean containsInArray(final String text, final String[] array) {
        for (final String element : array) {
            if (text.equals(element)) return true;
        }
        return false;
    }

    /**
     * Comma-Splittable Text is similar to Comma-Separated Values (CSV) but has much simpler syntax.
     * Unlike CSV, Comma-Splittable Text has no escaping mechanism, so that the text can't contain
     * a comma character in it.
     */
    private static final String SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT = ",";

    /**
     * Tells whether a comma-splittable text contains the given element.
     *
     * @param text the element to look for
     * @param extraValues the comma-splittable text to search; may be null or empty
     * @return true if one of the comma-separated elements equals the text
     */
    public static boolean containsInCommaSplittableText(final String text,
            final String extraValues) {
        if (TextUtils.isEmpty(extraValues)) {
            return false;
        }
        return containsInArray(text, extraValues.split(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT));
    }

    /**
     * Joins two comma-splittable texts into one.
     *
     * @param head the first text; may be null or empty
     * @param tail the second text; may be null or empty
     * @return the empty string if both are empty, the non-empty one if only one is
     *         non-empty, otherwise head and tail joined by a comma
     */
    public static String joinCommaSplittableText(final String head, final String tail) {
        if (TextUtils.isEmpty(head) && TextUtils.isEmpty(tail)) {
            return EMPTY_STRING;
        }
        // Here either head or tail is not empty.
        if (TextUtils.isEmpty(head)) {
            return tail;
        }
        if (TextUtils.isEmpty(tail)) {
            return head;
        }
        return head + SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT + tail;
    }

    /**
     * Appends an element to a comma-splittable text unless it is already present.
     *
     * @param text the element to append
     * @param extraValues the existing comma-splittable text; may be null or empty
     * @return the text itself if extraValues is empty, extraValues unchanged if it already
     *         contains the text, otherwise extraValues followed by a comma and the text
     */
    public static String appendToCommaSplittableTextIfNotExists(final String text,
            final String extraValues) {
        if (TextUtils.isEmpty(extraValues)) {
            return text;
        }
        if (containsInCommaSplittableText(text, extraValues)) {
            return extraValues;
        }
        return extraValues + SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT + text;
    }

    /**
     * Removes every occurrence of an element from a comma-splittable text.
     *
     * @param text the element to remove
     * @param extraValues the comma-splittable text to filter; may be null or empty
     * @return the empty string if extraValues is empty, extraValues unchanged if it does not
     *         contain the text, otherwise the remaining elements joined by commas
     */
    public static String removeFromCommaSplittableTextIfExists(final String text,
            final String extraValues) {
        if (TextUtils.isEmpty(extraValues)) {
            return EMPTY_STRING;
        }
        final String[] elements = extraValues.split(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT);
        if (!containsInArray(text, elements)) {
            return extraValues;
        }
        final ArrayList<String> result = CollectionUtils.newArrayList(elements.length - 1);
        for (final String element : elements) {
            if (!text.equals(element)) {
                result.add(element);
            }
        }
        return TextUtils.join(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT, result);
    }

    /**
     * Remove duplicates from an array of strings.
     *
     * This method will always keep the first occurrence of all strings at their position
     * in the array, removing the subsequent ones.
     *
     * @param suggestions the list to de-duplicate in place
     */
    public static void removeDupes(final ArrayList<String> suggestions) {
        if (suggestions.size() < 2) return;
        int i = 1;
        // Don't cache suggestions.size(), since we may be removing items
        while (i < suggestions.size()) {
            final String cur = suggestions.get(i);
            // Compare each suggestion with each previous suggestion
            for (int j = 0; j < i; j++) {
                final String previous = suggestions.get(j);
                if (TextUtils.equals(cur, previous)) {
                    suggestions.remove(i);
                    // Step back so the element that slid into slot i is examined next.
                    i--;
                    break;
                }
            }
            i++;
        }
    }

    /**
     * Upper-cases the first code point of a string and leaves the rest untouched.
     *
     * @param s the string to capitalize; must not be null
     * @param locale the locale whose case rules apply
     * @return the string with its first code point upper-cased
     */
    public static String capitalizeFirstCodePoint(final String s, final Locale locale) {
        if (s.length() <= 1) {
            return s.toUpperCase(locale);
        }
        // Please refer to the comment below in
        // {@link #capitalizeFirstAndDowncaseRest(String,Locale)} as this has the same shortcomings
        final int cutoff = s.offsetByCodePoints(0, 1);
        return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff);
    }

    /**
     * Upper-cases the first code point of a string and lower-cases everything after it.
     *
     * @param s the string to convert; must not be null
     * @param locale the locale whose case rules apply
     * @return the converted string
     */
    public static String capitalizeFirstAndDowncaseRest(final String s, final Locale locale) {
        if (s.length() <= 1) {
            return s.toUpperCase(locale);
        }
        // TODO: fix the bugs below
        // - This does not work for Greek, because it returns upper case instead of title case.
        // - It does not work for Serbian, because it fails to account for the "lj" character,
        // which should be "Lj" in title case and "LJ" in upper case.
        // - It does not work for Dutch, because it fails to account for the "ij" digraph when it's
        // written as two separate code points. They are two different characters but both should
        // be capitalized as "IJ" as if they were a single letter in most words (not all). If the
        // unicode char for the ligature is used however, it works.
        final int cutoff = s.offsetByCodePoints(0, 1);
        return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff).toLowerCase(locale);
    }

    private static final int[] EMPTY_CODEPOINTS = {};

    /**
     * Converts a string into an array of its code points, in order.
     *
     * @param string the string to convert; must not be null
     * @return the code points, or a shared empty array for the empty string
     */
    public static int[] toCodePointArray(final String string) {
        final int length = string.length();
        if (length <= 0) {
            return EMPTY_CODEPOINTS;
        }
        final int[] codePoints = new int[string.codePointCount(0, length)];
        int destIndex = 0;
        for (int index = 0; index < length; index = string.offsetByCodePoints(index, 1)) {
            codePoints[destIndex] = string.codePointAt(index);
            destIndex++;
        }
        return codePoints;
    }

    /**
     * Converts a string into an array of its code points, sorted in ascending order.
     *
     * @param string the string to convert; must not be null
     * @return the sorted code points
     */
    public static int[] toSortedCodePointArray(final String string) {
        final int[] codePoints = toCodePointArray(string);
        Arrays.sort(codePoints);
        return codePoints;
    }

    /**
     * Classifies how a word is capitalized.
     *
     * This method assumes the text is not null. For the empty string, it returns CAPITALIZE_NONE.
     *
     * @param text the text to inspect
     * @return {@link #CAPITALIZE_FIRST} if only the first letter is upper case,
     *         {@link #CAPITALIZE_ALL} if every letter is upper case,
     *         {@link #CAPITALIZE_NONE} otherwise
     */
    public static int getCapitalizationType(final String text) {
        // If the first char is not uppercase, then the word is either all lower case or
        // camel case, and in either case we return CAPITALIZE_NONE.
        final int len = text.length();
        int index = 0;
        // Skip leading non-letters to find the first letter.
        for (; index < len; index = text.offsetByCodePoints(index, 1)) {
            if (Character.isLetter(text.codePointAt(index))) {
                break;
            }
        }
        if (index == len) return CAPITALIZE_NONE;
        if (!Character.isUpperCase(text.codePointAt(index))) {
            return CAPITALIZE_NONE;
        }
        int capsCount = 1;
        int letterCount = 1;
        for (index = text.offsetByCodePoints(index, 1); index < len;
                index = text.offsetByCodePoints(index, 1)) {
            // Mixed case already detected: the answer can no longer change.
            if (1 != capsCount && letterCount != capsCount) break;
            final int codePoint = text.codePointAt(index);
            if (Character.isUpperCase(codePoint)) {
                ++capsCount;
                ++letterCount;
            } else if (Character.isLetter(codePoint)) {
                // We need to discount non-letters since they may not be upper-case, but may
                // still be part of a word (e.g. single quote or dash, as in "IT'S" or "FULL-TIME")
                ++letterCount;
            }
        }
        // We know the first char is upper case. So we want to test if either every letter other
        // than the first is lower case, or if they are all upper case. If the string is exactly
        // one char long, then we will arrive here with letterCount 1, and this is correct, too.
        if (1 == capsCount) return CAPITALIZE_FIRST;
        return (letterCount == capsCount ? CAPITALIZE_ALL : CAPITALIZE_NONE);
    }

    /**
     * Tells whether every letter in the text is already upper case.
     *
     * @param text the text to inspect; must not be null
     * @return false if any letter is not upper case, true otherwise (including empty text)
     */
    public static boolean isIdenticalAfterUpcase(final String text) {
        final int length = text.length();
        int i = 0;
        while (i < length) {
            final int codePoint = text.codePointAt(i);
            if (Character.isLetter(codePoint) && !Character.isUpperCase(codePoint)) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Tells whether every letter in the text is already lower case.
     *
     * @param text the text to inspect; must not be null
     * @return false if any letter is not lower case, true otherwise (including empty text)
     */
    public static boolean isIdenticalAfterDowncase(final String text) {
        final int length = text.length();
        int i = 0;
        while (i < length) {
            final int codePoint = text.codePointAt(i);
            if (Character.isLetter(codePoint) && !Character.isLowerCase(codePoint)) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Returns true if all code points in text are whitespace, false otherwise. Empty is true.
     *
     * @param text the text to inspect; must not be null
     */
    // Interestingly enough, U+00A0 NO-BREAK SPACE and U+200B ZERO-WIDTH SPACE are not considered
    // whitespace, while EN SPACE, EM SPACE and IDEOGRAPHIC SPACES are.
    public static boolean containsOnlyWhitespace(final String text) {
        final int length = text.length();
        int i = 0;
        while (i < length) {
            final int codePoint = text.codePointAt(i);
            if (!Character.isWhitespace(codePoint)) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Tells whether the text is already in "capitalize each word" form: every letter that
     * starts the text or directly follows a separator is upper case, and every other letter
     * is lower case.
     *
     * @param text the text to inspect; must not be null
     * @param separators the characters that act as word separators; must not be null
     * @return true if no letter violates the rule above
     */
    public static boolean isIdenticalAfterCapitalizeEachWord(final String text,
            final String separators) {
        boolean needCapsNext = true;
        final int len = text.length();
        for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) {
            final int codePoint = text.codePointAt(i);
            if (Character.isLetter(codePoint)) {
                if ((needCapsNext && !Character.isUpperCase(codePoint))
                        || (!needCapsNext && !Character.isLowerCase(codePoint))) {
                    return false;
                }
            }
            // We need a capital letter next if this is a separator.
            needCapsNext = (-1 != separators.indexOf(codePoint));
        }
        return true;
    }

    /**
     * Upper-cases the code point at the start of the text and after each separator, and
     * lower-cases every other code point.
     *
     * @param text the text to convert; must not be null
     * @param separators the characters that act as word separators; must not be null
     * @param locale the locale whose case rules apply
     * @return the converted text
     */
    // TODO: like capitalizeFirst*, this does not work perfectly for Dutch because of the IJ digraph
    // which should be capitalized together in *some* cases.
    public static String capitalizeEachWord(final String text, final String separators,
            final Locale locale) {
        final int len = text.length();
        final StringBuilder builder = new StringBuilder(len);
        boolean needCapsNext = true;
        int i = 0;
        while (i < len) {
            // Compute the end of the current code point once and reuse it to advance.
            final int next = text.offsetByCodePoints(i, 1);
            final String nextChar = text.substring(i, next);
            if (needCapsNext) {
                builder.append(nextChar.toUpperCase(locale));
            } else {
                builder.append(nextChar.toLowerCase(locale));
            }
            // We need a capital letter next if this is a separator.
            needCapsNext = (-1 != separators.indexOf(nextChar.codePointAt(0)));
            i = next;
        }
        return builder.toString();
    }

    /**
     * Approximates whether the text before the cursor looks like a URL.
     *
     * This is not foolproof, but it should work well in the practice.
     * Essentially it walks backward from the cursor until it finds something that's not a letter,
     * digit, or common URL symbol like underscore. If it hasn't found a period yet, then it
     * does not look like a URL.
     * If the text:
     * - starts with www and contains a period
     * - starts with a slash preceded by either a slash, whitespace, or start-of-string
     * Then it looks like a URL and we return true. Otherwise, we return false.
     *
     * Note: this method is called quite often, and should be fast.
     *
     * TODO: This will return that "abc./def" and ".abc/def" look like URLs to keep down the
     * code complexity, but ideally it should not. It's acceptable for now.
     *
     * @param text the text before the cursor; must not be null
     * @return true if the end of the text looks like a URL
     */
    public static boolean lastPartLooksLikeURL(final CharSequence text) {
        int i = text.length();
        if (0 == i) return false;
        int wCount = 0;
        int slashCount = 0;
        boolean hasSlash = false;
        boolean hasPeriod = false;
        int codePoint = 0;
        while (i > 0) {
            codePoint = Character.codePointBefore(text, i);
            if (codePoint < Constants.CODE_PERIOD || codePoint > 'z') {
                // Handwavy heuristic to see if that's a URL character. Anything between period
                // and z. This includes all lower- and upper-case ascii letters, period,
                // underscore, arrobase, question mark, equal sign. It excludes spaces, exclamation
                // marks, double quotes...
                // Anything that's not a URL-like character causes us to break from here and
                // evaluate normally.
                break;
            }
            if (Constants.CODE_PERIOD == codePoint) {
                hasPeriod = true;
            }
            if (Constants.CODE_SLASH == codePoint) {
                hasSlash = true;
                if (2 == ++slashCount) {
                    return true;
                }
            } else {
                slashCount = 0;
            }
            if ('w' == codePoint) {
                ++wCount;
            } else {
                wCount = 0;
            }
            i = Character.offsetByCodePoints(text, i, -1);
        }
        // End of the text run.
        // If it starts with www and includes a period, then it looks like a URL.
        if (wCount >= 3 && hasPeriod) return true;
        // If it starts with a slash, and the code point before is whitespace, it looks like an URL.
        if (1 == slashCount && (0 == i || Character.isWhitespace(codePoint))) return true;
        // If it has both a period and a slash, it looks like an URL.
        if (hasPeriod && hasSlash) return true;
        // Otherwise, it doesn't look like an URL.
        return false;
    }

    /**
     * Tells whether a string is null, empty, or made only of whitespace code points.
     *
     * @param s the string to inspect; may be null
     * @return true if the string is null, empty, or contains only whitespace
     */
    public static boolean isEmptyStringOrWhiteSpaces(final String s) {
        // Walk by code point rather than mixing a code point count with char indices.
        return TextUtils.isEmpty(s) || containsOnlyWhitespace(s);
    }

    /**
     * Converts a byte array to a lower-case hex string, two digits per byte.
     *
     * @param bytes the bytes to convert; may be null
     * @return the hex string, or the empty string if the array is null or empty
     */
    @UsedForTesting
    public static String byteArrayToHexString(final byte[] bytes) {
        if (bytes == null || bytes.length == 0) {
            return EMPTY_STRING;
        }
        final StringBuilder sb = new StringBuilder(bytes.length * 2);
        for (final byte b : bytes) {
            // High nibble first, then low nibble; forDigit yields lower-case hex digits.
            sb.append(Character.forDigit((b >> 4) & 0xf, 16));
            sb.append(Character.forDigit(b & 0xf, 16));
        }
        return sb.toString();
    }

    /**
     * Convert hex string to byte array. The string length must be an even number.
     *
     * @param hexString the hex string to convert; may be null
     * @return the decoded bytes, or null if the string is null or empty
     * @throws NumberFormatException if the length is odd or a character is not a hex digit
     */
    @UsedForTesting
    public static byte[] hexStringToByteArray(final String hexString) {
        if (TextUtils.isEmpty(hexString)) {
            return null;
        }
        final int N = hexString.length();
        if (N % 2 != 0) {
            throw new NumberFormatException("Input hex string length must be an even number."
                    + " Length = " + N);
        }
        final byte[] bytes = new byte[N / 2];
        for (int i = 0; i < N; i += 2) {
            final int high = Character.digit(hexString.charAt(i), 16);
            final int low = Character.digit(hexString.charAt(i + 1), 16);
            // Character.digit returns -1 for a non-hex character; reject it instead of
            // silently folding it into a garbage byte.
            if (high < 0 || low < 0) {
                throw new NumberFormatException("Input hex string has a non-hexadecimal digit"
                        + " at index " + (high < 0 ? i : i + 1) + ": " + hexString);
            }
            bytes[i / 2] = (byte) ((high << 4) + low);
        }
        return bytes;
    }
}