1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin.utils;
18
import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Constants;
import com.android.inputmethod.latin.settings.SettingsValues;

import android.text.TextUtils;
import android.util.JsonReader;
import android.util.JsonWriter;
import android.util.Log;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
35
36public final class StringUtils {
    // Tag used for the log messages emitted by this class.
    private static final String TAG = StringUtils.class.getSimpleName();
    // Capitalization types, as returned by getCapitalizationType(String).
    public static final int CAPITALIZE_NONE = 0;  // No caps, or mixed case
    public static final int CAPITALIZE_FIRST = 1; // First only
    public static final int CAPITALIZE_ALL = 2;   // All caps
41
    /**
     * Private constructor: this class only exposes static members.
     */
    private StringUtils() {
        // This utility class is not publicly instantiable.
    }
45
46    public static int codePointCount(final String text) {
47        if (TextUtils.isEmpty(text)) return 0;
48        return text.codePointCount(0, text.length());
49    }
50
51    public static String newSingleCodePointString(int codePoint) {
52        if (Character.charCount(codePoint) == 1) {
53            // Optimization: avoid creating an temporary array for characters that are
54            // represented by a single char value
55            return String.valueOf((char) codePoint);
56        }
57        // For surrogate pair
58        return new String(Character.toChars(codePoint));
59    }
60
61    public static boolean containsInArray(final String text, final String[] array) {
62        for (final String element : array) {
63            if (text.equals(element)) return true;
64        }
65        return false;
66    }
67
    /**
     * Comma-Splittable Text is similar to Comma-Separated Values (CSV) but has a much simpler
     * syntax. Unlike CSV, Comma-Splittable Text has no escaping mechanism, so an individual
     * element can't contain a comma character.
     */
    private static final String SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT = ",";
74
75    public static boolean containsInCommaSplittableText(final String text,
76            final String extraValues) {
77        if (TextUtils.isEmpty(extraValues)) {
78            return false;
79        }
80        return containsInArray(text, extraValues.split(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT));
81    }
82
83    public static String appendToCommaSplittableTextIfNotExists(final String text,
84            final String extraValues) {
85        if (TextUtils.isEmpty(extraValues)) {
86            return text;
87        }
88        if (containsInCommaSplittableText(text, extraValues)) {
89            return extraValues;
90        }
91        return extraValues + SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT + text;
92    }
93
94    public static String removeFromCommaSplittableTextIfExists(final String text,
95            final String extraValues) {
96        if (TextUtils.isEmpty(extraValues)) {
97            return "";
98        }
99        final String[] elements = extraValues.split(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT);
100        if (!containsInArray(text, elements)) {
101            return extraValues;
102        }
103        final ArrayList<String> result = CollectionUtils.newArrayList(elements.length - 1);
104        for (final String element : elements) {
105            if (!text.equals(element)) {
106                result.add(element);
107            }
108        }
109        return TextUtils.join(SEPARATOR_FOR_COMMA_SPLITTABLE_TEXT, result);
110    }
111
112    /**
113     * Remove duplicates from an array of strings.
114     *
115     * This method will always keep the first occurrence of all strings at their position
116     * in the array, removing the subsequent ones.
117     */
118    public static void removeDupes(final ArrayList<String> suggestions) {
119        if (suggestions.size() < 2) return;
120        int i = 1;
121        // Don't cache suggestions.size(), since we may be removing items
122        while (i < suggestions.size()) {
123            final String cur = suggestions.get(i);
124            // Compare each suggestion with each previous suggestion
125            for (int j = 0; j < i; j++) {
126                final String previous = suggestions.get(j);
127                if (TextUtils.equals(cur, previous)) {
128                    suggestions.remove(i);
129                    i--;
130                    break;
131                }
132            }
133            i++;
134        }
135    }
136
137    public static String capitalizeFirstCodePoint(final String s, final Locale locale) {
138        if (s.length() <= 1) {
139            return s.toUpperCase(locale);
140        }
141        // Please refer to the comment below in
142        // {@link #capitalizeFirstAndDowncaseRest(String,Locale)} as this has the same shortcomings
143        final int cutoff = s.offsetByCodePoints(0, 1);
144        return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff);
145    }
146
147    public static String capitalizeFirstAndDowncaseRest(final String s, final Locale locale) {
148        if (s.length() <= 1) {
149            return s.toUpperCase(locale);
150        }
151        // TODO: fix the bugs below
152        // - This does not work for Greek, because it returns upper case instead of title case.
153        // - It does not work for Serbian, because it fails to account for the "lj" character,
154        // which should be "Lj" in title case and "LJ" in upper case.
155        // - It does not work for Dutch, because it fails to account for the "ij" digraph when it's
156        // written as two separate code points. They are two different characters but both should
157        // be capitalized as "IJ" as if they were a single letter in most words (not all). If the
158        // unicode char for the ligature is used however, it works.
159        final int cutoff = s.offsetByCodePoints(0, 1);
160        return s.substring(0, cutoff).toUpperCase(locale) + s.substring(cutoff).toLowerCase(locale);
161    }
162
163    private static final int[] EMPTY_CODEPOINTS = {};
164
165    public static int[] toCodePointArray(final String string) {
166        final int length = string.length();
167        if (length <= 0) {
168            return EMPTY_CODEPOINTS;
169        }
170        final int[] codePoints = new int[string.codePointCount(0, length)];
171        int destIndex = 0;
172        for (int index = 0; index < length; index = string.offsetByCodePoints(index, 1)) {
173            codePoints[destIndex] = string.codePointAt(index);
174            destIndex++;
175        }
176        return codePoints;
177    }
178
179    // This method assumes the text is not null. For the empty string, it returns CAPITALIZE_NONE.
180    public static int getCapitalizationType(final String text) {
181        // If the first char is not uppercase, then the word is either all lower case or
182        // camel case, and in either case we return CAPITALIZE_NONE.
183        final int len = text.length();
184        int index = 0;
185        for (; index < len; index = text.offsetByCodePoints(index, 1)) {
186            if (Character.isLetter(text.codePointAt(index))) {
187                break;
188            }
189        }
190        if (index == len) return CAPITALIZE_NONE;
191        if (!Character.isUpperCase(text.codePointAt(index))) {
192            return CAPITALIZE_NONE;
193        }
194        int capsCount = 1;
195        int letterCount = 1;
196        for (index = text.offsetByCodePoints(index, 1); index < len;
197                index = text.offsetByCodePoints(index, 1)) {
198            if (1 != capsCount && letterCount != capsCount) break;
199            final int codePoint = text.codePointAt(index);
200            if (Character.isUpperCase(codePoint)) {
201                ++capsCount;
202                ++letterCount;
203            } else if (Character.isLetter(codePoint)) {
204                // We need to discount non-letters since they may not be upper-case, but may
205                // still be part of a word (e.g. single quote or dash, as in "IT'S" or "FULL-TIME")
206                ++letterCount;
207            }
208        }
209        // We know the first char is upper case. So we want to test if either every letter other
210        // than the first is lower case, or if they are all upper case. If the string is exactly
211        // one char long, then we will arrive here with letterCount 1, and this is correct, too.
212        if (1 == capsCount) return CAPITALIZE_FIRST;
213        return (letterCount == capsCount ? CAPITALIZE_ALL : CAPITALIZE_NONE);
214    }
215
216    public static boolean isIdenticalAfterUpcase(final String text) {
217        final int length = text.length();
218        int i = 0;
219        while (i < length) {
220            final int codePoint = text.codePointAt(i);
221            if (Character.isLetter(codePoint) && !Character.isUpperCase(codePoint)) {
222                return false;
223            }
224            i += Character.charCount(codePoint);
225        }
226        return true;
227    }
228
229    public static boolean isIdenticalAfterDowncase(final String text) {
230        final int length = text.length();
231        int i = 0;
232        while (i < length) {
233            final int codePoint = text.codePointAt(i);
234            if (Character.isLetter(codePoint) && !Character.isLowerCase(codePoint)) {
235                return false;
236            }
237            i += Character.charCount(codePoint);
238        }
239        return true;
240    }
241
242    @UsedForTesting
243    public static boolean looksValidForDictionaryInsertion(final CharSequence text,
244            final SettingsValues settings) {
245        if (TextUtils.isEmpty(text)) return false;
246        final int length = text.length();
247        int i = 0;
248        int digitCount = 0;
249        while (i < length) {
250            final int codePoint = Character.codePointAt(text, i);
251            final int charCount = Character.charCount(codePoint);
252            i += charCount;
253            if (Character.isDigit(codePoint)) {
254                // Count digits: see below
255                digitCount += charCount;
256                continue;
257            }
258            if (!settings.isWordCodePoint(codePoint)) return false;
259        }
260        // We reject strings entirely comprised of digits to avoid using PIN codes or credit
261        // card numbers. It would come in handy for word prediction though; a good example is
262        // when writing one's address where the street number is usually quite discriminative,
263        // as well as the postal code.
264        return digitCount < length;
265    }
266
267    public static boolean isIdenticalAfterCapitalizeEachWord(final String text,
268            final String separators) {
269        boolean needCapsNext = true;
270        final int len = text.length();
271        for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) {
272            final int codePoint = text.codePointAt(i);
273            if (Character.isLetter(codePoint)) {
274                if ((needCapsNext && !Character.isUpperCase(codePoint))
275                        || (!needCapsNext && !Character.isLowerCase(codePoint))) {
276                    return false;
277                }
278            }
279            // We need a capital letter next if this is a separator.
280            needCapsNext = (-1 != separators.indexOf(codePoint));
281        }
282        return true;
283    }
284
285    // TODO: like capitalizeFirst*, this does not work perfectly for Dutch because of the IJ digraph
286    // which should be capitalized together in *some* cases.
287    public static String capitalizeEachWord(final String text, final String separators,
288            final Locale locale) {
289        final StringBuilder builder = new StringBuilder();
290        boolean needCapsNext = true;
291        final int len = text.length();
292        for (int i = 0; i < len; i = text.offsetByCodePoints(i, 1)) {
293            final String nextChar = text.substring(i, text.offsetByCodePoints(i, 1));
294            if (needCapsNext) {
295                builder.append(nextChar.toUpperCase(locale));
296            } else {
297                builder.append(nextChar.toLowerCase(locale));
298            }
299            // We need a capital letter next if this is a separator.
300            needCapsNext = (-1 != separators.indexOf(nextChar.codePointAt(0)));
301        }
302        return builder.toString();
303    }
304
305    /**
306     * Approximates whether the text before the cursor looks like a URL.
307     *
308     * This is not foolproof, but it should work well in the practice.
309     * Essentially it walks backward from the cursor until it finds something that's not a letter,
310     * digit, or common URL symbol like underscore. If it hasn't found a period yet, then it
311     * does not look like a URL.
312     * If the text:
313     * - starts with www and contains a period
314     * - starts with a slash preceded by either a slash, whitespace, or start-of-string
315     * Then it looks like a URL and we return true. Otherwise, we return false.
316     *
317     * Note: this method is called quite often, and should be fast.
318     *
319     * TODO: This will return that "abc./def" and ".abc/def" look like URLs to keep down the
320     * code complexity, but ideally it should not. It's acceptable for now.
321     */
322    public static boolean lastPartLooksLikeURL(final CharSequence text) {
323        int i = text.length();
324        if (0 == i) return false;
325        int wCount = 0;
326        int slashCount = 0;
327        boolean hasSlash = false;
328        boolean hasPeriod = false;
329        int codePoint = 0;
330        while (i > 0) {
331            codePoint =  Character.codePointBefore(text, i);
332            if (codePoint < Constants.CODE_PERIOD || codePoint > 'z') {
333                // Handwavy heuristic to see if that's a URL character. Anything between period
334                // and z. This includes all lower- and upper-case ascii letters, period,
335                // underscore, arrobase, question mark, equal sign. It excludes spaces, exclamation
336                // marks, double quotes...
337                // Anything that's not a URL-like character causes us to break from here and
338                // evaluate normally.
339                break;
340            }
341            if (Constants.CODE_PERIOD == codePoint) {
342                hasPeriod = true;
343            }
344            if (Constants.CODE_SLASH == codePoint) {
345                hasSlash = true;
346                if (2 == ++slashCount) {
347                    return true;
348                }
349            } else {
350                slashCount = 0;
351            }
352            if ('w' == codePoint) {
353                ++wCount;
354            } else {
355                wCount = 0;
356            }
357            i = Character.offsetByCodePoints(text, i, -1);
358        }
359        // End of the text run.
360        // If it starts with www and includes a period, then it looks like a URL.
361        if (wCount >= 3 && hasPeriod) return true;
362        // If it starts with a slash, and the code point before is whitespace, it looks like an URL.
363        if (1 == slashCount && (0 == i || Character.isWhitespace(codePoint))) return true;
364        // If it has both a period and a slash, it looks like an URL.
365        if (hasPeriod && hasSlash) return true;
366        // Otherwise, it doesn't look like an URL.
367        return false;
368    }
369
370    public static boolean isEmptyStringOrWhiteSpaces(String s) {
371        final int N = codePointCount(s);
372        for (int i = 0; i < N; ++i) {
373            if (!Character.isWhitespace(s.codePointAt(i))) {
374                return false;
375            }
376        }
377        return true;
378    }
379
380    @UsedForTesting
381    public static String byteArrayToHexString(byte[] bytes) {
382        if (bytes == null || bytes.length == 0) {
383            return "";
384        }
385        final StringBuilder sb = new StringBuilder();
386        for (byte b : bytes) {
387            sb.append(String.format("%02x", b & 0xff));
388        }
389        return sb.toString();
390    }
391
392    /**
393     * Convert hex string to byte array. The string length must be an even number.
394     */
395    @UsedForTesting
396    public static byte[] hexStringToByteArray(String hexString) {
397        if (TextUtils.isEmpty(hexString)) {
398            return null;
399        }
400        final int N = hexString.length();
401        if (N % 2 != 0) {
402            throw new NumberFormatException("Input hex string length must be an even number."
403                    + " Length = " + N);
404        }
405        final byte[] bytes = new byte[N / 2];
406        for (int i = 0; i < N; i += 2) {
407            bytes[i / 2] = (byte) ((Character.digit(hexString.charAt(i), 16) << 4)
408                    + Character.digit(hexString.charAt(i + 1), 16));
409        }
410        return bytes;
411    }
412
413    public static List<Object> jsonStrToList(String s) {
414        final ArrayList<Object> retval = CollectionUtils.newArrayList();
415        final JsonReader reader = new JsonReader(new StringReader(s));
416        try {
417            reader.beginArray();
418            while(reader.hasNext()) {
419                reader.beginObject();
420                while (reader.hasNext()) {
421                    final String name = reader.nextName();
422                    if (name.equals(Integer.class.getSimpleName())) {
423                        retval.add(reader.nextInt());
424                    } else if (name.equals(String.class.getSimpleName())) {
425                        retval.add(reader.nextString());
426                    } else {
427                        Log.w(TAG, "Invalid name: " + name);
428                        reader.skipValue();
429                    }
430                }
431                reader.endObject();
432            }
433            reader.endArray();
434            return retval;
435        } catch (IOException e) {
436        } finally {
437            try {
438                reader.close();
439            } catch (IOException e) {
440            }
441        }
442        return Collections.<Object>emptyList();
443    }
444
445    public static String listToJsonStr(List<Object> list) {
446        if (list == null || list.isEmpty()) {
447            return "";
448        }
449        final StringWriter sw = new StringWriter();
450        final JsonWriter writer = new JsonWriter(sw);
451        try {
452            writer.beginArray();
453            for (final Object o : list) {
454                writer.beginObject();
455                if (o instanceof Integer) {
456                    writer.name(Integer.class.getSimpleName()).value((Integer)o);
457                } else if (o instanceof String) {
458                    writer.name(String.class.getSimpleName()).value((String)o);
459                }
460                writer.endObject();
461            }
462            writer.endArray();
463            return sw.toString();
464        } catch (IOException e) {
465        } finally {
466            try {
467                if (writer != null) {
468                    writer.close();
469                }
470            } catch (IOException e) {
471            }
472        }
473        return "";
474    }
475}
476