1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin.utils;
18
19import android.text.InputType;
20import android.text.TextUtils;
21
22import com.android.inputmethod.latin.WordComposer;
23import com.android.inputmethod.latin.common.Constants;
24import com.android.inputmethod.latin.common.StringUtils;
25import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
26
27import java.util.ArrayList;
28import java.util.Locale;
29
30public final class CapsModeUtils {
31    private CapsModeUtils() {
32        // This utility class is not publicly instantiable.
33    }
34
35    /**
36     * Apply an auto-caps mode to a string.
37     *
38     * This intentionally does NOT apply manual caps mode. It only changes the capitalization if
39     * the mode is one of the auto-caps modes.
40     * @param s The string to capitalize.
41     * @param capitalizeMode The mode in which to capitalize.
42     * @param locale The locale for capitalizing.
43     * @return The capitalized string.
44     */
45    public static String applyAutoCapsMode(final String s, final int capitalizeMode,
46            final Locale locale) {
47        if (WordComposer.CAPS_MODE_AUTO_SHIFT_LOCKED == capitalizeMode) {
48            return s.toUpperCase(locale);
49        } else if (WordComposer.CAPS_MODE_AUTO_SHIFTED == capitalizeMode) {
50            return StringUtils.capitalizeFirstCodePoint(s, locale);
51        } else {
52            return s;
53        }
54    }
55
56    /**
57     * Return whether a constant represents an auto-caps mode (either auto-shift or auto-shift-lock)
58     * @param mode The mode to test for
59     * @return true if this represents an auto-caps mode, false otherwise
60     */
61    public static boolean isAutoCapsMode(final int mode) {
62        return WordComposer.CAPS_MODE_AUTO_SHIFTED == mode
63                || WordComposer.CAPS_MODE_AUTO_SHIFT_LOCKED == mode;
64    }
65
66    /**
67     * Helper method to find out if a code point is starting punctuation.
68     *
69     * This include the Unicode START_PUNCTUATION category, but also some other symbols that are
70     * starting, like the inverted question mark or the double quote.
71     *
72     * @param codePoint the code point
73     * @return true if it's starting punctuation, false otherwise.
74     */
75    private static boolean isStartPunctuation(final int codePoint) {
76        return (codePoint == Constants.CODE_DOUBLE_QUOTE || codePoint == Constants.CODE_SINGLE_QUOTE
77                || codePoint == Constants.CODE_INVERTED_QUESTION_MARK
78                || codePoint == Constants.CODE_INVERTED_EXCLAMATION_MARK
79                || Character.getType(codePoint) == Character.START_PUNCTUATION);
80    }
81
82    /**
83     * Determine what caps mode should be in effect at the current offset in
84     * the text. Only the mode bits set in <var>reqModes</var> will be
85     * checked. Note that the caps mode flags here are explicitly defined
86     * to match those in {@link InputType}.
87     *
88     * This code is a straight copy of TextUtils.getCapsMode (modulo namespace and formatting
89     * issues). This will change in the future as we simplify the code for our use and fix bugs.
90     *
91     * @param cs The text that should be checked for caps modes.
92     * @param reqModes The modes to be checked: may be any combination of
93     * {@link TextUtils#CAP_MODE_CHARACTERS}, {@link TextUtils#CAP_MODE_WORDS}, and
94     * {@link TextUtils#CAP_MODE_SENTENCES}.
95     * @param spacingAndPunctuations The current spacing and punctuations settings.
96     * @param hasSpaceBefore Whether we should consider there is a space inserted at the end of cs
97     *
98     * @return Returns the actual capitalization modes that can be in effect
99     * at the current position, which is any combination of
100     * {@link TextUtils#CAP_MODE_CHARACTERS}, {@link TextUtils#CAP_MODE_WORDS}, and
101     * {@link TextUtils#CAP_MODE_SENTENCES}.
102     */
103    public static int getCapsMode(final CharSequence cs, final int reqModes,
104            final SpacingAndPunctuations spacingAndPunctuations, final boolean hasSpaceBefore) {
105        // Quick description of what we want to do:
106        // CAP_MODE_CHARACTERS is always on.
107        // CAP_MODE_WORDS is on if there is some whitespace before the cursor.
108        // CAP_MODE_SENTENCES is on if there is some whitespace before the cursor, and the end
109        //   of a sentence just before that.
110        // We ignore opening parentheses and the like just before the cursor for purposes of
111        // finding whitespace for WORDS and SENTENCES modes.
112        // The end of a sentence ends with a period, question mark or exclamation mark. If it's
113        // a period, it also needs not to be an abbreviation, which means it also needs to either
114        // be immediately preceded by punctuation, or by a string of only letters with single
115        // periods interleaved.
116
117        // Step 1 : check for cap MODE_CHARACTERS. If it's looked for, it's always on.
118        if ((reqModes & (TextUtils.CAP_MODE_WORDS | TextUtils.CAP_MODE_SENTENCES)) == 0) {
119            // Here we are not looking for MODE_WORDS or MODE_SENTENCES, so since we already
120            // evaluated MODE_CHARACTERS, we can return.
121            return TextUtils.CAP_MODE_CHARACTERS & reqModes;
122        }
123
124        // Step 2 : Skip (ignore at the end of input) any opening punctuation. This includes
125        // opening parentheses, brackets, opening quotes, everything that *opens* a span of
126        // text in the linguistic sense. In RTL languages, this is still an opening sign, although
127        // it may look like a right parenthesis for example. We also include double quote and
128        // single quote since they aren't start punctuation in the unicode sense, but should still
129        // be skipped for English. TODO: does this depend on the language?
130        int i;
131        if (hasSpaceBefore) {
132            i = cs.length() + 1;
133        } else {
134            for (i = cs.length(); i > 0; i--) {
135                final char c = cs.charAt(i - 1);
136                if (!isStartPunctuation(c)) {
137                    break;
138                }
139            }
140        }
141
142        // We are now on the character that precedes any starting punctuation, so in the most
143        // frequent case this will be whitespace or a letter, although it may occasionally be a
144        // start of line, or some symbol.
145
146        // Step 3 : Search for the start of a paragraph. From the starting point computed in step 2,
147        // we go back over any space or tab char sitting there. We find the start of a paragraph
148        // if the first char that's not a space or tab is a start of line (as in \n, start of text,
149        // or some other similar characters).
150        int j = i;
151        char prevChar = Constants.CODE_SPACE;
152        if (hasSpaceBefore) --j;
153        while (j > 0) {
154            prevChar = cs.charAt(j - 1);
155            if (!Character.isSpaceChar(prevChar) && prevChar != Constants.CODE_TAB) break;
156            j--;
157        }
158        if (j <= 0 || Character.isWhitespace(prevChar)) {
159            if (spacingAndPunctuations.mUsesGermanRules) {
160                // In German typography rules, there is a specific case that the first character
161                // of a new line should not be capitalized if the previous line ends in a comma.
162                boolean hasNewLine = false;
163                while (--j >= 0 && Character.isWhitespace(prevChar)) {
164                    if (Constants.CODE_ENTER == prevChar) {
165                        hasNewLine = true;
166                    }
167                    prevChar = cs.charAt(j);
168                }
169                if (Constants.CODE_COMMA == prevChar && hasNewLine) {
170                    return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
171                }
172            }
173            // There are only spacing chars between the start of the paragraph and the cursor,
174            // defined as a isWhitespace() char that is neither a isSpaceChar() nor a tab. Both
175            // MODE_WORDS and MODE_SENTENCES should be active.
176            return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
177                    | TextUtils.CAP_MODE_SENTENCES) & reqModes;
178        }
179        if (i == j) {
180            // If we don't have whitespace before index i, it means neither MODE_WORDS
181            // nor mode sentences should be on so we can return right away.
182            return TextUtils.CAP_MODE_CHARACTERS & reqModes;
183        }
184        if ((reqModes & TextUtils.CAP_MODE_SENTENCES) == 0) {
185            // Here we know we have whitespace before the cursor (if not, we returned in the above
186            // if i == j clause), so we need MODE_WORDS to be on. And we don't need to evaluate
187            // MODE_SENTENCES so we can return right away.
188            return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
189        }
190        // Please note that because of the reqModes & CAP_MODE_SENTENCES test a few lines above,
191        // we know that MODE_SENTENCES is being requested.
192
193        // Step 4 : Search for MODE_SENTENCES.
194        // English is a special case in that "American typography" rules, which are the most common
195        // in English, state that a sentence terminator immediately following a quotation mark
196        // should be swapped with it and de-duplicated (included in the quotation mark),
197        // e.g. <<Did he say, "let's go home?">>
198        // No other language has such a rule as far as I know, instead putting inside the quotation
199        // mark as the exact thing quoted and handling the surrounding punctuation independently,
200        // e.g. <<Did he say, "let's go home"?>>
201        if (spacingAndPunctuations.mUsesAmericanTypography) {
202            for (; j > 0; j--) {
203                // Here we look to go over any closing punctuation. This is because in dominant
204                // variants of English, the final period is placed within double quotes and maybe
205                // other closing punctuation signs. This is generally not true in other languages.
206                final char c = cs.charAt(j - 1);
207                if (c != Constants.CODE_DOUBLE_QUOTE && c != Constants.CODE_SINGLE_QUOTE
208                        && Character.getType(c) != Character.END_PUNCTUATION) {
209                    break;
210                }
211            }
212        }
213
214        if (j <= 0) return TextUtils.CAP_MODE_CHARACTERS & reqModes;
215        char c = cs.charAt(--j);
216
217        // We found the next interesting chunk of text ; next we need to determine if it's the
218        // end of a sentence. If we have a sentence terminator (typically a question mark or an
219        // exclamation mark), then it's the end of a sentence; however, we treat the abbreviation
220        // marker specially because usually is the same char as the sentence separator (the
221        // period in most languages) and in this case we need to apply a heuristic to determine
222        // in which of these senses it's used.
223        if (spacingAndPunctuations.isSentenceTerminator(c)
224                && !spacingAndPunctuations.isAbbreviationMarker(c)) {
225            return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
226                    | TextUtils.CAP_MODE_SENTENCES) & reqModes;
227        }
228        // If we reach here, we know we have whitespace before the cursor and before that there
229        // is something that either does not terminate the sentence, or a symbol preceded by the
230        // start of the text, or it's the sentence separator AND it happens to be the same code
231        // point as the abbreviation marker.
232        // If it's a symbol or something that does not terminate the sentence, then we need to
233        // return caps for MODE_CHARACTERS and MODE_WORDS, but not for MODE_SENTENCES.
234        if (!spacingAndPunctuations.isSentenceSeparator(c) || j <= 0) {
235            return (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
236        }
237
238        // We found out that we have a period. We need to determine if this is a full stop or
239        // otherwise sentence-ending period, or an abbreviation like "e.g.". An abbreviation
240        // looks like (\w\.){2,}. Moreover, in German, you put periods after digits for dates
241        // and some other things, and in German specifically we need to not go into autocaps after
242        // a whitespace-digits-period sequence.
243        // To find out, we will have a simple state machine with the following states :
244        // START, WORD, PERIOD, ABBREVIATION, NUMBER
245        // On START : (just before the first period)
246        //           letter => WORD
247        //           digit => NUMBER if German; end with caps otherwise
248        //           whitespace => end with no caps (it was a stand-alone period)
249        //           otherwise => end with caps (several periods/symbols in a row)
250        // On WORD : (within the word just before the first period)
251        //           letter => WORD
252        //           period => PERIOD
253        //           otherwise => end with caps (it was a word with a full stop at the end)
254        // On PERIOD : (period within a potential abbreviation)
255        //           letter => LETTER
256        //           otherwise => end with caps (it was not an abbreviation)
257        // On LETTER : (letter within a potential abbreviation)
258        //           letter => LETTER
259        //           period => PERIOD
260        //           otherwise => end with no caps (it was an abbreviation)
261        // On NUMBER : (period immediately preceded by one or more digits)
262        //           digit => NUMBER
263        //           letter => LETTER (promote to word)
264        //           otherwise => end with no caps (it was a whitespace-digits-period sequence,
265        //            or a punctuation-digits-period sequence like "11.11.")
266        // "Not an abbreviation" in the above chart essentially covers cases like "...yes.". This
267        // should capitalize.
268
269        final int START = 0;
270        final int WORD = 1;
271        final int PERIOD = 2;
272        final int LETTER = 3;
273        final int NUMBER = 4;
274        final int caps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
275                | TextUtils.CAP_MODE_SENTENCES) & reqModes;
276        final int noCaps = (TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS) & reqModes;
277        int state = START;
278        while (j > 0) {
279            c = cs.charAt(--j);
280            switch (state) {
281            case START:
282                if (Character.isLetter(c)) {
283                    state = WORD;
284                } else if (Character.isWhitespace(c)) {
285                    return noCaps;
286                } else if (Character.isDigit(c) && spacingAndPunctuations.mUsesGermanRules) {
287                    state = NUMBER;
288                } else {
289                    return caps;
290                }
291                break;
292            case WORD:
293                if (Character.isLetter(c)) {
294                    state = WORD;
295                } else if (spacingAndPunctuations.isSentenceSeparator(c)) {
296                    state = PERIOD;
297                } else {
298                    return caps;
299                }
300                break;
301            case PERIOD:
302                if (Character.isLetter(c)) {
303                    state = LETTER;
304                } else {
305                    return caps;
306                }
307                break;
308            case LETTER:
309                if (Character.isLetter(c)) {
310                    state = LETTER;
311                } else if (spacingAndPunctuations.isSentenceSeparator(c)) {
312                    state = PERIOD;
313                } else {
314                    return noCaps;
315                }
316                break;
317            case NUMBER:
318                if (Character.isLetter(c)) {
319                    state = WORD;
320                } else if (Character.isDigit(c)) {
321                    state = NUMBER;
322                } else {
323                    return noCaps;
324                }
325            }
326        }
327        // Here we arrived at the start of the line. This should behave exactly like whitespace.
328        return (START == state || LETTER == state) ? noCaps : caps;
329    }
330
331    /**
332     * Convert capitalize mode flags into human readable text.
333     *
334     * @param capsFlags The modes flags to be converted. It may be any combination of
335     * {@link TextUtils#CAP_MODE_CHARACTERS}, {@link TextUtils#CAP_MODE_WORDS}, and
336     * {@link TextUtils#CAP_MODE_SENTENCES}.
337     * @return the text that describe the <code>capsMode</code>.
338     */
339    public static String flagsToString(final int capsFlags) {
340        final int capsFlagsMask = TextUtils.CAP_MODE_CHARACTERS | TextUtils.CAP_MODE_WORDS
341                | TextUtils.CAP_MODE_SENTENCES;
342        if ((capsFlags & ~capsFlagsMask) != 0) {
343            return "unknown<0x" + Integer.toHexString(capsFlags) + ">";
344        }
345        final ArrayList<String> builder = new ArrayList<>();
346        if ((capsFlags & android.text.TextUtils.CAP_MODE_CHARACTERS) != 0) {
347            builder.add("characters");
348        }
349        if ((capsFlags & android.text.TextUtils.CAP_MODE_WORDS) != 0) {
350            builder.add("words");
351        }
352        if ((capsFlags & android.text.TextUtils.CAP_MODE_SENTENCES) != 0) {
353            builder.add("sentences");
354        }
355        return builder.isEmpty() ? "none" : TextUtils.join("|", builder);
356    }
357}
358