MainLogBuffer.java revision 19e05359e641fff2fee410eda5572011926620a5
1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.research;
18
19import android.util.Log;
20
21import com.android.inputmethod.latin.Dictionary;
22import com.android.inputmethod.latin.Suggest;
23import com.android.inputmethod.latin.define.ProductionFlag;
24
25import java.util.ArrayList;
26import java.util.LinkedList;
27import java.util.Random;
28
29/**
30 * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
31 *
32 * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
33 * be logged in enough detail to determine their contents, 2) only a subset of words are logged
34 * in detail, such as 10%, and 3) no numbers are logged.
35 *
36 * This class maintains a list of LogUnits, each corresponding to a word.  As the user completes
37 * words, they are added here.  But if the user backs up over their current word to edit a word
38 * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
39 * the LogUnit, and it is pushed back in here when the user is done.  Because words may be pulled
40 * back out even after they are pushed in, we must not publish the contents of this LogBuffer too
41 * quickly.  However, we cannot let the contents pile up either, or it will limit the editing that
42 * a user can perform.
43 *
44 * To balance these requirements (keep history so user can edit, flush history so it does not pile
45 * up), the LogBuffer is considered "complete" when the user has entered enough words to form an
46 * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
47 * Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
48 * However, the additional non-detailed words are retained, in case the user backspaces to edit
49 * them.  The MainLogBuffer then continues to add words, publishing individual non-detailed words
50 * as new words arrive.  After enough non-detailed words have been pushed out to account for the
51 * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
52 *
53 * If the words that would form the valid n-gram are not in the dictionary, then words are pushed
54 * through the LogBuffer one at a time until an n-gram is found that is entirely composed of
55 * dictionary words.
56 *
57 * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
58 * n-gram containing dictionary words.
59 */
60public abstract class MainLogBuffer extends FixedLogBuffer {
61    private static final String TAG = MainLogBuffer.class.getSimpleName();
62    private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
63
64    // The size of the n-grams logged.  E.g. N_GRAM_SIZE = 2 means to sample bigrams.
65    public static final int N_GRAM_SIZE = 2;
66
67    // Whether all words should be recorded, leaving unsampled word between bigrams.  Useful for
68    // testing.
69    /* package for test */ static final boolean IS_LOGGING_EVERYTHING = false
70            && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
71
72    // The number of words between n-grams to omit from the log.
73    private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES =
74            IS_LOGGING_EVERYTHING ? 0 : (DEBUG ? 2 : 18);
75
76    private Suggest mSuggest;
77    private boolean mIsStopping = false;
78
79    /* package for test */ int mNumWordsBetweenNGrams;
80
81    // Counter for words left to suppress before an n-gram can be sampled.  Reset to mMinWordPeriod
82    // after a sample is taken.
83    /* package for test */ int mNumWordsUntilSafeToSample;
84
85    public MainLogBuffer() {
86        super(N_GRAM_SIZE + DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES);
87        mNumWordsBetweenNGrams = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES;
88        final Random random = new Random();
89        mNumWordsUntilSafeToSample = DEBUG ? 0 : random.nextInt(mNumWordsBetweenNGrams + 1);
90    }
91
92    public void setSuggest(final Suggest suggest) {
93        mSuggest = suggest;
94    }
95
96    private Dictionary getDictionary() {
97        if (mSuggest == null || !mSuggest.hasMainDictionary()) return null;
98        return mSuggest.getMainDictionary();
99    }
100
101    public void resetWordCounter() {
102        mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
103    }
104
105    public void setIsStopping() {
106        mIsStopping = true;
107    }
108
109    /**
110     * Determines whether uploading the n words at the front the MainLogBuffer will not violate
111     * user privacy.
112     *
113     * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
114     * non-character data that is typed between words.  The decision about privacy is made based on
115     * the buffer's entire content.  If it is decided that the privacy risks are too great to upload
116     * the contents of this buffer, a censored version of the LogItems may still be uploaded.  E.g.,
117     * the screen orientation and other characteristics about the device can be uploaded without
118     * revealing much about the user.
119     */
120    private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
121        // Bypass privacy checks when debugging.
122        if (IS_LOGGING_EVERYTHING) {
123            if (mIsStopping) {
124                return true;
125            }
126            // Only check that it is the right length.  If not, wait for later words to make
127            // complete n-grams.
128            int numWordsInLogUnitList = 0;
129            final int length = logUnits.size();
130            for (int i = 0; i < length; i++) {
131                final LogUnit logUnit = logUnits.get(i);
132                final String word = logUnit.getWord();
133                if (word != null) {
134                    numWordsInLogUnitList++;
135                }
136            }
137            return numWordsInLogUnitList >= minNGramSize;
138        }
139
140        // Check that we are not sampling too frequently.  Having sampled recently might disclose
141        // too much of the user's intended meaning.
142        if (mNumWordsUntilSafeToSample > 0) {
143            return false;
144        }
145        // Reload the dictionary in case it has changed (e.g., because the user has changed
146        // languages).
147        final Dictionary dictionary = getDictionary();
148        if (dictionary == null) {
149            // Main dictionary is unavailable.  Since we cannot check it, we cannot tell if a
150            // word is out-of-vocabulary or not.  Therefore, we must judge the entire buffer
151            // contents to potentially pose a privacy risk.
152            return false;
153        }
154
155        // Check each word in the buffer.  If any word poses a privacy threat, we cannot upload
156        // the complete buffer contents in detail.
157        int numWordsInLogUnitList = 0;
158        final int length = logUnits.size();
159        for (int i = 0; i < length; i++) {
160            final LogUnit logUnit = logUnits.get(i);
161            if (!logUnit.hasWord()) {
162                // Digits outside words are a privacy threat.
163                if (logUnit.mayContainDigit()) {
164                    return false;
165                }
166            } else {
167                numWordsInLogUnitList++;
168                final String word = logUnit.getWord();
169                // Words not in the dictionary are a privacy threat.
170                if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
171                    if (DEBUG) {
172                        Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word)
173                                + ", isValid: " + (dictionary.isValidWord(word)));
174                    }
175                    return false;
176                }
177            }
178        }
179
180        // Finally, only return true if the minNGramSize is met.
181        return numWordsInLogUnitList >= minNGramSize;
182    }
183
184    public void shiftAndPublishAll() {
185        final LinkedList<LogUnit> logUnits = getLogUnits();
186        while (!logUnits.isEmpty()) {
187            publishLogUnitsAtFrontOfBuffer();
188        }
189    }
190
191    @Override
192    protected final void onBufferFull() {
193        publishLogUnitsAtFrontOfBuffer();
194    }
195
196    protected final void publishLogUnitsAtFrontOfBuffer() {
197        ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
198        if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
199            // Good n-gram at the front of the buffer.  Publish it, disclosing details.
200            publish(logUnits, true /* canIncludePrivateData */);
201            shiftOutWords(N_GRAM_SIZE);
202            resetWordCounter();
203        } else {
204            // No good n-gram at front, and buffer is full.  Shift out the first word (or if there
205            // is none, the existing logUnits).
206            logUnits = peekAtFirstNWords(1);
207            publish(logUnits, false /* canIncludePrivateData */);
208            shiftOutWords(1);
209        }
210    }
211
212    /**
213     * Called when a list of logUnits should be published.
214     *
215     * It is the subclass's responsibility to implement the publication.
216     *
217     * @param logUnits The list of logUnits to be published.
218     * @param canIncludePrivateData Whether the private data in the logUnits can be included in
219     * publication.
220     */
221    protected abstract void publish(final ArrayList<LogUnit> logUnits,
222            final boolean canIncludePrivateData);
223
224    @Override
225    protected void shiftOutWords(int numWords) {
226        int oldNumActualWords = getNumActualWords();
227        super.shiftOutWords(numWords);
228        int numWordsShifted = oldNumActualWords - getNumActualWords();
229        mNumWordsUntilSafeToSample -= numWordsShifted;
230        if (DEBUG) {
231            Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
232        }
233    }
234}
235