// MainLogBuffer.java revision 19e05359e641fff2fee410eda5572011926620a5
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.research;

import android.util.Log;

import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.Suggest;
import com.android.inputmethod.latin.define.ProductionFlag;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Random;

/**
 * A FixedLogBuffer that tracks the state of its LogUnits in order to make privacy guarantees.
 *
 * Privacy is protected in three ways: 1) a word is only logged in enough detail to reveal its
 * contents if it is in the main dictionary, 2) only a subset of words (such as 10%) is logged in
 * detail, and 3) numbers are never logged.
 *
 * The buffer holds a list of LogUnits, one per word, appended as the user completes words. If the
 * user backs up over the current word to edit an earlier one, that word's LogUnit is pulled back
 * out of the buffer, the changes are appended to it, and it is pushed back in once the user is
 * done. Since a word may be pulled out again after being pushed in, the buffer contents must not
 * be published too quickly; they must not pile up either, or the amount of editing the user can
 * do becomes limited.
 *
 * To balance the two (keep history for editing, flush history so it does not accumulate), the
 * buffer is considered "complete" once it holds enough words for an n-gram followed by enough
 * additional non-detailed words (the 90% mentioned above). At that point the n-gram may be
 * published to flash storage (via the ResearchLog class), while the trailing non-detailed words
 * stay in the buffer in case the user backspaces into them. New words keep arriving and push
 * individual non-detailed words out; once enough of them have been pushed out to cover the 90%
 * gap, the words at the front of the buffer may be published as an n-gram again.
 *
 * When the words that would make up the n-gram are not all dictionary words, words are pushed
 * through the buffer one at a time until an n-gram made up entirely of dictionary words is found.
 *
 * When the user closes a session, the whole buffer is flushed, publishing any embedded n-gram
 * that consists of dictionary words.
 */
public abstract class MainLogBuffer extends FixedLogBuffer {
    private static final String TAG = MainLogBuffer.class.getSimpleName();
    private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;

    // Number of words in each logged n-gram; a value of 2 means bigrams are sampled.
    public static final int N_GRAM_SIZE = 2;

    // When true, every word is recorded and no unsampled words are left between bigrams.
    // Intended for testing.
    /* package for test */ static final boolean IS_LOGGING_EVERYTHING = false
            && ProductionFlag.IS_EXPERIMENTAL_DEBUG;

    // How many words are omitted from the log between two consecutive n-grams.
    private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES =
            IS_LOGGING_EVERYTHING ? 0 : (DEBUG ? 2 : 18);

    private Suggest mSuggest;
    private boolean mIsStopping = false;

    /* package for test */ int mNumWordsBetweenNGrams;

    // Number of words that still have to be suppressed before an n-gram may be sampled. Set back
    // to mNumWordsBetweenNGrams (see resetWordCounter()) after a sample is taken.
    /* package for test */ int mNumWordsUntilSafeToSample;

    /**
     * Creates a buffer with room for one n-gram plus the default number of words between samples.
     *
     * Unless DEBUG is set, the countdown to the first sample starts at a random value in
     * [0, mNumWordsBetweenNGrams].
     */
    public MainLogBuffer() {
        super(N_GRAM_SIZE + DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES);
        mNumWordsBetweenNGrams = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES;
        final Random sampleOffsetSource = new Random();
        if (DEBUG) {
            mNumWordsUntilSafeToSample = 0;
        } else {
            mNumWordsUntilSafeToSample = sampleOffsetSource.nextInt(mNumWordsBetweenNGrams + 1);
        }
    }

    /**
     * Sets the Suggest instance whose main dictionary is consulted for the privacy check.
     *
     * @param suggest the Suggest to use; may be null, in which case no dictionary is available.
     */
    public void setSuggest(final Suggest suggest) {
        mSuggest = suggest;
    }

    /**
     * Returns the main dictionary of the current Suggest, or null if there is no Suggest or it
     * reports having no main dictionary.
     */
    private Dictionary getDictionary() {
        final boolean hasMainDictionary = mSuggest != null && mSuggest.hasMainDictionary();
        return hasMainDictionary ? mSuggest.getMainDictionary() : null;
    }

    /**
     * Restarts the countdown of words to suppress before the next n-gram may be sampled.
     */
    public void resetWordCounter() {
        mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
    }

    /**
     * Marks this buffer as stopping. The flag is only consulted when IS_LOGGING_EVERYTHING is
     * set, where it makes every n-gram count as safe.
     */
    public void setIsStopping() {
        mIsStopping = true;
    }

    /**
     * Determines whether uploading the given words from the front of the MainLogBuffer will not
     * violate user privacy.
     *
     * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
     * non-character data that is typed between words. The decision is made from all of the
     * LogUnits passed in. If the privacy risk is judged too great to upload them in detail, a
     * censored version of the LogItems may still be uploaded; e.g., the screen orientation and
     * other characteristics of the device can be uploaded without revealing much about the user.
     *
     * @param logUnits the LogUnits to inspect.
     * @param minNGramSize the minimum number of words the LogUnits must contain.
     * @return true if the LogUnits may be published in detail.
     */
    private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
        // Bypass privacy checks when debugging.
        if (IS_LOGGING_EVERYTHING) {
            if (mIsStopping) {
                return true;
            }
            // Only the length matters here. If there are too few words, wait for later words
            // to complete the n-gram.
            int debugWordCount = 0;
            for (final LogUnit unit : logUnits) {
                if (unit.getWord() != null) {
                    debugWordCount++;
                }
            }
            return debugWordCount >= minNGramSize;
        }

        // Do not sample too often: having sampled recently might disclose too much of the
        // user's intended meaning.
        if (mNumWordsUntilSafeToSample > 0) {
            return false;
        }

        // Fetch the dictionary afresh, since it may have changed (e.g., because the user has
        // switched languages).
        final Dictionary mainDictionary = getDictionary();
        if (mainDictionary == null) {
            // Without the main dictionary there is no way to tell whether a word is
            // out-of-vocabulary, so everything must be treated as a potential privacy risk.
            return false;
        }

        // Inspect every LogUnit. A single risky one prevents publishing the contents in detail.
        int wordCount = 0;
        for (final LogUnit unit : logUnits) {
            if (!unit.hasWord()) {
                // Digits outside words are a privacy threat.
                if (unit.mayContainDigit()) {
                    return false;
                }
                continue;
            }
            wordCount++;
            final String word = unit.getWord();
            // A word is acceptable if it has no letters or if the dictionary knows it.
            if (!ResearchLogger.hasLetters(word) || mainDictionary.isValidWord(word)) {
                continue;
            }
            // Words not in the dictionary are a privacy threat.
            if (DEBUG) {
                Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word)
                        + ", isValid: " + (mainDictionary.isValidWord(word)));
            }
            return false;
        }

        // Everything looked safe; the only remaining requirement is the minimum n-gram size.
        return wordCount >= minNGramSize;
    }

    /**
     * Publishes and shifts out LogUnits from the front of the buffer until it is empty.
     */
    public void shiftAndPublishAll() {
        final LinkedList<LogUnit> remainingUnits = getLogUnits();
        while (!remainingUnits.isEmpty()) {
            publishLogUnitsAtFrontOfBuffer();
        }
    }

    @Override
    protected final void onBufferFull() {
        publishLogUnitsAtFrontOfBuffer();
    }

    /**
     * Publishes whatever is at the front of the buffer and shifts it out.
     *
     * If the first N_GRAM_SIZE words form a safe n-gram, they are published with private data
     * and the word counter is reset. Otherwise only the first word is published, without private
     * data.
     */
    protected final void publishLogUnitsAtFrontOfBuffer() {
        final ArrayList<LogUnit> candidateNGram = peekAtFirstNWords(N_GRAM_SIZE);
        if (!isSafeNGram(candidateNGram, N_GRAM_SIZE)) {
            // No good n-gram at the front. Shift out the first word (or, if there is none,
            // the existing logUnits) without disclosing details.
            final ArrayList<LogUnit> firstWordUnits = peekAtFirstNWords(1);
            publish(firstWordUnits, false /* canIncludePrivateData */);
            shiftOutWords(1);
            return;
        }
        // Good n-gram at the front of the buffer. Publish it, disclosing details.
        publish(candidateNGram, true /* canIncludePrivateData */);
        shiftOutWords(N_GRAM_SIZE);
        resetWordCounter();
    }

    /**
     * Called when a list of logUnits should be published.
     *
     * Implementing the actual publication is the responsibility of the subclass.
     *
     * @param logUnits The list of logUnits to be published.
     * @param canIncludePrivateData Whether the private data in the logUnits can be included in
     * publication.
     */
    protected abstract void publish(final ArrayList<LogUnit> logUnits,
            final boolean canIncludePrivateData);

    /**
     * Shifts words out of the buffer and decreases the sampling countdown by the number of
     * actual words that left the buffer.
     */
    @Override
    protected void shiftOutWords(int numWords) {
        final int wordsBeforeShift = getNumActualWords();
        super.shiftOutWords(numWords);
        final int wordsRemoved = wordsBeforeShift - getNumActualWords();
        mNumWordsUntilSafeToSample -= wordsRemoved;
        if (DEBUG) {
            Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
        }
    }
}