MainLogBuffer.java revision f77dd424b077a7f8ff547c09cb94d0dc7f0daed7
1/* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17package com.android.inputmethod.research; 18 19import android.util.Log; 20 21import com.android.inputmethod.latin.Dictionary; 22import com.android.inputmethod.latin.Suggest; 23import com.android.inputmethod.latin.define.ProductionFlag; 24 25import java.util.LinkedList; 26import java.util.Random; 27 28/** 29 * Provide a log buffer of fixed length that enforces privacy restrictions. 30 * 31 * The privacy restrictions include making sure that no numbers are logged, that all logged words 32 * are in the dictionary, and that words are recorded infrequently enough that the user's meaning 33 * cannot be easily determined. 34 */ 35public class MainLogBuffer extends FixedLogBuffer { 36 private static final String TAG = MainLogBuffer.class.getSimpleName(); 37 private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG; 38 39 // The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams. 40 private static final int N_GRAM_SIZE = 2; 41 // The number of words between n-grams to omit from the log. If debugging, record 50% of all 42 // words. Otherwise, only record 10%. 43 private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES = 44 ProductionFlag.IS_EXPERIMENTAL_DEBUG ? 2 : 18; 45 46 private final ResearchLog mResearchLog; 47 private Suggest mSuggest; 48 49 // The minimum periodicity with which n-grams can be sampled. E.g. mWinWordPeriod is 10 if 50 // every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc. 51 // for 11-18, and the bigram at words 19 and 20. If an n-gram is not safe (e.g. it contains a 52 // number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe 53 // n-gram does appear. 54 /* package for test */ int mMinWordPeriod; 55 56 // Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod 57 // after a sample is taken. 58 /* package for test */ int mWordsUntilSafeToSample; 59 60 public MainLogBuffer(final ResearchLog researchLog) { 61 super(N_GRAM_SIZE); 62 mResearchLog = researchLog; 63 mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE; 64 final Random random = new Random(); 65 mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod); 66 } 67 68 public void setSuggest(final Suggest suggest) { 69 mSuggest = suggest; 70 } 71 72 @Override 73 public void shiftIn(final LogUnit newLogUnit) { 74 super.shiftIn(newLogUnit); 75 if (newLogUnit.hasWord()) { 76 if (mWordsUntilSafeToSample > 0) { 77 mWordsUntilSafeToSample--; 78 } 79 } 80 if (DEBUG) { 81 Log.d(TAG, "shiftedIn " + (newLogUnit.hasWord() ? newLogUnit.getWord() : "")); 82 } 83 } 84 85 public void resetWordCounter() { 86 mWordsUntilSafeToSample = mMinWordPeriod; 87 } 88 89 /** 90 * Determines whether the content of the MainLogBuffer can be safely uploaded in its complete 91 * form and still protect the user's privacy. 92 * 93 * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any 94 * non-character data that is typed between words. The decision about privacy is made based on 95 * the buffer's entire content. If it is decided that the privacy risks are too great to upload 96 * the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g., 97 * the screen orientation and other characteristics about the device can be uploaded without 98 * revealing much about the user. 99 */ 100 public boolean isSafeToLog() { 101 // Check that we are not sampling too frequently. Having sampled recently might disclose 102 // too much of the user's intended meaning. 103 if (mWordsUntilSafeToSample > 0) { 104 return false; 105 } 106 if (mSuggest == null || !mSuggest.hasMainDictionary()) { 107 // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a word 108 // is out-of-vocabulary or not. Therefore, we must judge the entire buffer contents to 109 // potentially pose a privacy risk. 110 return false; 111 } 112 // Reload the dictionary in case it has changed (e.g., because the user has changed 113 // languages). 114 final Dictionary dictionary = mSuggest.getMainDictionary(); 115 if (dictionary == null) { 116 return false; 117 } 118 // Check each word in the buffer. If any word poses a privacy threat, we cannot upload the 119 // complete buffer contents in detail. 120 final LinkedList<LogUnit> logUnits = getLogUnits(); 121 final int length = logUnits.size(); 122 for (int i = 0; i < length; i++) { 123 final LogUnit logUnit = logUnits.get(i); 124 final String word = logUnit.getWord(); 125 if (word == null) { 126 // Digits outside words are a privacy threat. 127 if (logUnit.mayContainDigit()) { 128 return false; 129 } 130 } else { 131 // Words not in the dictionary are a privacy threat. 132 if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) { 133 if (DEBUG) { 134 Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word) 135 + ", isValid: " + (dictionary.isValidWord(word))); 136 } 137 return false; 138 } 139 } 140 } 141 // All checks have passed; this buffer's content can be safely uploaded. 142 return true; 143 } 144 145 @Override 146 protected void onShiftOut(final LogUnit logUnit) { 147 if (mResearchLog != null) { 148 mResearchLog.publish(logUnit, false /* isIncludingPrivateData */); 149 } 150 } 151} 152