MainLogBuffer.java revision f77dd424b077a7f8ff547c09cb94d0dc7f0daed7
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
16
17package com.android.inputmethod.research;
18
19import android.util.Log;
20
21import com.android.inputmethod.latin.Dictionary;
22import com.android.inputmethod.latin.Suggest;
23import com.android.inputmethod.latin.define.ProductionFlag;
24
25import java.util.LinkedList;
26import java.util.Random;
27
28/**
29 * Provide a log buffer of fixed length that enforces privacy restrictions.
30 *
31 * The privacy restrictions include making sure that no numbers are logged, that all logged words
32 * are in the dictionary, and that words are recorded infrequently enough that the user's meaning
33 * cannot be easily determined.
34 */
35public class MainLogBuffer extends FixedLogBuffer {
36    private static final String TAG = MainLogBuffer.class.getSimpleName();
37    private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
38
39    // The size of the n-grams logged.  E.g. N_GRAM_SIZE = 2 means to sample bigrams.
40    private static final int N_GRAM_SIZE = 2;
41    // The number of words between n-grams to omit from the log.  If debugging, record 50% of all
42    // words.  Otherwise, only record 10%.
43    private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES =
44            ProductionFlag.IS_EXPERIMENTAL_DEBUG ? 2 : 18;
45
46    private final ResearchLog mResearchLog;
47    private Suggest mSuggest;
48
49    // The minimum periodicity with which n-grams can be sampled.  E.g. mWinWordPeriod is 10 if
50    // every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc.
51    // for 11-18, and the bigram at words 19 and 20.  If an n-gram is not safe (e.g. it  contains a
52    // number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe
53    // n-gram does appear.
54    /* package for test */ int mMinWordPeriod;
55
56    // Counter for words left to suppress before an n-gram can be sampled.  Reset to mMinWordPeriod
57    // after a sample is taken.
58    /* package for test */ int mWordsUntilSafeToSample;
59
60    public MainLogBuffer(final ResearchLog researchLog) {
61        super(N_GRAM_SIZE);
62        mResearchLog = researchLog;
63        mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE;
64        final Random random = new Random();
65        mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod);
66    }
67
68    public void setSuggest(final Suggest suggest) {
69        mSuggest = suggest;
70    }
71
72    @Override
73    public void shiftIn(final LogUnit newLogUnit) {
74        super.shiftIn(newLogUnit);
75        if (newLogUnit.hasWord()) {
76            if (mWordsUntilSafeToSample > 0) {
77                mWordsUntilSafeToSample--;
78            }
79        }
80        if (DEBUG) {
81            Log.d(TAG, "shiftedIn " + (newLogUnit.hasWord() ? newLogUnit.getWord() : ""));
82        }
83    }
84
85    public void resetWordCounter() {
86        mWordsUntilSafeToSample = mMinWordPeriod;
87    }
88
89    /**
90     * Determines whether the content of the MainLogBuffer can be safely uploaded in its complete
91     * form and still protect the user's privacy.
92     *
93     * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
94     * non-character data that is typed between words.  The decision about privacy is made based on
95     * the buffer's entire content.  If it is decided that the privacy risks are too great to upload
96     * the contents of this buffer, a censored version of the LogItems may still be uploaded.  E.g.,
97     * the screen orientation and other characteristics about the device can be uploaded without
98     * revealing much about the user.
99     */
100    public boolean isSafeToLog() {
101        // Check that we are not sampling too frequently.  Having sampled recently might disclose
102        // too much of the user's intended meaning.
103        if (mWordsUntilSafeToSample > 0) {
104            return false;
105        }
106        if (mSuggest == null || !mSuggest.hasMainDictionary()) {
107            // Main dictionary is unavailable.  Since we cannot check it, we cannot tell if a word
108            // is out-of-vocabulary or not.  Therefore, we must judge the entire buffer contents to
109            // potentially pose a privacy risk.
110            return false;
111        }
112        // Reload the dictionary in case it has changed (e.g., because the user has changed
113        // languages).
114        final Dictionary dictionary = mSuggest.getMainDictionary();
115        if (dictionary == null) {
116            return false;
117        }
118        // Check each word in the buffer.  If any word poses a privacy threat, we cannot upload the
119        // complete buffer contents in detail.
120        final LinkedList<LogUnit> logUnits = getLogUnits();
121        final int length = logUnits.size();
122        for (int i = 0; i < length; i++) {
123            final LogUnit logUnit = logUnits.get(i);
124            final String word = logUnit.getWord();
125            if (word == null) {
126                // Digits outside words are a privacy threat.
127                if (logUnit.mayContainDigit()) {
128                    return false;
129                }
130            } else {
131                // Words not in the dictionary are a privacy threat.
132                if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
133                    if (DEBUG) {
134                        Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word)
135                                + ", isValid: " + (dictionary.isValidWord(word)));
136                    }
137                    return false;
138                }
139            }
140        }
141        // All checks have passed; this buffer's content can be safely uploaded.
142        return true;
143    }
144
145    @Override
146    protected void onShiftOut(final LogUnit logUnit) {
147        if (mResearchLog != null) {
148            mResearchLog.publish(logUnit, false /* isIncludingPrivateData */);
149        }
150    }
151}
152