MainLogBuffer.java revision 565b9d2adda4cae38aa5f6ac10505126d8f10d65
1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.research;
18
19import android.util.Log;
20
21import com.android.inputmethod.latin.Dictionary;
22import com.android.inputmethod.latin.Suggest;
23
24import java.util.Random;
25
26public class MainLogBuffer extends LogBuffer {
27    private static final String TAG = MainLogBuffer.class.getSimpleName();
28    // For privacy reasons, be sure to set to "false" for production code.
29    private static final boolean DEBUG = false;
30
31    // The size of the n-grams logged.  E.g. N_GRAM_SIZE = 2 means to sample bigrams.
32    private static final int N_GRAM_SIZE = 2;
33    // The number of words between n-grams to omit from the log.
34    private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES = DEBUG ? 2 : 18;
35
36    private final ResearchLog mResearchLog;
37    private Suggest mSuggest;
38
39    // The minimum periodicity with which n-grams can be sampled.  E.g. mWinWordPeriod is 10 if
40    // every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc.
41    // for 11-18, and the bigram at words 19 and 20.  If an n-gram is not safe (e.g. it  contains a
42    // number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe
43    // n-gram does appear.
44    /* package for test */ int mMinWordPeriod;
45
46    // Counter for words left to suppress before an n-gram can be sampled.  Reset to mMinWordPeriod
47    // after a sample is taken.
48    /* package for test */ int mWordsUntilSafeToSample;
49
50    public MainLogBuffer(final ResearchLog researchLog) {
51        super(N_GRAM_SIZE);
52        mResearchLog = researchLog;
53        mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE;
54        final Random random = new Random();
55        mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod);
56    }
57
58    public void setSuggest(Suggest suggest) {
59        mSuggest = suggest;
60    }
61
62    @Override
63    public void shiftIn(final LogUnit newLogUnit) {
64        super.shiftIn(newLogUnit);
65        if (newLogUnit.hasWord()) {
66            if (mWordsUntilSafeToSample > 0) {
67                mWordsUntilSafeToSample--;
68            }
69        }
70        if (DEBUG) {
71            Log.d(TAG, "shiftedIn " + (newLogUnit.hasWord() ? newLogUnit.getWord() : ""));
72        }
73    }
74
75    public void resetWordCounter() {
76        mWordsUntilSafeToSample = mMinWordPeriod;
77    }
78
79    /**
80     * Determines whether the content of the MainLogBuffer can be safely uploaded in its complete
81     * form and still protect the user's privacy.
82     *
83     * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
84     * non-character data that is typed between words.  The decision about privacy is made based on
85     * the buffer's entire content.  If it is decided that the privacy risks are too great to upload
86     * the contents of this buffer, a censored version of the LogItems may still be uploaded.  E.g.,
87     * the screen orientation and other characteristics about the device can be uploaded without
88     * revealing much about the user.
89     */
90    public boolean isSafeToLog() {
91        // Check that we are not sampling too frequently.  Having sampled recently might disclose
92        // too much of the user's intended meaning.
93        if (mWordsUntilSafeToSample > 0) {
94            return false;
95        }
96        if (mSuggest == null || !mSuggest.hasMainDictionary()) {
97            // Main dictionary is unavailable.  Since we cannot check it, we cannot tell if a word
98            // is out-of-vocabulary or not.  Therefore, we must judge the entire buffer contents to
99            // potentially pose a privacy risk.
100            return false;
101        }
102        // Reload the dictionary in case it has changed (e.g., because the user has changed
103        // languages).
104        final Dictionary dictionary = mSuggest.getMainDictionary();
105        if (dictionary == null) {
106            return false;
107        }
108        // Check each word in the buffer.  If any word poses a privacy threat, we cannot upload the
109        // complete buffer contents in detail.
110        final int length = mLogUnits.size();
111        for (int i = 0; i < length; i++) {
112            final LogUnit logUnit = mLogUnits.get(i);
113            final String word = logUnit.getWord();
114            if (word == null) {
115                // Digits outside words are a privacy threat.
116                if (logUnit.hasDigit()) {
117                    return false;
118                }
119            } else {
120                // Words not in the dictionary are a privacy threat.
121                if (!(dictionary.isValidWord(word))) {
122                    return false;
123                }
124            }
125        }
126        // All checks have passed; this buffer's content can be safely uploaded.
127        return true;
128    }
129
130    @Override
131    protected void onShiftOut(LogUnit logUnit) {
132        if (mResearchLog != null) {
133            mResearchLog.publish(logUnit, false /* isIncludingPrivateData */);
134        }
135    }
136}
137