suggestions_output_utils.cpp revision 5b69472d56eaebecab772012b5a590c5c9ce7511
1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "suggest/core/result/suggestions_output_utils.h"
18
19#include <algorithm>
20#include <vector>
21
22#include "suggest/core/dicnode/dic_node.h"
23#include "suggest/core/dicnode/dic_node_utils.h"
24#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"
25#include "suggest/core/dictionary/error_type_utils.h"
26#include "suggest/core/policy/scoring.h"
27#include "suggest/core/result/suggestion_results.h"
28#include "suggest/core/session/dic_traverse_session.h"
29#include "suggest/core/suggest_options.h"
30
31namespace latinime {
32
33const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
34
35/* static */ void SuggestionsOutputUtils::outputSuggestions(
36        const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
37        const float languageWeight, SuggestionResults *const outSuggestionResults) {
38#if DEBUG_EVALUATE_MOST_PROBABLE_STRING
39    const int terminalSize = 0;
40#else
41    const int terminalSize = traverseSession->getDicTraverseCache()->terminalSize();
42#endif
43    std::vector<DicNode> terminals(terminalSize);
44    for (int index = terminalSize - 1; index >= 0; --index) {
45        traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);
46    }
47    // Compute a language weight when an invalid language weight is passed.
48    // NOT_A_LANGUAGE_WEIGHT (-1) is assumed as an invalid language weight.
49    const float languageWeightToOutputSuggestions = (languageWeight < 0.0f) ?
50            scoringPolicy->getAdjustedLanguageWeight(
51                    traverseSession, terminals.data(), terminalSize) : languageWeight;
52    outSuggestionResults->setLanguageWeight(languageWeightToOutputSuggestions);
53    // Force autocorrection for obvious long multi-word suggestions when the top suggestion is
54    // a long multiple words suggestion.
55    // TODO: Implement a smarter auto-commit method for handling multi-word suggestions.
56    const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop()
57            && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT
58                    && !terminals.empty() && terminals.front().hasMultipleWords());
59    // TODO: have partial commit work even with multiple pointers.
60    const bool outputSecondWordFirstLetterInputIndex =
61            traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
62    const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()->
63            getHeaderStructurePolicy()->shouldBoostExactMatches();
64
65    // Output suggestion results here
66    for (auto &terminalDicNode : terminals) {
67        outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode,
68                languageWeightToOutputSuggestions, boostExactMatches, forceCommitMultiWords,
69                outputSecondWordFirstLetterInputIndex, outSuggestionResults);
70    }
71    scoringPolicy->getMostProbableString(traverseSession, languageWeightToOutputSuggestions,
72            outSuggestionResults);
73}
74
75/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
76        const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
77        const DicNode *const terminalDicNode, const float languageWeight,
78        const bool boostExactMatches, const bool forceCommitMultiWords,
79        const bool outputSecondWordFirstLetterInputIndex,
80        SuggestionResults *const outSuggestionResults) {
81    if (DEBUG_GEO_FULL) {
82        terminalDicNode->dump("OUT:");
83    }
84    const float doubleLetterCost =
85            scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);
86    const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
87            + doubleLetterCost;
88    const bool isPossiblyOffensiveWord =
89            traverseSession->getDictionaryStructurePolicy()->getProbability(
90                    terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
91    const bool isExactMatch =
92            ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
93    const bool isExactMatchWithIntentionalOmission =
94            ErrorTypeUtils::isExactMatchWithIntentionalOmission(
95                    terminalDicNode->getContainedErrorTypes());
96    const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
97    // Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
98    // (e.g. "AMD" and "and")
99    const bool isSafeExactMatch = isExactMatch
100            && !(isPossiblyOffensiveWord && isFirstCharUppercase);
101    const int outputTypeFlags =
102            (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
103            | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
104            | (isExactMatchWithIntentionalOmission ?
105                    Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0);
106
107    // Entries that are blacklisted or do not represent a word should not be output.
108    const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord();
109    // When we have to block offensive words, non-exact matched offensive words should not be
110    // output.
111    const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords();
112    const bool isBlockedOffensiveWord = blockOffensiveWords && isPossiblyOffensiveWord
113            && !isSafeExactMatch;
114
115    // Increase output score of top typing suggestion to ensure autocorrection.
116    // TODO: Better integration with java side autocorrection logic.
117    const int finalScore = scoringPolicy->calculateFinalScore(
118            compoundDistance, traverseSession->getInputSize(),
119            terminalDicNode->getContainedErrorTypes(),
120            (forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
121            boostExactMatches);
122
123    // Don't output invalid or blocked offensive words. However, we still need to submit their
124    // shortcuts if any.
125    if (isValidWord && !isBlockedOffensiveWord) {
126        int codePoints[MAX_WORD_LENGTH];
127        terminalDicNode->outputResult(codePoints);
128        const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
129                terminalDicNode->getSecondWordFirstInputIndex(
130                        traverseSession->getProximityInfoState(0)) :
131                NOT_AN_INDEX;
132        outSuggestionResults->addSuggestion(codePoints,
133                terminalDicNode->getTotalNodeCodePointCount(),
134                finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags,
135                indexToPartialCommit, computeFirstWordConfidence(terminalDicNode));
136    }
137
138    // Output shortcuts.
139    // Shortcut is not supported for multiple words suggestions.
140    // TODO: Check shortcuts during traversal for multiple words suggestions.
141    if (!terminalDicNode->hasMultipleWords()) {
142        BinaryDictionaryShortcutIterator shortcutIt(
143                traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(),
144                traverseSession->getDictionaryStructurePolicy()
145                        ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos()));
146        const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
147        outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults);
148    }
149}
150
151/* static */ int SuggestionsOutputUtils::computeFirstWordConfidence(
152        const DicNode *const terminalDicNode) {
153    // Get the number of spaces in the first suggestion
154    const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
155    // Get the number of characters in the first suggestion
156    const int length = terminalDicNode->getTotalNodeCodePointCount();
157    // Get the distance for the first word of the suggestion
158    const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
159
160    // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
161    // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
162    // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
163    // we are very confident.
164    // Expected space count is 1 ~ 5
165    static const int MIN_EXPECTED_SPACE_COUNT = 1;
166    static const int MAX_EXPECTED_SPACE_COUNT = 5;
167    // Expected length is about 4 ~ 30
168    static const int MIN_EXPECTED_LENGTH = 4;
169    static const int MAX_EXPECTED_LENGTH = 30;
170    // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
171    static const float MIN_EXPECTED_DISTANCE = 0.0;
172    static const float MAX_EXPECTED_DISTANCE = 2.0;
173    // This is not strict: it's where most stuff will be falling, but it's still fine if it's
174    // outside these values. We want to output a value that reflects all of these. Each factor
175    // contributes a bit.
176
177    // We need at least a space.
178    if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
179
180    // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
181    // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
182    // weight of the distance. Clamp to avoid overflows.
183    const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
184            : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
185    const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
186            * (MAX_EXPECTED_DISTANCE - clampedDistance)
187            / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
188    // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
189    // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
190    // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
191    const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
192            * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
193    // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
194    // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
195    // weight of the space count.
196    const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
197            * (spaceCount - MIN_EXPECTED_SPACE_COUNT)
198            / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
199
200    return distanceContribution + lengthContribution + spaceContribution;
201}
202
203/* static */ void SuggestionsOutputUtils::outputShortcuts(
204        BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore,
205        const bool sameAsTyped, SuggestionResults *const outSuggestionResults) {
206    int shortcutTarget[MAX_WORD_LENGTH];
207    while (shortcutIt->hasNextShortcutTarget()) {
208        bool isWhilelist;
209        int shortcutTargetStringLength;
210        shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,
211                &shortcutTargetStringLength, &isWhilelist);
212        int shortcutScore;
213        int kind;
214        if (isWhilelist && sameAsTyped) {
215            shortcutScore = S_INT_MAX;
216            kind = Dictionary::KIND_WHITELIST;
217        } else {
218            // shortcut entry's score == its base entry's score - 1
219            shortcutScore = finalScore;
220            // Protection against int underflow
221            shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1;
222            kind = Dictionary::KIND_SHORTCUT;
223        }
224        outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength,
225                std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX,
226                NOT_A_FIRST_WORD_CONFIDENCE);
227    }
228}
229} // namespace latinime
230