suggestions_output_utils.cpp revision ff1b3947c6c578c8073902d0834600bcbdd45763
1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "suggest/core/result/suggestions_output_utils.h"
18
19#include <algorithm>
20#include <vector>
21
22#include "suggest/core/dicnode/dic_node.h"
23#include "suggest/core/dicnode/dic_node_utils.h"
24#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"
25#include "suggest/core/dictionary/error_type_utils.h"
26#include "suggest/core/policy/scoring.h"
27#include "suggest/core/result/suggestion_results.h"
28#include "suggest/core/session/dic_traverse_session.h"
29
30namespace latinime {
31
32const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
33
34/* static */ void SuggestionsOutputUtils::outputSuggestions(
35        const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
36        SuggestionResults *const outSuggestionResults) {
37#if DEBUG_EVALUATE_MOST_PROBABLE_STRING
38    const int terminalSize = 0;
39#else
40    const int terminalSize = traverseSession->getDicTraverseCache()->terminalSize();
41#endif
42    std::vector<DicNode> terminals(terminalSize);
43    for (int index = terminalSize - 1; index >= 0; --index) {
44        traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);
45    }
46
47    const float languageWeight = scoringPolicy->getAdjustedLanguageWeight(
48            traverseSession, terminals.data(), terminalSize);
49    // Force autocorrection for obvious long multi-word suggestions when the top suggestion is
50    // a long multiple words suggestion.
51    // TODO: Implement a smarter auto-commit method for handling multi-word suggestions.
52    const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop()
53            && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT
54                    && !terminals.empty() && terminals.front().hasMultipleWords());
55    // TODO: have partial commit work even with multiple pointers.
56    const bool outputSecondWordFirstLetterInputIndex =
57            traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
58    const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()->
59            getHeaderStructurePolicy()->shouldBoostExactMatches();
60
61    // Output suggestion results here
62    for (auto &terminalDicNode : terminals) {
63        outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode,
64                languageWeight, boostExactMatches, forceCommitMultiWords,
65                outputSecondWordFirstLetterInputIndex, outSuggestionResults);
66    }
67    scoringPolicy->getMostProbableString(traverseSession, languageWeight, outSuggestionResults);
68}
69
70/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
71        const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
72        const DicNode *const terminalDicNode, const float languageWeight,
73        const bool boostExactMatches, const bool forceCommitMultiWords,
74        const bool outputSecondWordFirstLetterInputIndex,
75        SuggestionResults *const outSuggestionResults) {
76    if (DEBUG_GEO_FULL) {
77        terminalDicNode->dump("OUT:");
78    }
79    const float doubleLetterCost =
80            scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);
81    const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
82            + doubleLetterCost;
83    const bool isPossiblyOffensiveWord =
84            traverseSession->getDictionaryStructurePolicy()->getProbability(
85                    terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
86    const bool isExactMatch =
87            ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
88    const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
89    // Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
90    // (e.g. "AMD" and "and")
91    const bool isSafeExactMatch = isExactMatch
92            && !(isPossiblyOffensiveWord && isFirstCharUppercase);
93    const int outputTypeFlags =
94            (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
95            | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0);
96
97    // Entries that are blacklisted or do not represent a word should not be output.
98    const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord();
99
100    // Increase output score of top typing suggestion to ensure autocorrection.
101    // TODO: Better integration with java side autocorrection logic.
102    const int finalScore = scoringPolicy->calculateFinalScore(
103            compoundDistance, traverseSession->getInputSize(),
104            terminalDicNode->getContainedErrorTypes(),
105            (forceCommitMultiWords && terminalDicNode->hasMultipleWords())
106                     || (isValidWord && scoringPolicy->doesAutoCorrectValidWord()),
107            boostExactMatches);
108
109    // Don't output invalid words. However, we still need to submit their shortcuts if any.
110    if (isValidWord) {
111        int codePoints[MAX_WORD_LENGTH];
112        terminalDicNode->outputResult(codePoints);
113        const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
114                terminalDicNode->getSecondWordFirstInputIndex(
115                        traverseSession->getProximityInfoState(0)) :
116                NOT_AN_INDEX;
117        outSuggestionResults->addSuggestion(codePoints,
118                terminalDicNode->getTotalNodeCodePointCount(),
119                finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags,
120                indexToPartialCommit, computeFirstWordConfidence(terminalDicNode));
121    }
122
123    // Output shortcuts.
124    // Shortcut is not supported for multiple words suggestions.
125    // TODO: Check shortcuts during traversal for multiple words suggestions.
126    if (!terminalDicNode->hasMultipleWords()) {
127        BinaryDictionaryShortcutIterator shortcutIt(
128                traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(),
129                traverseSession->getDictionaryStructurePolicy()
130                        ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos()));
131        const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
132        const int shortcutBaseScore = scoringPolicy->doesAutoCorrectValidWord() ?
133                 scoringPolicy->calculateFinalScore(compoundDistance,
134                         traverseSession->getInputSize(),
135                         terminalDicNode->getContainedErrorTypes(),
136                         true /* forceCommit */, boostExactMatches) : finalScore;
137        outputShortcuts(&shortcutIt, shortcutBaseScore, sameAsTyped, outSuggestionResults);
138    }
139}
140
141/* static */ int SuggestionsOutputUtils::computeFirstWordConfidence(
142        const DicNode *const terminalDicNode) {
143    // Get the number of spaces in the first suggestion
144    const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
145    // Get the number of characters in the first suggestion
146    const int length = terminalDicNode->getTotalNodeCodePointCount();
147    // Get the distance for the first word of the suggestion
148    const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
149
150    // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
151    // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
152    // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
153    // we are very confident.
154    // Expected space count is 1 ~ 5
155    static const int MIN_EXPECTED_SPACE_COUNT = 1;
156    static const int MAX_EXPECTED_SPACE_COUNT = 5;
157    // Expected length is about 4 ~ 30
158    static const int MIN_EXPECTED_LENGTH = 4;
159    static const int MAX_EXPECTED_LENGTH = 30;
160    // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
161    static const float MIN_EXPECTED_DISTANCE = 0.0;
162    static const float MAX_EXPECTED_DISTANCE = 2.0;
163    // This is not strict: it's where most stuff will be falling, but it's still fine if it's
164    // outside these values. We want to output a value that reflects all of these. Each factor
165    // contributes a bit.
166
167    // We need at least a space.
168    if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
169
170    // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
171    // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
172    // weight of the distance. Clamp to avoid overflows.
173    const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
174            : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
175    const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
176            * (MAX_EXPECTED_DISTANCE - clampedDistance)
177            / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
178    // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
179    // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
180    // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
181    const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
182            * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
183    // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
184    // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
185    // weight of the space count.
186    const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
187            * (spaceCount - MIN_EXPECTED_SPACE_COUNT)
188            / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
189
190    return distanceContribution + lengthContribution + spaceContribution;
191}
192
193/* static */ void SuggestionsOutputUtils::outputShortcuts(
194        BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore,
195        const bool sameAsTyped, SuggestionResults *const outSuggestionResults) {
196    int shortcutTarget[MAX_WORD_LENGTH];
197    while (shortcutIt->hasNextShortcutTarget()) {
198        bool isWhilelist;
199        int shortcutTargetStringLength;
200        shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,
201                &shortcutTargetStringLength, &isWhilelist);
202        int shortcutScore;
203        int kind;
204        if (isWhilelist && sameAsTyped) {
205            shortcutScore = S_INT_MAX;
206            kind = Dictionary::KIND_WHITELIST;
207        } else {
208            // shortcut entry's score == its base entry's score - 1
209            shortcutScore = finalScore;
210            // Protection against int underflow
211            shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1;
212            kind = Dictionary::KIND_SHORTCUT;
213        }
214        outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength,
215                std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX,
216                NOT_A_FIRST_WORD_CONFIDENCE);
217    }
218}
219} // namespace latinime
220