suggestions_output_utils.cpp revision a0ea92f76c2e73e4b4bbb59680bc6cd3c33f0573
1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "suggest/core/result/suggestions_output_utils.h"
18
19#include <algorithm>
20
21#include "suggest/core/dicnode/dic_node.h"
22#include "suggest/core/dicnode/dic_node_utils.h"
23#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"
24#include "suggest/core/dictionary/dictionary.h"
25#include "suggest/core/dictionary/error_type_utils.h"
26#include "suggest/core/policy/scoring.h"
27#include "suggest/core/result/suggestion_results.h"
28#include "suggest/core/session/dic_traverse_session.h"
29
30namespace latinime {
31
32const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
33
34// TODO: Split this method.
35/* static */ void SuggestionsOutputUtils::outputSuggestions(
36        const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
37        SuggestionResults *const outSuggestionResults) {
38#if DEBUG_EVALUATE_MOST_PROBABLE_STRING
39    const int terminalSize = 0;
40#else
41    const int terminalSize = std::min(MAX_RESULTS,
42            static_cast<int>(traverseSession->getDicTraverseCache()->terminalSize()));
43#endif
44    DicNode terminals[MAX_RESULTS]; // Avoiding non-POD variable length array
45
46    for (int index = terminalSize - 1; index >= 0; --index) {
47        traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);
48    }
49
50    const float languageWeight = scoringPolicy->getAdjustedLanguageWeight(
51            traverseSession, terminals, terminalSize);
52    // Force autocorrection for obvious long multi-word suggestions when the top suggestion is
53    // a long multiple words suggestion.
54    // TODO: Implement a smarter auto-commit method for handling multi-word suggestions.
55    const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop()
56            && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT
57                    && terminals[0].hasMultipleWords());
58    // TODO: have partial commit work even with multiple pointers.
59    const bool outputSecondWordFirstLetterInputIndex =
60            traverseSession->isOnlyOnePointerUsed(0 /* pointerId */);
61    const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()->
62            getHeaderStructurePolicy()->shouldBoostExactMatches();
63
64    int codePoints[MAX_WORD_LENGTH];
65    // Output suggestion results here
66    for (int terminalIndex = 0; terminalIndex < terminalSize; ++terminalIndex) {
67        DicNode *terminalDicNode = &terminals[terminalIndex];
68        if (DEBUG_GEO_FULL) {
69            terminalDicNode->dump("OUT:");
70        }
71        const float doubleLetterCost =
72                scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);
73        const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
74                + doubleLetterCost;
75        const bool isPossiblyOffensiveWord =
76                traverseSession->getDictionaryStructurePolicy()->getProbability(
77                        terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
78        const bool isExactMatch =
79                ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
80        const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
81        // Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
82        // (e.g. "AMD" and "and")
83        const bool isSafeExactMatch = isExactMatch
84                && !(isPossiblyOffensiveWord && isFirstCharUppercase);
85        const int outputTypeFlags =
86                (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
87                | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0);
88
89        // Entries that are blacklisted or do not represent a word should not be output.
90        const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord();
91
92        // Increase output score of top typing suggestion to ensure autocorrection.
93        // TODO: Better integration with java side autocorrection logic.
94        const int finalScore = scoringPolicy->calculateFinalScore(
95                compoundDistance, traverseSession->getInputSize(),
96                terminalDicNode->getContainedErrorTypes(),
97                (forceCommitMultiWords && terminalDicNode->hasMultipleWords())
98                         || (isValidWord && scoringPolicy->doesAutoCorrectValidWord()),
99                boostExactMatches);
100
101        // Don't output invalid words. However, we still need to submit their shortcuts if any.
102        if (isValidWord) {
103            terminalDicNode->outputResult(codePoints);
104            const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
105                    terminalDicNode->getSecondWordFirstInputIndex(
106                            traverseSession->getProximityInfoState(0)) :
107                    NOT_AN_INDEX;
108            outSuggestionResults->addSuggestion(codePoints,
109                    terminalDicNode->getTotalNodeCodePointCount(),
110                    finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags,
111                    indexToPartialCommit, computeFirstWordConfidence(terminalDicNode));
112        }
113
114        if (!terminalDicNode->hasMultipleWords()) {
115            BinaryDictionaryShortcutIterator shortcutIt(
116                    traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(),
117                    traverseSession->getDictionaryStructurePolicy()
118                            ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos()));
119            // Shortcut is not supported for multiple words suggestions.
120            // TODO: Check shortcuts during traversal for multiple words suggestions.
121            const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
122            const int shortcutBaseScore = scoringPolicy->doesAutoCorrectValidWord() ?
123                     scoringPolicy->calculateFinalScore(compoundDistance,
124                             traverseSession->getInputSize(),
125                             terminalDicNode->getContainedErrorTypes(),
126                             true /* forceCommit */, boostExactMatches) : finalScore;
127            outputShortcuts(&shortcutIt, shortcutBaseScore, sameAsTyped, outSuggestionResults);
128        }
129    }
130    scoringPolicy->getMostProbableString(traverseSession, languageWeight, outSuggestionResults);
131}
132
133/* static */ int SuggestionsOutputUtils::computeFirstWordConfidence(
134        const DicNode *const terminalDicNode) {
135    // Get the number of spaces in the first suggestion
136    const int spaceCount = terminalDicNode->getTotalNodeSpaceCount();
137    // Get the number of characters in the first suggestion
138    const int length = terminalDicNode->getTotalNodeCodePointCount();
139    // Get the distance for the first word of the suggestion
140    const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord();
141
142    // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000.
143    // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or
144    // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means
145    // we are very confident.
146    // Expected space count is 1 ~ 5
147    static const int MIN_EXPECTED_SPACE_COUNT = 1;
148    static const int MAX_EXPECTED_SPACE_COUNT = 5;
149    // Expected length is about 4 ~ 30
150    static const int MIN_EXPECTED_LENGTH = 4;
151    static const int MAX_EXPECTED_LENGTH = 30;
152    // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0
153    static const float MIN_EXPECTED_DISTANCE = 0.0;
154    static const float MAX_EXPECTED_DISTANCE = 2.0;
155    // This is not strict: it's where most stuff will be falling, but it's still fine if it's
156    // outside these values. We want to output a value that reflects all of these. Each factor
157    // contributes a bit.
158
159    // We need at least a space.
160    if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE;
161
162    // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0
163    // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the
164    // weight of the distance. Clamp to avoid overflows.
165    const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE
166            : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance;
167    const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT
168            * (MAX_EXPECTED_DISTANCE - clampedDistance)
169            / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE);
170    // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no
171    // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the
172    // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp.
173    const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT
174            * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH);
175    // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no
176    // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the
177    // weight of the space count.
178    const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT
179            * (spaceCount - MIN_EXPECTED_SPACE_COUNT)
180            / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT);
181
182    return distanceContribution + lengthContribution + spaceContribution;
183}
184
185/* static */ void SuggestionsOutputUtils::outputShortcuts(
186        BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore,
187        const bool sameAsTyped, SuggestionResults *const outSuggestionResults) {
188    int shortcutTarget[MAX_WORD_LENGTH];
189    while (shortcutIt->hasNextShortcutTarget()) {
190        bool isWhilelist;
191        int shortcutTargetStringLength;
192        shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget,
193                &shortcutTargetStringLength, &isWhilelist);
194        int shortcutScore;
195        int kind;
196        if (isWhilelist && sameAsTyped) {
197            shortcutScore = S_INT_MAX;
198            kind = Dictionary::KIND_WHITELIST;
199        } else {
200            // shortcut entry's score == its base entry's score - 1
201            shortcutScore = finalScore;
202            // Protection against int underflow
203            shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1;
204            kind = Dictionary::KIND_SHORTCUT;
205        }
206        outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength,
207                std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX,
208                NOT_A_FIRST_WORD_CONFIDENCE);
209    }
210}
211} // namespace latinime
212