suggestions_output_utils.cpp revision ff1b3947c6c578c8073902d0834600bcbdd45763
1/* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "suggest/core/result/suggestions_output_utils.h" 18 19#include <algorithm> 20#include <vector> 21 22#include "suggest/core/dicnode/dic_node.h" 23#include "suggest/core/dicnode/dic_node_utils.h" 24#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" 25#include "suggest/core/dictionary/error_type_utils.h" 26#include "suggest/core/policy/scoring.h" 27#include "suggest/core/result/suggestion_results.h" 28#include "suggest/core/session/dic_traverse_session.h" 29 30namespace latinime { 31 32const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; 33 34/* static */ void SuggestionsOutputUtils::outputSuggestions( 35 const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, 36 SuggestionResults *const outSuggestionResults) { 37#if DEBUG_EVALUATE_MOST_PROBABLE_STRING 38 const int terminalSize = 0; 39#else 40 const int terminalSize = traverseSession->getDicTraverseCache()->terminalSize(); 41#endif 42 std::vector<DicNode> terminals(terminalSize); 43 for (int index = terminalSize - 1; index >= 0; --index) { 44 traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]); 45 } 46 47 const float languageWeight = scoringPolicy->getAdjustedLanguageWeight( 48 traverseSession, terminals.data(), terminalSize); 49 // Force autocorrection for obvious long multi-word suggestions when the top suggestion is 50 // a long multiple words suggestion. 51 // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. 52 const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop() 53 && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT 54 && !terminals.empty() && terminals.front().hasMultipleWords()); 55 // TODO: have partial commit work even with multiple pointers. 56 const bool outputSecondWordFirstLetterInputIndex = 57 traverseSession->isOnlyOnePointerUsed(0 /* pointerId */); 58 const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()-> 59 getHeaderStructurePolicy()->shouldBoostExactMatches(); 60 61 // Output suggestion results here 62 for (auto &terminalDicNode : terminals) { 63 outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode, 64 languageWeight, boostExactMatches, forceCommitMultiWords, 65 outputSecondWordFirstLetterInputIndex, outSuggestionResults); 66 } 67 scoringPolicy->getMostProbableString(traverseSession, languageWeight, outSuggestionResults); 68} 69 70/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode( 71 const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, 72 const DicNode *const terminalDicNode, const float languageWeight, 73 const bool boostExactMatches, const bool forceCommitMultiWords, 74 const bool outputSecondWordFirstLetterInputIndex, 75 SuggestionResults *const outSuggestionResults) { 76 if (DEBUG_GEO_FULL) { 77 terminalDicNode->dump("OUT:"); 78 } 79 const float doubleLetterCost = 80 scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode); 81 const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) 82 + doubleLetterCost; 83 const bool isPossiblyOffensiveWord = 84 traverseSession->getDictionaryStructurePolicy()->getProbability( 85 terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; 86 const bool isExactMatch = 87 ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); 88 const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); 89 // Heuristic: We exclude probability=0 first-char-uppercase words from exact match. 90 // (e.g. "AMD" and "and") 91 const bool isSafeExactMatch = isExactMatch 92 && !(isPossiblyOffensiveWord && isFirstCharUppercase); 93 const int outputTypeFlags = 94 (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) 95 | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0); 96 97 // Entries that are blacklisted or do not represent a word should not be output. 98 const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); 99 100 // Increase output score of top typing suggestion to ensure autocorrection. 101 // TODO: Better integration with java side autocorrection logic. 102 const int finalScore = scoringPolicy->calculateFinalScore( 103 compoundDistance, traverseSession->getInputSize(), 104 terminalDicNode->getContainedErrorTypes(), 105 (forceCommitMultiWords && terminalDicNode->hasMultipleWords()) 106 || (isValidWord && scoringPolicy->doesAutoCorrectValidWord()), 107 boostExactMatches); 108 109 // Don't output invalid words. However, we still need to submit their shortcuts if any. 110 if (isValidWord) { 111 int codePoints[MAX_WORD_LENGTH]; 112 terminalDicNode->outputResult(codePoints); 113 const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ? 114 terminalDicNode->getSecondWordFirstInputIndex( 115 traverseSession->getProximityInfoState(0)) : 116 NOT_AN_INDEX; 117 outSuggestionResults->addSuggestion(codePoints, 118 terminalDicNode->getTotalNodeCodePointCount(), 119 finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags, 120 indexToPartialCommit, computeFirstWordConfidence(terminalDicNode)); 121 } 122 123 // Output shortcuts. 124 // Shortcut is not supported for multiple words suggestions. 125 // TODO: Check shortcuts during traversal for multiple words suggestions. 126 if (!terminalDicNode->hasMultipleWords()) { 127 BinaryDictionaryShortcutIterator shortcutIt( 128 traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(), 129 traverseSession->getDictionaryStructurePolicy() 130 ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos())); 131 const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode); 132 const int shortcutBaseScore = scoringPolicy->doesAutoCorrectValidWord() ? 133 scoringPolicy->calculateFinalScore(compoundDistance, 134 traverseSession->getInputSize(), 135 terminalDicNode->getContainedErrorTypes(), 136 true /* forceCommit */, boostExactMatches) : finalScore; 137 outputShortcuts(&shortcutIt, shortcutBaseScore, sameAsTyped, outSuggestionResults); 138 } 139} 140 141/* static */ int SuggestionsOutputUtils::computeFirstWordConfidence( 142 const DicNode *const terminalDicNode) { 143 // Get the number of spaces in the first suggestion 144 const int spaceCount = terminalDicNode->getTotalNodeSpaceCount(); 145 // Get the number of characters in the first suggestion 146 const int length = terminalDicNode->getTotalNodeCodePointCount(); 147 // Get the distance for the first word of the suggestion 148 const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord(); 149 150 // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000. 151 // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or 152 // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means 153 // we are very confident. 154 // Expected space count is 1 ~ 5 155 static const int MIN_EXPECTED_SPACE_COUNT = 1; 156 static const int MAX_EXPECTED_SPACE_COUNT = 5; 157 // Expected length is about 4 ~ 30 158 static const int MIN_EXPECTED_LENGTH = 4; 159 static const int MAX_EXPECTED_LENGTH = 30; 160 // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0 161 static const float MIN_EXPECTED_DISTANCE = 0.0; 162 static const float MAX_EXPECTED_DISTANCE = 2.0; 163 // This is not strict: it's where most stuff will be falling, but it's still fine if it's 164 // outside these values. We want to output a value that reflects all of these. Each factor 165 // contributes a bit. 166 167 // We need at least a space. 168 if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE; 169 170 // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0 171 // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the 172 // weight of the distance. Clamp to avoid overflows. 173 const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE 174 : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance; 175 const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT 176 * (MAX_EXPECTED_DISTANCE - clampedDistance) 177 / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE); 178 // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no 179 // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the 180 // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp. 181 const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT 182 * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH); 183 // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no 184 // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the 185 // weight of the space count. 186 const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT 187 * (spaceCount - MIN_EXPECTED_SPACE_COUNT) 188 / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT); 189 190 return distanceContribution + lengthContribution + spaceContribution; 191} 192 193/* static */ void SuggestionsOutputUtils::outputShortcuts( 194 BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore, 195 const bool sameAsTyped, SuggestionResults *const outSuggestionResults) { 196 int shortcutTarget[MAX_WORD_LENGTH]; 197 while (shortcutIt->hasNextShortcutTarget()) { 198 bool isWhilelist; 199 int shortcutTargetStringLength; 200 shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget, 201 &shortcutTargetStringLength, &isWhilelist); 202 int shortcutScore; 203 int kind; 204 if (isWhilelist && sameAsTyped) { 205 shortcutScore = S_INT_MAX; 206 kind = Dictionary::KIND_WHITELIST; 207 } else { 208 // shortcut entry's score == its base entry's score - 1 209 shortcutScore = finalScore; 210 // Protection against int underflow 211 shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1; 212 kind = Dictionary::KIND_SHORTCUT; 213 } 214 outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength, 215 std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX, 216 NOT_A_FIRST_WORD_CONFIDENCE); 217 } 218} 219} // namespace latinime 220