suggestions_output_utils.cpp revision 5b69472d56eaebecab772012b5a590c5c9ce7511
1/* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "suggest/core/result/suggestions_output_utils.h" 18 19#include <algorithm> 20#include <vector> 21 22#include "suggest/core/dicnode/dic_node.h" 23#include "suggest/core/dicnode/dic_node_utils.h" 24#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" 25#include "suggest/core/dictionary/error_type_utils.h" 26#include "suggest/core/policy/scoring.h" 27#include "suggest/core/result/suggestion_results.h" 28#include "suggest/core/session/dic_traverse_session.h" 29#include "suggest/core/suggest_options.h" 30 31namespace latinime { 32 33const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; 34 35/* static */ void SuggestionsOutputUtils::outputSuggestions( 36 const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, 37 const float languageWeight, SuggestionResults *const outSuggestionResults) { 38#if DEBUG_EVALUATE_MOST_PROBABLE_STRING 39 const int terminalSize = 0; 40#else 41 const int terminalSize = traverseSession->getDicTraverseCache()->terminalSize(); 42#endif 43 std::vector<DicNode> terminals(terminalSize); 44 for (int index = terminalSize - 1; index >= 0; --index) { 45 traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]); 46 } 47 // Compute a language weight when an invalid language weight is passed. 48 // NOT_A_LANGUAGE_WEIGHT (-1) is assumed as an invalid language weight. 49 const float languageWeightToOutputSuggestions = (languageWeight < 0.0f) ? 50 scoringPolicy->getAdjustedLanguageWeight( 51 traverseSession, terminals.data(), terminalSize) : languageWeight; 52 outSuggestionResults->setLanguageWeight(languageWeightToOutputSuggestions); 53 // Force autocorrection for obvious long multi-word suggestions when the top suggestion is 54 // a long multiple words suggestion. 55 // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. 56 const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop() 57 && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT 58 && !terminals.empty() && terminals.front().hasMultipleWords()); 59 // TODO: have partial commit work even with multiple pointers. 60 const bool outputSecondWordFirstLetterInputIndex = 61 traverseSession->isOnlyOnePointerUsed(0 /* pointerId */); 62 const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()-> 63 getHeaderStructurePolicy()->shouldBoostExactMatches(); 64 65 // Output suggestion results here 66 for (auto &terminalDicNode : terminals) { 67 outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode, 68 languageWeightToOutputSuggestions, boostExactMatches, forceCommitMultiWords, 69 outputSecondWordFirstLetterInputIndex, outSuggestionResults); 70 } 71 scoringPolicy->getMostProbableString(traverseSession, languageWeightToOutputSuggestions, 72 outSuggestionResults); 73} 74 75/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode( 76 const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, 77 const DicNode *const terminalDicNode, const float languageWeight, 78 const bool boostExactMatches, const bool forceCommitMultiWords, 79 const bool outputSecondWordFirstLetterInputIndex, 80 SuggestionResults *const outSuggestionResults) { 81 if (DEBUG_GEO_FULL) { 82 terminalDicNode->dump("OUT:"); 83 } 84 const float doubleLetterCost = 85 scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode); 86 const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) 87 + doubleLetterCost; 88 const bool isPossiblyOffensiveWord = 89 traverseSession->getDictionaryStructurePolicy()->getProbability( 90 terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; 91 const bool isExactMatch = 92 ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); 93 const bool isExactMatchWithIntentionalOmission = 94 ErrorTypeUtils::isExactMatchWithIntentionalOmission( 95 terminalDicNode->getContainedErrorTypes()); 96 const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); 97 // Heuristic: We exclude probability=0 first-char-uppercase words from exact match. 98 // (e.g. "AMD" and "and") 99 const bool isSafeExactMatch = isExactMatch 100 && !(isPossiblyOffensiveWord && isFirstCharUppercase); 101 const int outputTypeFlags = 102 (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) 103 | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) 104 | (isExactMatchWithIntentionalOmission ? 105 Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0); 106 107 // Entries that are blacklisted or do not represent a word should not be output. 108 const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); 109 // When we have to block offensive words, non-exact matched offensive words should not be 110 // output. 111 const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords(); 112 const bool isBlockedOffensiveWord = blockOffensiveWords && isPossiblyOffensiveWord 113 && !isSafeExactMatch; 114 115 // Increase output score of top typing suggestion to ensure autocorrection. 116 // TODO: Better integration with java side autocorrection logic. 117 const int finalScore = scoringPolicy->calculateFinalScore( 118 compoundDistance, traverseSession->getInputSize(), 119 terminalDicNode->getContainedErrorTypes(), 120 (forceCommitMultiWords && terminalDicNode->hasMultipleWords()), 121 boostExactMatches); 122 123 // Don't output invalid or blocked offensive words. However, we still need to submit their 124 // shortcuts if any. 125 if (isValidWord && !isBlockedOffensiveWord) { 126 int codePoints[MAX_WORD_LENGTH]; 127 terminalDicNode->outputResult(codePoints); 128 const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ? 129 terminalDicNode->getSecondWordFirstInputIndex( 130 traverseSession->getProximityInfoState(0)) : 131 NOT_AN_INDEX; 132 outSuggestionResults->addSuggestion(codePoints, 133 terminalDicNode->getTotalNodeCodePointCount(), 134 finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags, 135 indexToPartialCommit, computeFirstWordConfidence(terminalDicNode)); 136 } 137 138 // Output shortcuts. 139 // Shortcut is not supported for multiple words suggestions. 140 // TODO: Check shortcuts during traversal for multiple words suggestions. 141 if (!terminalDicNode->hasMultipleWords()) { 142 BinaryDictionaryShortcutIterator shortcutIt( 143 traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(), 144 traverseSession->getDictionaryStructurePolicy() 145 ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos())); 146 const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode); 147 outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults); 148 } 149} 150 151/* static */ int SuggestionsOutputUtils::computeFirstWordConfidence( 152 const DicNode *const terminalDicNode) { 153 // Get the number of spaces in the first suggestion 154 const int spaceCount = terminalDicNode->getTotalNodeSpaceCount(); 155 // Get the number of characters in the first suggestion 156 const int length = terminalDicNode->getTotalNodeCodePointCount(); 157 // Get the distance for the first word of the suggestion 158 const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord(); 159 160 // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000. 161 // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or 162 // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means 163 // we are very confident. 164 // Expected space count is 1 ~ 5 165 static const int MIN_EXPECTED_SPACE_COUNT = 1; 166 static const int MAX_EXPECTED_SPACE_COUNT = 5; 167 // Expected length is about 4 ~ 30 168 static const int MIN_EXPECTED_LENGTH = 4; 169 static const int MAX_EXPECTED_LENGTH = 30; 170 // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0 171 static const float MIN_EXPECTED_DISTANCE = 0.0; 172 static const float MAX_EXPECTED_DISTANCE = 2.0; 173 // This is not strict: it's where most stuff will be falling, but it's still fine if it's 174 // outside these values. We want to output a value that reflects all of these. Each factor 175 // contributes a bit. 176 177 // We need at least a space. 178 if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE; 179 180 // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0 181 // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the 182 // weight of the distance. Clamp to avoid overflows. 183 const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE 184 : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance; 185 const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT 186 * (MAX_EXPECTED_DISTANCE - clampedDistance) 187 / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE); 188 // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no 189 // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the 190 // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp. 191 const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT 192 * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH); 193 // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no 194 // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the 195 // weight of the space count. 196 const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT 197 * (spaceCount - MIN_EXPECTED_SPACE_COUNT) 198 / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT); 199 200 return distanceContribution + lengthContribution + spaceContribution; 201} 202 203/* static */ void SuggestionsOutputUtils::outputShortcuts( 204 BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore, 205 const bool sameAsTyped, SuggestionResults *const outSuggestionResults) { 206 int shortcutTarget[MAX_WORD_LENGTH]; 207 while (shortcutIt->hasNextShortcutTarget()) { 208 bool isWhilelist; 209 int shortcutTargetStringLength; 210 shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget, 211 &shortcutTargetStringLength, &isWhilelist); 212 int shortcutScore; 213 int kind; 214 if (isWhilelist && sameAsTyped) { 215 shortcutScore = S_INT_MAX; 216 kind = Dictionary::KIND_WHITELIST; 217 } else { 218 // shortcut entry's score == its base entry's score - 1 219 shortcutScore = finalScore; 220 // Protection against int underflow 221 shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1; 222 kind = Dictionary::KIND_SHORTCUT; 223 } 224 outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength, 225 std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX, 226 NOT_A_FIRST_WORD_CONFIDENCE); 227 } 228} 229} // namespace latinime 230