suggestions_output_utils.cpp revision a0ea92f76c2e73e4b4bbb59680bc6cd3c33f0573
1/* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "suggest/core/result/suggestions_output_utils.h" 18 19#include <algorithm> 20 21#include "suggest/core/dicnode/dic_node.h" 22#include "suggest/core/dicnode/dic_node_utils.h" 23#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" 24#include "suggest/core/dictionary/dictionary.h" 25#include "suggest/core/dictionary/error_type_utils.h" 26#include "suggest/core/policy/scoring.h" 27#include "suggest/core/result/suggestion_results.h" 28#include "suggest/core/session/dic_traverse_session.h" 29 30namespace latinime { 31 32const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; 33 34// TODO: Split this method. 35/* static */ void SuggestionsOutputUtils::outputSuggestions( 36 const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, 37 SuggestionResults *const outSuggestionResults) { 38#if DEBUG_EVALUATE_MOST_PROBABLE_STRING 39 const int terminalSize = 0; 40#else 41 const int terminalSize = std::min(MAX_RESULTS, 42 static_cast<int>(traverseSession->getDicTraverseCache()->terminalSize())); 43#endif 44 DicNode terminals[MAX_RESULTS]; // Avoiding non-POD variable length array 45 46 for (int index = terminalSize - 1; index >= 0; --index) { 47 traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]); 48 } 49 50 const float languageWeight = scoringPolicy->getAdjustedLanguageWeight( 51 traverseSession, terminals, terminalSize); 52 // Force autocorrection for obvious long multi-word suggestions when the top suggestion is 53 // a long multiple words suggestion. 54 // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. 55 const bool forceCommitMultiWords = scoringPolicy->autoCorrectsToMultiWordSuggestionIfTop() 56 && (traverseSession->getInputSize() >= MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT 57 && terminals[0].hasMultipleWords()); 58 // TODO: have partial commit work even with multiple pointers. 59 const bool outputSecondWordFirstLetterInputIndex = 60 traverseSession->isOnlyOnePointerUsed(0 /* pointerId */); 61 const bool boostExactMatches = traverseSession->getDictionaryStructurePolicy()-> 62 getHeaderStructurePolicy()->shouldBoostExactMatches(); 63 64 int codePoints[MAX_WORD_LENGTH]; 65 // Output suggestion results here 66 for (int terminalIndex = 0; terminalIndex < terminalSize; ++terminalIndex) { 67 DicNode *terminalDicNode = &terminals[terminalIndex]; 68 if (DEBUG_GEO_FULL) { 69 terminalDicNode->dump("OUT:"); 70 } 71 const float doubleLetterCost = 72 scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode); 73 const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) 74 + doubleLetterCost; 75 const bool isPossiblyOffensiveWord = 76 traverseSession->getDictionaryStructurePolicy()->getProbability( 77 terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; 78 const bool isExactMatch = 79 ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); 80 const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); 81 // Heuristic: We exclude probability=0 first-char-uppercase words from exact match. 82 // (e.g. "AMD" and "and") 83 const bool isSafeExactMatch = isExactMatch 84 && !(isPossiblyOffensiveWord && isFirstCharUppercase); 85 const int outputTypeFlags = 86 (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) 87 | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0); 88 89 // Entries that are blacklisted or do not represent a word should not be output. 90 const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); 91 92 // Increase output score of top typing suggestion to ensure autocorrection. 93 // TODO: Better integration with java side autocorrection logic. 94 const int finalScore = scoringPolicy->calculateFinalScore( 95 compoundDistance, traverseSession->getInputSize(), 96 terminalDicNode->getContainedErrorTypes(), 97 (forceCommitMultiWords && terminalDicNode->hasMultipleWords()) 98 || (isValidWord && scoringPolicy->doesAutoCorrectValidWord()), 99 boostExactMatches); 100 101 // Don't output invalid words. However, we still need to submit their shortcuts if any. 102 if (isValidWord) { 103 terminalDicNode->outputResult(codePoints); 104 const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ? 105 terminalDicNode->getSecondWordFirstInputIndex( 106 traverseSession->getProximityInfoState(0)) : 107 NOT_AN_INDEX; 108 outSuggestionResults->addSuggestion(codePoints, 109 terminalDicNode->getTotalNodeCodePointCount(), 110 finalScore, Dictionary::KIND_CORRECTION | outputTypeFlags, 111 indexToPartialCommit, computeFirstWordConfidence(terminalDicNode)); 112 } 113 114 if (!terminalDicNode->hasMultipleWords()) { 115 BinaryDictionaryShortcutIterator shortcutIt( 116 traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(), 117 traverseSession->getDictionaryStructurePolicy() 118 ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos())); 119 // Shortcut is not supported for multiple words suggestions. 120 // TODO: Check shortcuts during traversal for multiple words suggestions. 121 const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode); 122 const int shortcutBaseScore = scoringPolicy->doesAutoCorrectValidWord() ? 123 scoringPolicy->calculateFinalScore(compoundDistance, 124 traverseSession->getInputSize(), 125 terminalDicNode->getContainedErrorTypes(), 126 true /* forceCommit */, boostExactMatches) : finalScore; 127 outputShortcuts(&shortcutIt, shortcutBaseScore, sameAsTyped, outSuggestionResults); 128 } 129 } 130 scoringPolicy->getMostProbableString(traverseSession, languageWeight, outSuggestionResults); 131} 132 133/* static */ int SuggestionsOutputUtils::computeFirstWordConfidence( 134 const DicNode *const terminalDicNode) { 135 // Get the number of spaces in the first suggestion 136 const int spaceCount = terminalDicNode->getTotalNodeSpaceCount(); 137 // Get the number of characters in the first suggestion 138 const int length = terminalDicNode->getTotalNodeCodePointCount(); 139 // Get the distance for the first word of the suggestion 140 const float distance = terminalDicNode->getNormalizedCompoundDistanceAfterFirstWord(); 141 142 // Arbitrarily, we give a score whose useful values range from 0 to 1,000,000. 143 // 1,000,000 will be the cutoff to auto-commit. It's fine if the number is under 0 or 144 // above 1,000,000 : under 0 just means it's very bad to commit, and above 1,000,000 means 145 // we are very confident. 146 // Expected space count is 1 ~ 5 147 static const int MIN_EXPECTED_SPACE_COUNT = 1; 148 static const int MAX_EXPECTED_SPACE_COUNT = 5; 149 // Expected length is about 4 ~ 30 150 static const int MIN_EXPECTED_LENGTH = 4; 151 static const int MAX_EXPECTED_LENGTH = 30; 152 // Expected distance is about 0.2 ~ 2.0, but consider 0.0 ~ 2.0 153 static const float MIN_EXPECTED_DISTANCE = 0.0; 154 static const float MAX_EXPECTED_DISTANCE = 2.0; 155 // This is not strict: it's where most stuff will be falling, but it's still fine if it's 156 // outside these values. We want to output a value that reflects all of these. Each factor 157 // contributes a bit. 158 159 // We need at least a space. 160 if (spaceCount < 1) return NOT_A_FIRST_WORD_CONFIDENCE; 161 162 // The smaller the edit distance, the higher the contribution. MIN_EXPECTED_DISTANCE means 0 163 // contribution, while MAX_EXPECTED_DISTANCE means full contribution according to the 164 // weight of the distance. Clamp to avoid overflows. 165 const float clampedDistance = distance < MIN_EXPECTED_DISTANCE ? MIN_EXPECTED_DISTANCE 166 : distance > MAX_EXPECTED_DISTANCE ? MAX_EXPECTED_DISTANCE : distance; 167 const int distanceContribution = DISTANCE_WEIGHT_FOR_AUTO_COMMIT 168 * (MAX_EXPECTED_DISTANCE - clampedDistance) 169 / (MAX_EXPECTED_DISTANCE - MIN_EXPECTED_DISTANCE); 170 // The larger the suggestion length, the larger the contribution. MIN_EXPECTED_LENGTH is no 171 // contribution, MAX_EXPECTED_LENGTH is full contribution according to the weight of the 172 // length. Length is guaranteed to be between 1 and 48, so we don't need to clamp. 173 const int lengthContribution = LENGTH_WEIGHT_FOR_AUTO_COMMIT 174 * (length - MIN_EXPECTED_LENGTH) / (MAX_EXPECTED_LENGTH - MIN_EXPECTED_LENGTH); 175 // The more spaces, the larger the contribution. MIN_EXPECTED_SPACE_COUNT space is no 176 // contribution, MAX_EXPECTED_SPACE_COUNT spaces is full contribution according to the 177 // weight of the space count. 178 const int spaceContribution = SPACE_COUNT_WEIGHT_FOR_AUTO_COMMIT 179 * (spaceCount - MIN_EXPECTED_SPACE_COUNT) 180 / (MAX_EXPECTED_SPACE_COUNT - MIN_EXPECTED_SPACE_COUNT); 181 182 return distanceContribution + lengthContribution + spaceContribution; 183} 184 185/* static */ void SuggestionsOutputUtils::outputShortcuts( 186 BinaryDictionaryShortcutIterator *const shortcutIt, const int finalScore, 187 const bool sameAsTyped, SuggestionResults *const outSuggestionResults) { 188 int shortcutTarget[MAX_WORD_LENGTH]; 189 while (shortcutIt->hasNextShortcutTarget()) { 190 bool isWhilelist; 191 int shortcutTargetStringLength; 192 shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget, 193 &shortcutTargetStringLength, &isWhilelist); 194 int shortcutScore; 195 int kind; 196 if (isWhilelist && sameAsTyped) { 197 shortcutScore = S_INT_MAX; 198 kind = Dictionary::KIND_WHITELIST; 199 } else { 200 // shortcut entry's score == its base entry's score - 1 201 shortcutScore = finalScore; 202 // Protection against int underflow 203 shortcutScore = std::max(S_INT_MIN + 1, shortcutScore) - 1; 204 kind = Dictionary::KIND_SHORTCUT; 205 } 206 outSuggestionResults->addSuggestion(shortcutTarget, shortcutTargetStringLength, 207 std::max(S_INT_MIN + 1, shortcutScore) - 1, kind, NOT_AN_INDEX, 208 NOT_A_FIRST_WORD_CONFIDENCE); 209 } 210} 211} // namespace latinime 212