dic_node_utils.cpp revision b179199830d198473154cfe56b3d712966a16c6f
1/* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "suggest/core/dicnode/dic_node_utils.h" 18 19#include <cstring> 20 21#include "suggest/core/dicnode/dic_node.h" 22#include "suggest/core/dicnode/dic_node_proximity_filter.h" 23#include "suggest/core/dicnode/dic_node_vector.h" 24#include "suggest/core/dictionary/binary_dictionary_info.h" 25#include "suggest/core/dictionary/binary_format.h" 26#include "suggest/core/dictionary/multi_bigram_map.h" 27#include "suggest/core/dictionary/probability_utils.h" 28#include "suggest/core/policy/dictionary_structure_policy.h" 29#include "utils/char_utils.h" 30 31namespace latinime { 32 33/////////////////////////////// 34// Node initialization utils // 35/////////////////////////////// 36 37/* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo, 38 const int prevWordNodePos, DicNode *const newRootNode) { 39 newRootNode->initAsRoot(binaryDictionaryInfo->getStructurePolicy()->getRootPosition(), 40 prevWordNodePos); 41} 42 43/*static */ void DicNodeUtils::initAsRootWithPreviousWord( 44 const BinaryDictionaryInfo *const binaryDictionaryInfo, 45 DicNode *const prevWordLastNode, DicNode *const newRootNode) { 46 newRootNode->initAsRootWithPreviousWord( 47 prevWordLastNode, binaryDictionaryInfo->getStructurePolicy()->getRootPosition()); 48} 49 50/* static */ void DicNodeUtils::initByCopy(DicNode *srcNode, DicNode *destNode) { 51 destNode->initByCopy(srcNode); 52} 53 54/////////////////////////////////// 55// Traverse node expansion utils // 56/////////////////////////////////// 57 58/* static */ void DicNodeUtils::createAndGetPassingChildNode(DicNode *dicNode, 59 const DicNodeProximityFilter *const childrenFilter, 60 DicNodeVector *childDicNodes) { 61 // Passing multiple chars node. No need to traverse child 62 const int codePoint = dicNode->getNodeTypedCodePoint(); 63 const int baseLowerCaseCodePoint = CharUtils::toBaseLowerCase(codePoint); 64 if (!childrenFilter->isFilteredOut(codePoint) 65 || CharUtils::isIntentionalOmissionCodePoint(baseLowerCaseCodePoint)) { 66 childDicNodes->pushPassingChild(dicNode); 67 } 68} 69 70/* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos, 71 const BinaryDictionaryInfo *const binaryDictionaryInfo, 72 const DicNodeProximityFilter *const childrenFilter, 73 DicNodeVector *childDicNodes) { 74 int nextPos = pos; 75 const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer( 76 binaryDictionaryInfo->getDictRoot(), &pos); 77 const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); 78 const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); 79 const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); 80 const bool isBlacklistedOrNotAWord = BinaryFormat::hasBlacklistedOrNotAWordFlag(flags); 81 82 int codePoint = BinaryFormat::getCodePointAndForwardPointer( 83 binaryDictionaryInfo->getDictRoot(), &pos); 84 ASSERT(NOT_A_CODE_POINT != codePoint); 85 // TODO: optimize this 86 int mergedNodeCodePoints[MAX_WORD_LENGTH]; 87 uint16_t mergedNodeCodePointCount = 0; 88 mergedNodeCodePoints[mergedNodeCodePointCount++] = codePoint; 89 90 do { 91 const int nextCodePoint = hasMultipleChars 92 ? BinaryFormat::getCodePointAndForwardPointer( 93 binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT; 94 const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint); 95 if (!isLastChar) { 96 mergedNodeCodePoints[mergedNodeCodePointCount++] = nextCodePoint; 97 } 98 codePoint = nextCodePoint; 99 } while (NOT_A_CODE_POINT != codePoint); 100 101 const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer( 102 binaryDictionaryInfo->getDictRoot(), pos) : NOT_A_PROBABILITY; 103 pos = BinaryFormat::skipProbability(flags, pos); 104 int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition( 105 binaryDictionaryInfo->getDictRoot(), flags, pos) : NOT_A_DICT_POS; 106 const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes( 107 binaryDictionaryInfo->getDictRoot(), flags, pos); 108 109 if (childrenFilter->isFilteredOut(mergedNodeCodePoints[0])) { 110 return siblingPos; 111 } 112 childDicNodes->pushLeavingChild(dicNode, nextPos, childrenPos, probability, isTerminal, 113 hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount, mergedNodeCodePoints); 114 return siblingPos; 115} 116 117/* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode, 118 const BinaryDictionaryInfo *const binaryDictionaryInfo, 119 const DicNodeProximityFilter *const childrenFilter, DicNodeVector *childDicNodes) { 120 if (!dicNode->hasChildren()) { 121 return; 122 } 123 int nextPos = dicNode->getChildrenPos(); 124 const int childCount = BinaryFormat::getGroupCountAndForwardPointer( 125 binaryDictionaryInfo->getDictRoot(), &nextPos); 126 for (int i = 0; i < childCount; i++) { 127 nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo, 128 childrenFilter, childDicNodes); 129 } 130} 131 132/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, 133 const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes) { 134 getProximityChildDicNodes(dicNode, binaryDictionaryInfo, 0, 0, false, childDicNodes); 135} 136 137/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode, 138 const BinaryDictionaryInfo *const binaryDictionaryInfo, 139 const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly, 140 DicNodeVector *childDicNodes) { 141 if (dicNode->isTotalInputSizeExceedingLimit()) { 142 return; 143 } 144 const DicNodeProximityFilter childrenFilter(pInfoState, pointIndex, exactOnly); 145 if (!dicNode->isLeavingNode()) { 146 DicNodeUtils::createAndGetPassingChildNode(dicNode, &childrenFilter, childDicNodes); 147 } else { 148 DicNodeUtils::createAndGetAllLeavingChildNodes( 149 dicNode, binaryDictionaryInfo, &childrenFilter, childDicNodes); 150 } 151} 152 153/////////////////// 154// Scoring utils // 155/////////////////// 156/** 157 * Computes the combined bigram / unigram cost for the given dicNode. 158 */ 159/* static */ float DicNodeUtils::getBigramNodeImprobability( 160 const BinaryDictionaryInfo *const binaryDictionaryInfo, 161 const DicNode *const node, MultiBigramMap *multiBigramMap) { 162 if (node->hasMultipleWords() && !node->isValidMultipleWordSuggestion()) { 163 return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); 164 } 165 const int probability = getBigramNodeProbability(binaryDictionaryInfo, node, multiBigramMap); 166 // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. 167 const float cost = static_cast<float>(MAX_PROBABILITY - probability) 168 / static_cast<float>(MAX_PROBABILITY); 169 return cost; 170} 171 172/* static */ int DicNodeUtils::getBigramNodeProbability( 173 const BinaryDictionaryInfo *const binaryDictionaryInfo, 174 const DicNode *const node, MultiBigramMap *multiBigramMap) { 175 const int unigramProbability = node->getProbability(); 176 const int wordPos = node->getPos(); 177 const int prevWordPos = node->getPrevWordPos(); 178 if (NOT_A_VALID_WORD_POS == wordPos || NOT_A_VALID_WORD_POS == prevWordPos) { 179 // Note: Normally wordPos comes from the dictionary and should never equal 180 // NOT_A_VALID_WORD_POS. 181 return ProbabilityUtils::backoff(unigramProbability); 182 } 183 if (multiBigramMap) { 184 return multiBigramMap->getBigramProbability( 185 binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability); 186 } 187 return ProbabilityUtils::backoff(unigramProbability); 188} 189 190//////////////// 191// Char utils // 192//////////////// 193 194// TODO: Move to char_utils? 195/* static */ int DicNodeUtils::appendTwoWords(const int *const src0, const int16_t length0, 196 const int *const src1, const int16_t length1, int *dest) { 197 int actualLength0 = 0; 198 for (int i = 0; i < length0; ++i) { 199 if (src0[i] == 0) { 200 break; 201 } 202 actualLength0 = i + 1; 203 } 204 actualLength0 = min(actualLength0, MAX_WORD_LENGTH); 205 memcpy(dest, src0, actualLength0 * sizeof(dest[0])); 206 if (!src1 || length1 == 0) { 207 return actualLength0; 208 } 209 int actualLength1 = 0; 210 for (int i = 0; i < length1; ++i) { 211 if (src1[i] == 0) { 212 break; 213 } 214 actualLength1 = i + 1; 215 } 216 actualLength1 = min(actualLength1, MAX_WORD_LENGTH - actualLength0 - 1); 217 memcpy(&dest[actualLength0], src1, actualLength1 * sizeof(dest[0])); 218 return actualLength0 + actualLength1; 219} 220} // namespace latinime 221