dic_node_utils.cpp revision b179199830d198473154cfe56b3d712966a16c6f
1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "suggest/core/dicnode/dic_node_utils.h"
18
19#include <cstring>
20
21#include "suggest/core/dicnode/dic_node.h"
22#include "suggest/core/dicnode/dic_node_proximity_filter.h"
23#include "suggest/core/dicnode/dic_node_vector.h"
24#include "suggest/core/dictionary/binary_dictionary_info.h"
25#include "suggest/core/dictionary/binary_format.h"
26#include "suggest/core/dictionary/multi_bigram_map.h"
27#include "suggest/core/dictionary/probability_utils.h"
28#include "suggest/core/policy/dictionary_structure_policy.h"
29#include "utils/char_utils.h"
30
31namespace latinime {
32
33///////////////////////////////
34// Node initialization utils //
35///////////////////////////////
36
37/* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo,
38        const int prevWordNodePos, DicNode *const newRootNode) {
39    newRootNode->initAsRoot(binaryDictionaryInfo->getStructurePolicy()->getRootPosition(),
40            prevWordNodePos);
41}
42
43/*static */ void DicNodeUtils::initAsRootWithPreviousWord(
44        const BinaryDictionaryInfo *const binaryDictionaryInfo,
45        DicNode *const prevWordLastNode, DicNode *const newRootNode) {
46    newRootNode->initAsRootWithPreviousWord(
47            prevWordLastNode, binaryDictionaryInfo->getStructurePolicy()->getRootPosition());
48}
49
50/* static */ void DicNodeUtils::initByCopy(DicNode *srcNode, DicNode *destNode) {
51    destNode->initByCopy(srcNode);
52}
53
54///////////////////////////////////
55// Traverse node expansion utils //
56///////////////////////////////////
57
58/* static */ void DicNodeUtils::createAndGetPassingChildNode(DicNode *dicNode,
59        const DicNodeProximityFilter *const childrenFilter,
60        DicNodeVector *childDicNodes) {
61    // Passing multiple chars node. No need to traverse child
62    const int codePoint = dicNode->getNodeTypedCodePoint();
63    const int baseLowerCaseCodePoint = CharUtils::toBaseLowerCase(codePoint);
64    if (!childrenFilter->isFilteredOut(codePoint)
65            || CharUtils::isIntentionalOmissionCodePoint(baseLowerCaseCodePoint)) {
66        childDicNodes->pushPassingChild(dicNode);
67    }
68}
69
70/* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos,
71        const BinaryDictionaryInfo *const binaryDictionaryInfo,
72        const DicNodeProximityFilter *const childrenFilter,
73        DicNodeVector *childDicNodes) {
74    int nextPos = pos;
75    const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(
76            binaryDictionaryInfo->getDictRoot(), &pos);
77    const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags));
78    const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags));
79    const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags);
80    const bool isBlacklistedOrNotAWord = BinaryFormat::hasBlacklistedOrNotAWordFlag(flags);
81
82    int codePoint = BinaryFormat::getCodePointAndForwardPointer(
83            binaryDictionaryInfo->getDictRoot(), &pos);
84    ASSERT(NOT_A_CODE_POINT != codePoint);
85    // TODO: optimize this
86    int mergedNodeCodePoints[MAX_WORD_LENGTH];
87    uint16_t mergedNodeCodePointCount = 0;
88    mergedNodeCodePoints[mergedNodeCodePointCount++] = codePoint;
89
90    do {
91        const int nextCodePoint = hasMultipleChars
92                ? BinaryFormat::getCodePointAndForwardPointer(
93                        binaryDictionaryInfo->getDictRoot(), &pos) : NOT_A_CODE_POINT;
94        const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint);
95        if (!isLastChar) {
96            mergedNodeCodePoints[mergedNodeCodePointCount++] = nextCodePoint;
97        }
98        codePoint = nextCodePoint;
99    } while (NOT_A_CODE_POINT != codePoint);
100
101    const int probability = isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(
102            binaryDictionaryInfo->getDictRoot(), pos) : NOT_A_PROBABILITY;
103    pos = BinaryFormat::skipProbability(flags, pos);
104    int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(
105            binaryDictionaryInfo->getDictRoot(), flags, pos) : NOT_A_DICT_POS;
106    const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(
107            binaryDictionaryInfo->getDictRoot(), flags, pos);
108
109    if (childrenFilter->isFilteredOut(mergedNodeCodePoints[0])) {
110        return siblingPos;
111    }
112    childDicNodes->pushLeavingChild(dicNode, nextPos, childrenPos, probability, isTerminal,
113            hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount, mergedNodeCodePoints);
114    return siblingPos;
115}
116
117/* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode,
118        const BinaryDictionaryInfo *const binaryDictionaryInfo,
119        const DicNodeProximityFilter *const childrenFilter, DicNodeVector *childDicNodes) {
120    if (!dicNode->hasChildren()) {
121        return;
122    }
123    int nextPos = dicNode->getChildrenPos();
124    const int childCount = BinaryFormat::getGroupCountAndForwardPointer(
125            binaryDictionaryInfo->getDictRoot(), &nextPos);
126    for (int i = 0; i < childCount; i++) {
127        nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo,
128                childrenFilter, childDicNodes);
129    }
130}
131
132/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode,
133        const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes) {
134    getProximityChildDicNodes(dicNode, binaryDictionaryInfo, 0, 0, false, childDicNodes);
135}
136
137/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode,
138        const BinaryDictionaryInfo *const binaryDictionaryInfo,
139        const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly,
140        DicNodeVector *childDicNodes) {
141    if (dicNode->isTotalInputSizeExceedingLimit()) {
142        return;
143    }
144    const DicNodeProximityFilter childrenFilter(pInfoState, pointIndex, exactOnly);
145    if (!dicNode->isLeavingNode()) {
146        DicNodeUtils::createAndGetPassingChildNode(dicNode, &childrenFilter, childDicNodes);
147    } else {
148        DicNodeUtils::createAndGetAllLeavingChildNodes(
149                dicNode, binaryDictionaryInfo, &childrenFilter, childDicNodes);
150    }
151}
152
153///////////////////
154// Scoring utils //
155///////////////////
156/**
157 * Computes the combined bigram / unigram cost for the given dicNode.
158 */
159/* static */ float DicNodeUtils::getBigramNodeImprobability(
160        const BinaryDictionaryInfo *const binaryDictionaryInfo,
161        const DicNode *const node, MultiBigramMap *multiBigramMap) {
162    if (node->hasMultipleWords() && !node->isValidMultipleWordSuggestion()) {
163        return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
164    }
165    const int probability = getBigramNodeProbability(binaryDictionaryInfo, node, multiBigramMap);
166    // TODO: This equation to calculate the improbability looks unreasonable.  Investigate this.
167    const float cost = static_cast<float>(MAX_PROBABILITY - probability)
168            / static_cast<float>(MAX_PROBABILITY);
169    return cost;
170}
171
172/* static */ int DicNodeUtils::getBigramNodeProbability(
173        const BinaryDictionaryInfo *const binaryDictionaryInfo,
174        const DicNode *const node, MultiBigramMap *multiBigramMap) {
175    const int unigramProbability = node->getProbability();
176    const int wordPos = node->getPos();
177    const int prevWordPos = node->getPrevWordPos();
178    if (NOT_A_VALID_WORD_POS == wordPos || NOT_A_VALID_WORD_POS == prevWordPos) {
179        // Note: Normally wordPos comes from the dictionary and should never equal
180        // NOT_A_VALID_WORD_POS.
181        return ProbabilityUtils::backoff(unigramProbability);
182    }
183    if (multiBigramMap) {
184        return multiBigramMap->getBigramProbability(
185                binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability);
186    }
187    return ProbabilityUtils::backoff(unigramProbability);
188}
189
190////////////////
191// Char utils //
192////////////////
193
194// TODO: Move to char_utils?
195/* static */ int DicNodeUtils::appendTwoWords(const int *const src0, const int16_t length0,
196        const int *const src1, const int16_t length1, int *dest) {
197    int actualLength0 = 0;
198    for (int i = 0; i < length0; ++i) {
199        if (src0[i] == 0) {
200            break;
201        }
202        actualLength0 = i + 1;
203    }
204    actualLength0 = min(actualLength0, MAX_WORD_LENGTH);
205    memcpy(dest, src0, actualLength0 * sizeof(dest[0]));
206    if (!src1 || length1 == 0) {
207        return actualLength0;
208    }
209    int actualLength1 = 0;
210    for (int i = 0; i < length1; ++i) {
211        if (src1[i] == 0) {
212            break;
213        }
214        actualLength1 = i + 1;
215    }
216    actualLength1 = min(actualLength1, MAX_WORD_LENGTH - actualLength0 - 1);
217    memcpy(&dest[actualLength0], src1, actualLength1 * sizeof(dest[0]));
218    return actualLength0 + actualLength1;
219}
220} // namespace latinime
221