dic_node_utils.cpp revision 38c26dd0bf8cd5c4511e4a02d5eeae4b3553f03a
138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* 238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * Copyright (C) 2012 The Android Open Source Project 338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * 438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * Licensed under the Apache License, Version 2.0 (the "License"); 538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * you may not use this file except in compliance with the License. 638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * You may obtain a copy of the License at 738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * 838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * http://www.apache.org/licenses/LICENSE-2.0 938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * 1038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * Unless required by applicable law or agreed to in writing, software 1138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * distributed under the License is distributed on an "AS IS" BASIS, 1238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * See the License for the specific language governing permissions and 1438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * limitations under the License. 1538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka */ 1638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 1738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka#include <cstring> 1838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka#include <vector> 1938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 2038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka#include "binary_format.h" 2138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka#include "dic_node.h" 2238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka#include "dic_node_utils.h" 2338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka#include "dic_node_vector.h" 2438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka#include "proximity_info.h" 2538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka#include "proximity_info_state.h" 2638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 2738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataokanamespace latinime { 2838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 2938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/////////////////////////////// 3038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka// Node initialization utils // 3138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/////////////////////////////// 3238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 3338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ void DicNodeUtils::initAsRoot(const int rootPos, const uint8_t *const dicRoot, 3438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int prevWordNodePos, DicNode *newRootNode) { 3538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int curPos = rootPos; 3638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int pos = curPos; 3738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos); 3838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int childrenPos = curPos; 3938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka newRootNode->initAsRoot(pos, childrenPos, childrenCount, prevWordNodePos); 4038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 4138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 4238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/*static */ void DicNodeUtils::initAsRootWithPreviousWord(const int rootPos, 4338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const uint8_t *const dicRoot, DicNode *prevWordLastNode, DicNode *newRootNode) { 4438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int curPos = rootPos; 4538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int pos = curPos; 4638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int childrenCount = BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &curPos); 4738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int childrenPos = curPos; 4838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka newRootNode->initAsRootWithPreviousWord(prevWordLastNode, pos, childrenPos, childrenCount); 4938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 5038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 5138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ void DicNodeUtils::initByCopy(DicNode *srcNode, DicNode *destNode) { 5238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka destNode->initByCopy(srcNode); 5338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 5438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 5538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/////////////////////////////////// 5638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka// Traverse node expansion utils // 5738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/////////////////////////////////// 5838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 5938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ void DicNodeUtils::createAndGetPassingChildNode(DicNode *dicNode, 6038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const ProximityInfoState *pInfoState, const int pointIndex, const bool exactOnly, 6138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka DicNodeVector *childDicNodes) { 6238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // Passing multiple chars node. No need to traverse child 6338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int codePoint = dicNode->getNodeTypedCodePoint(); 6438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int baseLowerCaseCodePoint = toBaseLowerCase(codePoint); 6538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const bool isMatch = isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, codePoint); 6638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (isMatch || isIntentionalOmissionCodePoint(baseLowerCaseCodePoint)) { 6738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka childDicNodes->pushPassingChild(dicNode); 6838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 6938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 7038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 7138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ int DicNodeUtils::createAndGetLeavingChildNode(DicNode *dicNode, int pos, 7238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const uint8_t *const dicRoot, const int terminalDepth, const ProximityInfoState *pInfoState, 7338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int pointIndex, const bool exactOnly, const std::vector<int> *const codePointsFilter, 7438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { 7538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int nextPos = pos; 7638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos); 7738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const bool hasMultipleChars = (0 != (BinaryFormat::FLAG_HAS_MULTIPLE_CHARS & flags)); 7838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const bool isTerminal = (0 != (BinaryFormat::FLAG_IS_TERMINAL & flags)); 7938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const bool hasChildren = BinaryFormat::hasChildrenInFlags(flags); 8038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 8138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int codePoint = BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos); 8238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka ASSERT(NOT_A_CODE_POINT != codePoint); 8338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int nodeCodePoint = codePoint; 8438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // TODO: optimize this 8538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int additionalWordBuf[MAX_WORD_LENGTH]; 8638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka uint16_t additionalSubwordLength = 0; 8738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka additionalWordBuf[additionalSubwordLength++] = codePoint; 8838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 8938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka do { 9038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int nextCodePoint = hasMultipleChars 9138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka ? BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos) : NOT_A_CODE_POINT; 9238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const bool isLastChar = (NOT_A_CODE_POINT == nextCodePoint); 9338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (!isLastChar) { 9438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka additionalWordBuf[additionalSubwordLength++] = nextCodePoint; 9538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 9638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka codePoint = nextCodePoint; 9738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } while (NOT_A_CODE_POINT != codePoint); 9838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 9938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int probability = 10038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka isTerminal ? BinaryFormat::readProbabilityWithoutMovingPointer(dicRoot, pos) : -1; 10138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka pos = BinaryFormat::skipProbability(flags, pos); 10238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int childrenPos = hasChildren ? BinaryFormat::readChildrenPosition(dicRoot, flags, pos) : 0; 10338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int attributesPos = BinaryFormat::skipChildrenPosition(flags, pos); 10438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int siblingPos = BinaryFormat::skipChildrenPosAndAttributes(dicRoot, flags, pos); 10538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 10638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (isDicNodeFilteredOut(nodeCodePoint, pInfo, codePointsFilter)) { 10738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return siblingPos; 10838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 10938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (!isMatchedNodeCodePoint(pInfoState, pointIndex, exactOnly, nodeCodePoint)) { 11038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return siblingPos; 11138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 11238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int childrenCount = hasChildren 11338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka ? BinaryFormat::getGroupCountAndForwardPointer(dicRoot, &childrenPos) : 0; 11438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka childDicNodes->pushLeavingChild(dicNode, nextPos, flags, childrenPos, attributesPos, siblingPos, 11538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka nodeCodePoint, childrenCount, probability, -1 /* bigramProbability */, isTerminal, 11638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka hasMultipleChars, hasChildren, additionalSubwordLength, additionalWordBuf); 11738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return siblingPos; 11838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 11938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 12038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ bool DicNodeUtils::isDicNodeFilteredOut(const int nodeCodePoint, 12138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const ProximityInfo *const pInfo, const std::vector<int> *const codePointsFilter) { 12238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int filterSize = codePointsFilter ? codePointsFilter->size() : 0; 12338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (filterSize <= 0) { 12438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return false; 12538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 12638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (pInfo && (pInfo->getKeyIndexOf(nodeCodePoint) == NOT_AN_INDEX 12738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka || isIntentionalOmissionCodePoint(nodeCodePoint))) { 12838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // If normalized nodeCodePoint is not on the keyboard or skippable, this child is never 12938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // filtered. 13038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return false; 13138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 13238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int lowerCodePoint = toLowerCase(nodeCodePoint); 13338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int baseLowerCodePoint = toBaseCodePoint(lowerCodePoint); 13438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // TODO: Avoid linear search 13538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka for (int i = 0; i < filterSize; ++i) { 13638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // Checking if a normalized code point is in filter characters when pInfo is not 13738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // null. When pInfo is null, nodeCodePoint is used to check filtering without 13838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // normalizing. 13938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if ((pInfo && ((*codePointsFilter)[i] == lowerCodePoint 14038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka || (*codePointsFilter)[i] == baseLowerCodePoint)) 14138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka || (!pInfo && (*codePointsFilter)[i] == nodeCodePoint)) { 14238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return false; 14338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 14438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 14538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return true; 14638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 14738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 14838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ void DicNodeUtils::createAndGetAllLeavingChildNodes(DicNode *dicNode, 14938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex, 15038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const bool exactOnly, const std::vector<int> *const codePointsFilter, 15138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const ProximityInfo *const pInfo, DicNodeVector *childDicNodes) { 15238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int terminalDepth = dicNode->getLeavingDepth(); 15338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int childCount = dicNode->getChildrenCount(); 15438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int nextPos = dicNode->getChildrenPos(); 15538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka for (int i = 0; i < childCount; i++) { 15638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int filterSize = codePointsFilter ? codePointsFilter->size() : 0; 15738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka nextPos = createAndGetLeavingChildNode(dicNode, nextPos, dicRoot, terminalDepth, pInfoState, 15838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka pointIndex, exactOnly, codePointsFilter, pInfo, childDicNodes); 15938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (!pInfo && filterSize > 0 && childDicNodes->exceeds(filterSize)) { 16038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // All code points have been found. 16138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka break; 16238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 16338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 16438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 16538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 16638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, const uint8_t *const dicRoot, 16738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka DicNodeVector *childDicNodes) { 16838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka getProximityChildDicNodes(dicNode, dicRoot, 0, 0, false, childDicNodes); 16938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 17038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 17138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode, 17238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const uint8_t *const dicRoot, const ProximityInfoState *pInfoState, const int pointIndex, 17338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka bool exactOnly, DicNodeVector *childDicNodes) { 17438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (dicNode->isTotalInputSizeExceedingLimit()) { 17538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return; 17638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 17738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (!dicNode->isLeavingNode()) { 17838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka DicNodeUtils::createAndGetPassingChildNode(dicNode, pInfoState, pointIndex, exactOnly, 17938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka childDicNodes); 18038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } else { 18138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka DicNodeUtils::createAndGetAllLeavingChildNodes(dicNode, dicRoot, pInfoState, pointIndex, 18238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka exactOnly, 0 /* codePointsFilter */, 0 /* pInfo */, 18338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka childDicNodes); 18438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 18538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 18638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 18738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/////////////////// 18838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka// Scoring utils // 18938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/////////////////// 19038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/** 19138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka * Computes the combined bigram / unigram cost for the given dicNode. 19238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka */ 19338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ float DicNodeUtils::getBigramNodeImprobability(const uint8_t *const dicRoot, 19438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const DicNode *const node, hash_map_compat<int, int16_t> *bigramCacheMap) { 19538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (node->isImpossibleBigramWord()) { 19638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); 19738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 19838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int probability = getBigramNodeProbability(dicRoot, node, bigramCacheMap); 19938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. 20038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const float cost = static_cast<float>(MAX_PROBABILITY - probability) 20138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka / static_cast<float>(MAX_PROBABILITY); 20238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return cost; 20338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 20438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 20538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ int DicNodeUtils::getBigramNodeProbability(const uint8_t *const dicRoot, 20638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const DicNode *const node, hash_map_compat<int, int16_t> *bigramCacheMap) { 20738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int unigramProbability = node->getProbability(); 20838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int encodedDiffOfBigramProbability = 20938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka getBigramNodeEncodedDiffProbability(dicRoot, node, bigramCacheMap); 21038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (NOT_A_PROBABILITY == encodedDiffOfBigramProbability) { 21138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return backoff(unigramProbability); 21238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 21338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return BinaryFormat::computeProbabilityForBigram( 21438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka unigramProbability, encodedDiffOfBigramProbability); 21538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 21638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 21738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/////////////////////////////////////// 21838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka// Bigram / Unigram dictionary utils // 21938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/////////////////////////////////////// 22038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 22138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ int16_t DicNodeUtils::getBigramNodeEncodedDiffProbability(const uint8_t *const dicRoot, 22238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const DicNode *const node, hash_map_compat<int, int16_t> *bigramCacheMap) { 22338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int wordPos = node->getPos(); 22438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int prevWordPos = node->getPrevWordPos(); 22538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return getBigramProbability(dicRoot, prevWordPos, wordPos, bigramCacheMap); 22638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 22738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 22838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka// TODO: Move this to BigramDictionary 22938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ int16_t DicNodeUtils::getBigramProbability(const uint8_t *const dicRoot, int pos, 23038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int nextPos, hash_map_compat<int, int16_t> *bigramCacheMap) { 23138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // TODO: this is painfully slow compared to the method used in the previous version of the 23238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // algorithm. Switch to that method. 23338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (NOT_VALID_WORD == pos) return NOT_A_PROBABILITY; 23438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (NOT_VALID_WORD == nextPos) return NOT_A_PROBABILITY; 23538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 23638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // Create a hash code for the given node pair (based on Josh Bloch's effective Java). 23738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // TODO: Use a real hash map data structure that deals with collisions. 23838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int hash = 17; 23938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka hash = hash * 31 + pos; 24038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka hash = hash * 31 + nextPos; 24138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 24238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka hash_map_compat<int, int16_t>::const_iterator mapPos = bigramCacheMap->find(hash); 24338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (mapPos != bigramCacheMap->end()) { 24438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return mapPos->second; 24538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 24638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (NOT_VALID_WORD == pos) { 24738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return NOT_A_PROBABILITY; 24838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 24938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos); 25038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (0 == (flags & BinaryFormat::FLAG_HAS_BIGRAMS)) { 25138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return NOT_A_PROBABILITY; 25238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 25338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (0 == (flags & BinaryFormat::FLAG_HAS_MULTIPLE_CHARS)) { 25438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka BinaryFormat::getCodePointAndForwardPointer(dicRoot, &pos); 25538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } else { 25638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka pos = BinaryFormat::skipOtherCharacters(dicRoot, pos); 25738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 25838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka pos = BinaryFormat::skipChildrenPosition(flags, pos); 25938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka pos = BinaryFormat::skipProbability(flags, pos); 26038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka uint8_t bigramFlags; 26138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int count = 0; 26238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka do { 26338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka bigramFlags = BinaryFormat::getFlagsAndForwardPointer(dicRoot, &pos); 26438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(dicRoot, 26538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka bigramFlags, &pos); 26638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (bigramPos == nextPos) { 26738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int16_t probability = BinaryFormat::MASK_ATTRIBUTE_PROBABILITY & bigramFlags; 26838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (static_cast<int>(bigramCacheMap->size()) < MAX_BIGRAM_MAP_SIZE) { 26938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka (*bigramCacheMap)[hash] = probability; 27038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 27138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return probability; 27238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 27338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka count++; 27438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } while ((0 != (BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT & bigramFlags)) 27538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka && count < MAX_BIGRAMS_CONSIDERED_PER_CONTEXT); 27638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (static_cast<int>(bigramCacheMap->size()) < MAX_BIGRAM_MAP_SIZE) { 27738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka // TODO: does this -1 mean NOT_VALID_WORD? 27838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka (*bigramCacheMap)[hash] = -1; 27938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 28038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return NOT_A_PROBABILITY; 28138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 28238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 28338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ int DicNodeUtils::getWordPos(const uint8_t *const dicRoot, const int *word, 28438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int wordLength) { 28538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (!word) { 28638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return NOT_VALID_WORD; 28738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 28838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return BinaryFormat::getTerminalPosition( 28938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka dicRoot, word, wordLength, false /* forceLowerCaseSearch */); 29038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 29138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 29238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ bool DicNodeUtils::isMatchedNodeCodePoint(const ProximityInfoState *pInfoState, 29338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int pointIndex, const bool exactOnly, const int nodeCodePoint) { 29438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (!pInfoState) { 29538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return true; 29638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 29738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (exactOnly) { 29838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return pInfoState->getPrimaryCodePointAt(pointIndex) == nodeCodePoint; 29938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 30038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const ProximityType matchedId = pInfoState->getProximityType(pointIndex, nodeCodePoint, 30138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka true /* checkProximityChars */); 30238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return isProximityChar(matchedId); 30338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 30438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 30538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka//////////////// 30638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka// Char utils // 30738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka//////////////// 30838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka 30938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka// TODO: Move to char_utils? 31038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka/* static */ int DicNodeUtils::appendTwoWords(const int *const src0, const int16_t length0, 31138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka const int *const src1, const int16_t length1, int *dest) { 31238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int actualLength0 = 0; 31338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka for (int i = 0; i < length0; ++i) { 31438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (src0[i] == 0) { 31538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka break; 31638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 31738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka actualLength0 = i + 1; 31838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 31938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka actualLength0 = min(actualLength0, MAX_WORD_LENGTH); 32038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka memcpy(dest, src0, actualLength0 * sizeof(dest[0])); 32138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (!src1 || length1 == 0) { 32238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return actualLength0; 32338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 32438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka int actualLength1 = 0; 32538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka for (int i = 0; i < length1; ++i) { 32638c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka if (src1[i] == 0) { 32738c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka break; 32838c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 32938c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka actualLength1 = i + 1; 33038c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka } 33138c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka actualLength1 = min(actualLength1, MAX_WORD_LENGTH - actualLength0 - 1); 33238c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka memcpy(&dest[actualLength0], src1, actualLength1 * sizeof(dest[0])); 33338c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka return actualLength0 + actualLength1; 33438c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} 33538c26dd0bf8cd5c4511e4a02d5eeae4b3553f03aSatoshi Kataoka} // namespace latinime 336