dynamic_pt_updating_helper.cpp revision 9069d30043d5182dfd38465ad9bbc11ad73fab7c
1/* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" 18 19#include "suggest/core/dictionary/property/unigram_property.h" 20#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" 21#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" 22#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" 23#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" 24#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" 25#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" 26 27namespace latinime { 28 29const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; 30 31bool DynamicPtUpdatingHelper::addUnigramWord( 32 DynamicPtReadingHelper *const readingHelper, 33 const int *const wordCodePoints, const int codePointCount, 34 const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { 35 int parentPos = NOT_A_DICT_POS; 36 while (!readingHelper->isEnd()) { 37 const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); 38 if (!ptNodeParams.isValid()) { 39 break; 40 } 41 const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); 42 if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */, 43 wordCodePoints[matchedCodePointCount])) { 44 // The first code point is different from target code point. Skip this node and read 45 // the next sibling node. 46 readingHelper->readNextSiblingNode(ptNodeParams); 47 continue; 48 } 49 // Check following merged node code points. 50 const int nodeCodePointCount = ptNodeParams.getCodePointCount(); 51 for (int j = 1; j < nodeCodePointCount; ++j) { 52 const int nextIndex = matchedCodePointCount + j; 53 if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j, 54 wordCodePoints[matchedCodePointCount + j])) { 55 *outAddedNewUnigram = true; 56 return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty, 57 wordCodePoints + matchedCodePointCount, 58 codePointCount - matchedCodePointCount); 59 } 60 } 61 // All characters are matched. 62 if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) { 63 return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram); 64 } 65 if (!ptNodeParams.hasChildren()) { 66 *outAddedNewUnigram = true; 67 return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty, 68 wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams), 69 codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams)); 70 } 71 // Advance to the children nodes. 72 parentPos = ptNodeParams.getHeadPos(); 73 readingHelper->readChildNode(ptNodeParams); 74 } 75 if (readingHelper->isError()) { 76 // The dictionary is invalid. 77 return false; 78 } 79 int pos = readingHelper->getPosOfLastForwardLinkField(); 80 *outAddedNewUnigram = true; 81 return createAndInsertNodeIntoPtNodeArray(parentPos, 82 wordCodePoints + readingHelper->getPrevTotalCodePointCount(), 83 codePointCount - readingHelper->getPrevTotalCodePointCount(), 84 unigramProperty, &pos); 85} 86 87bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, 88 const int wordPos, const BigramProperty *const bigramProperty, 89 bool *const outAddedNewEntry) { 90 if (prevWordsPtNodePos.empty()) { 91 return false; 92 } 93 ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); 94 int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 95 for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { 96 prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( 97 prevWordsPtNodePos[i]).getTerminalId(); 98 } 99 const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); 100 const int wordId = 101 mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); 102 return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, bigramProperty, outAddedNewEntry); 103} 104 105bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, 106 const int wordPos) { 107 if (prevWordsPtNodePos.empty()) { 108 return false; 109 } 110 ASSERT(prevWordsPtNodePos.size() <= MAX_PREV_WORD_COUNT_FOR_N_GRAM); 111 int prevWordTerminalIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 112 for (size_t i = 0; i < prevWordsPtNodePos.size(); ++i) { 113 prevWordTerminalIds[i] = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos( 114 prevWordsPtNodePos[i]).getTerminalId(); 115 } 116 const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); 117 const int wordId = 118 mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); 119 return mPtNodeWriter->removeNgramEntry(prevWordIds, wordId); 120} 121 122bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, 123 const int *const targetCodePoints, const int targetCodePointCount, 124 const int shortcutProbability) { 125 const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos)); 126 return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount, 127 shortcutProbability); 128} 129 130bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, 131 const int *const nodeCodePoints, const int nodeCodePointCount, 132 const UnigramProperty *const unigramProperty, int *const forwardLinkFieldPos) { 133 const int newPtNodeArrayPos = mBuffer->getTailPosition(); 134 if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, 135 newPtNodeArrayPos, forwardLinkFieldPos)) { 136 return false; 137 } 138 return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount, 139 unigramProperty); 140} 141 142bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, 143 const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { 144 if (originalPtNodeParams->isTerminal() && !originalPtNodeParams->isDeleted()) { 145 // Overwrites the probability. 146 *outAddedNewUnigram = false; 147 return mPtNodeWriter->updatePtNodeUnigramProperty(originalPtNodeParams, unigramProperty); 148 } else { 149 // Make the node terminal and write the probability. 150 *outAddedNewUnigram = true; 151 const int movedPos = mBuffer->getTailPosition(); 152 int writingPos = movedPos; 153 const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, 154 unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), 155 true /* isTerminal */, originalPtNodeParams->getParentPos(), 156 originalPtNodeParams->getCodePointCount(), originalPtNodeParams->getCodePoints(), 157 unigramProperty->getProbability())); 158 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, 159 unigramProperty, &writingPos)) { 160 return false; 161 } 162 if (!mPtNodeWriter->markPtNodeAsMoved(originalPtNodeParams, movedPos, movedPos)) { 163 return false; 164 } 165 } 166 return true; 167} 168 169bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( 170 const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty, 171 const int *const codePoints, const int codePointCount) { 172 const int newPtNodeArrayPos = mBuffer->getTailPosition(); 173 if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { 174 return false; 175 } 176 return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, 177 codePointCount, unigramProperty); 178} 179 180bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( 181 const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount, 182 const UnigramProperty *const unigramProperty) { 183 int writingPos = mBuffer->getTailPosition(); 184 if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, 185 1 /* arraySize */, &writingPos)) { 186 return false; 187 } 188 const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( 189 unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), true /* isTerminal */, 190 parentPtNodePos, nodeCodePointCount, nodeCodePoints, 191 unigramProperty->getProbability())); 192 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, 193 unigramProperty, &writingPos)) { 194 return false; 195 } 196 if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, 197 NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { 198 return false; 199 } 200 return true; 201} 202 203// Returns whether the dictionary updating was succeeded or not. 204bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( 205 const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, 206 const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints, 207 const int newNodeCodePointCount) { 208 // When addsExtraChild is true, split the reallocating PtNode and add new child. 209 // Reallocating PtNode: abcde, newNode: abcxy. 210 // abc (1st, not terminal) __ de (2nd) 211 // \_ xy (extra child, terminal) 212 // Otherwise, this method makes 1st part terminal and write information in unigramProperty. 213 // Reallocating PtNode: abcde, newNode: abc. 214 // abc (1st, terminal) __ de (2nd) 215 const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; 216 const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); 217 int writingPos = firstPartOfReallocatedPtNodePos; 218 // Write the 1st part of the reallocating node. The children position will be updated later 219 // with actual children position. 220 if (addsExtraChild) { 221 const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( 222 false /* isNotAWord */, false /* isBlacklisted */, false /* isTerminal */, 223 reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount, 224 reallocatingPtNodeParams->getCodePoints(), NOT_A_PROBABILITY)); 225 if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { 226 return false; 227 } 228 } else { 229 const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( 230 unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), 231 true /* isTerminal */, reallocatingPtNodeParams->getParentPos(), 232 overlappingCodePointCount, reallocatingPtNodeParams->getCodePoints(), 233 unigramProperty->getProbability())); 234 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, 235 unigramProperty, &writingPos)) { 236 return false; 237 } 238 } 239 const int actualChildrenPos = writingPos; 240 // Create new children PtNode array. 241 const size_t newPtNodeCount = addsExtraChild ? 2 : 1; 242 if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, 243 newPtNodeCount, &writingPos)) { 244 return false; 245 } 246 // Write the 2nd part of the reallocating node. 247 const int secondPartOfReallocatedPtNodePos = writingPos; 248 const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams, 249 reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isBlacklisted(), 250 reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos, 251 reallocatingPtNodeParams->getCodePointCount() - overlappingCodePointCount, 252 reallocatingPtNodeParams->getCodePoints() + overlappingCodePointCount, 253 reallocatingPtNodeParams->getProbability())); 254 if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) { 255 return false; 256 } 257 if (addsExtraChild) { 258 const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( 259 unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), 260 true /* isTerminal */, firstPartOfReallocatedPtNodePos, 261 newNodeCodePointCount - overlappingCodePointCount, 262 newNodeCodePoints + overlappingCodePointCount, unigramProperty->getProbability())); 263 if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams, 264 unigramProperty, &writingPos)) { 265 return false; 266 } 267 } 268 if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, 269 NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { 270 return false; 271 } 272 // Update original reallocating PtNode as moved. 273 if (!mPtNodeWriter->markPtNodeAsMoved(reallocatingPtNodeParams, firstPartOfReallocatedPtNodePos, 274 secondPartOfReallocatedPtNodePos)) { 275 return false; 276 } 277 // Load node info. Information of the 1st part will be fetched. 278 const PtNodeParams ptNodeParams( 279 mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos)); 280 // Update children position. 281 return mPtNodeWriter->updateChildrenPosition(&ptNodeParams, actualChildrenPos); 282} 283 284const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( 285 const PtNodeParams *const originalPtNodeParams, 286 const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, const int parentPos, 287 const int codePointCount, const int *const codePoints, const int probability) const { 288 const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( 289 isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */, 290 false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, 291 CHILDREN_POSITION_FIELD_SIZE); 292 return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints, 293 probability); 294} 295 296const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode( 297 const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, 298 const int parentPos, const int codePointCount, const int *const codePoints, 299 const int probability) const { 300 const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( 301 isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */, 302 false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, 303 CHILDREN_POSITION_FIELD_SIZE); 304 return PtNodeParams(flags, parentPos, codePointCount, codePoints, probability); 305} 306 307} // namespace latinime 308