ver4_patricia_trie_node_writer.cpp revision 851e0458fe460526b1f953e39a1e406a21ab4647
1/* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" 18 19#include "suggest/core/dictionary/property/unigram_property.h" 20#include "suggest/policyimpl/dictionary/header/header_policy.h" 21#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" 22#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" 23#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" 24#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" 25#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" 26#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" 27#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" 28#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" 29#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" 30#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" 31 32namespace latinime { 33 34const int Ver4PatriciaTrieNodeWriter::CHILDREN_POSITION_FIELD_SIZE = 3; 35 36bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted( 37 const PtNodeParams *const toBeUpdatedPtNodeParams) { 38 int pos = toBeUpdatedPtNodeParams->getHeadPos(); 39 const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); 40 const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); 41 if (usesAdditionalBuffer) { 42 pos -= mTrieBuffer->getOriginalBufferSize(); 43 } 44 // Read original flags 45 const PatriciaTrieReadingUtils::NodeFlags originalFlags = 46 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); 47 const PatriciaTrieReadingUtils::NodeFlags updatedFlags = 48 DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, 49 true /* isDeleted */, false /* willBecomeNonTerminal */); 50 int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); 51 // Update flags. 52 if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, 53 &writingPos)) { 54 return false; 55 } 56 if (toBeUpdatedPtNodeParams->isTerminal()) { 57 // The PtNode is a terminal. Delete entry from the terminal position lookup table. 58 return mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( 59 toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */); 60 } else { 61 return true; 62 } 63} 64 65bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( 66 const PtNodeParams *const toBeUpdatedPtNodeParams, 67 const int movedPos, const int bigramLinkedNodePos) { 68 int pos = toBeUpdatedPtNodeParams->getHeadPos(); 69 const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); 70 const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); 71 if (usesAdditionalBuffer) { 72 pos -= mTrieBuffer->getOriginalBufferSize(); 73 } 74 // Read original flags 75 const PatriciaTrieReadingUtils::NodeFlags originalFlags = 76 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); 77 const PatriciaTrieReadingUtils::NodeFlags updatedFlags = 78 DynamicPtReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, 79 false /* isDeleted */, false /* willBecomeNonTerminal */); 80 int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); 81 // Update flags. 82 if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, 83 &writingPos)) { 84 return false; 85 } 86 // Update moved position, which is stored in the parent offset field. 87 if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( 88 mTrieBuffer, movedPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { 89 return false; 90 } 91 if (toBeUpdatedPtNodeParams->hasChildren()) { 92 // Update children's parent position. 93 mReadingHelper.initWithPtNodeArrayPos(toBeUpdatedPtNodeParams->getChildrenPos()); 94 while (!mReadingHelper.isEnd()) { 95 const PtNodeParams childPtNodeParams(mReadingHelper.getPtNodeParams()); 96 int parentOffsetFieldPos = childPtNodeParams.getHeadPos() 97 + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; 98 if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition( 99 mTrieBuffer, bigramLinkedNodePos, childPtNodeParams.getHeadPos(), 100 &parentOffsetFieldPos)) { 101 // Parent offset cannot be written because of a bug or a broken dictionary; thus, 102 // we give up to update dictionary. 103 return false; 104 } 105 mReadingHelper.readNextSiblingNode(childPtNodeParams); 106 } 107 } 108 return true; 109} 110 111bool Ver4PatriciaTrieNodeWriter::markPtNodeAsWillBecomeNonTerminal( 112 const PtNodeParams *const toBeUpdatedPtNodeParams) { 113 int pos = toBeUpdatedPtNodeParams->getHeadPos(); 114 const bool usesAdditionalBuffer = mTrieBuffer->isInAdditionalBuffer(pos); 115 const uint8_t *const dictBuf = mTrieBuffer->getBuffer(usesAdditionalBuffer); 116 if (usesAdditionalBuffer) { 117 pos -= mTrieBuffer->getOriginalBufferSize(); 118 } 119 // Read original flags 120 const PatriciaTrieReadingUtils::NodeFlags originalFlags = 121 PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); 122 const PatriciaTrieReadingUtils::NodeFlags updatedFlags = 123 DynamicPtReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, 124 false /* isDeleted */, true /* willBecomeNonTerminal */); 125 if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( 126 toBeUpdatedPtNodeParams->getTerminalId(), NOT_A_DICT_POS /* ptNodePos */)) { 127 AKLOGE("Cannot update terminal position lookup table. terminal id: %d", 128 toBeUpdatedPtNodeParams->getTerminalId()); 129 return false; 130 } 131 // Update flags. 132 int writingPos = toBeUpdatedPtNodeParams->getHeadPos(); 133 return DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, updatedFlags, 134 &writingPos); 135} 136 137bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty( 138 const PtNodeParams *const toBeUpdatedPtNodeParams, 139 const UnigramProperty *const unigramProperty) { 140 // Update probability and historical information. 141 // TODO: Update other information in the unigram property. 142 if (!toBeUpdatedPtNodeParams->isTerminal()) { 143 return false; 144 } 145 const ProbabilityEntry originalProbabilityEntry = 146 mBuffers->getLanguageModelDictContent()->getProbabilityEntry( 147 toBeUpdatedPtNodeParams->getTerminalId()); 148 const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry, 149 unigramProperty); 150 return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( 151 toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry); 152} 153 154bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( 155 const PtNodeParams *const toBeUpdatedPtNodeParams, bool *const outNeedsToKeepPtNode) { 156 if (!toBeUpdatedPtNodeParams->isTerminal()) { 157 AKLOGE("updatePtNodeProbabilityAndGetNeedsToSaveForGC is called for non-terminal PtNode."); 158 return false; 159 } 160 const ProbabilityEntry originalProbabilityEntry = 161 mBuffers->getLanguageModelDictContent()->getProbabilityEntry( 162 toBeUpdatedPtNodeParams->getTerminalId()); 163 if (originalProbabilityEntry.hasHistoricalInfo()) { 164 const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( 165 originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy); 166 const ProbabilityEntry probabilityEntry = 167 originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo); 168 if (!mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( 169 toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) { 170 AKLOGE("Cannot write updated probability entry. terminalId: %d", 171 toBeUpdatedPtNodeParams->getTerminalId()); 172 return false; 173 } 174 const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy); 175 if (!isValid) { 176 if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { 177 AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); 178 return false; 179 } 180 } 181 *outNeedsToKeepPtNode = isValid; 182 } else { 183 // No need to update probability. 184 *outNeedsToKeepPtNode = true; 185 } 186 return true; 187} 188 189bool Ver4PatriciaTrieNodeWriter::updateChildrenPosition( 190 const PtNodeParams *const toBeUpdatedPtNodeParams, const int newChildrenPosition) { 191 int childrenPosFieldPos = toBeUpdatedPtNodeParams->getChildrenPosFieldPos(); 192 return DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, 193 newChildrenPosition, &childrenPosFieldPos); 194} 195 196bool Ver4PatriciaTrieNodeWriter::updateTerminalId(const PtNodeParams *const toBeUpdatedPtNodeParams, 197 const int newTerminalId) { 198 return mTrieBuffer->writeUint(newTerminalId, Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, 199 toBeUpdatedPtNodeParams->getTerminalIdFieldPos()); 200} 201 202bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( 203 const PtNodeParams *const ptNodeParams, int *const ptNodeWritingPos) { 204 return writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, 0 /* outTerminalId */, 205 ptNodeWritingPos); 206} 207 208 209bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( 210 const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty, 211 int *const ptNodeWritingPos) { 212 int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; 213 if (!writePtNodeAndGetTerminalIdAndAdvancePosition(ptNodeParams, &terminalId, 214 ptNodeWritingPos)) { 215 return false; 216 } 217 // Write probability. 218 ProbabilityEntry newProbabilityEntry; 219 const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom( 220 &newProbabilityEntry, unigramProperty); 221 return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( 222 terminalId, &probabilityEntryToWrite); 223} 224 225bool Ver4PatriciaTrieNodeWriter::addNewBigramEntry( 226 const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam, 227 const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) { 228 if (!mBigramPolicy->addNewEntry(sourcePtNodeParams->getTerminalId(), 229 targetPtNodeParam->getTerminalId(), bigramProperty, outAddedNewBigram)) { 230 AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d", 231 sourcePtNodeParams->getTerminalId(), targetPtNodeParam->getTerminalId()); 232 return false; 233 } 234 return true; 235} 236 237bool Ver4PatriciaTrieNodeWriter::removeBigramEntry( 238 const PtNodeParams *const sourcePtNodeParams, const PtNodeParams *const targetPtNodeParam) { 239 return mBigramPolicy->removeEntry(sourcePtNodeParams->getTerminalId(), 240 targetPtNodeParam->getTerminalId()); 241} 242 243bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries( 244 const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) { 245 return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries( 246 sourcePtNodeParams->getTerminalId(), outBigramEntryCount); 247} 248 249bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( 250 const PtNodeParams *const toBeUpdatedPtNodeParams, 251 const DictPositionRelocationMap *const dictPositionRelocationMap, 252 int *const outBigramEntryCount) { 253 int parentPos = toBeUpdatedPtNodeParams->getParentPos(); 254 if (parentPos != NOT_A_DICT_POS) { 255 PtNodeWriter::PtNodePositionRelocationMap::const_iterator it = 256 dictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); 257 if (it != dictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { 258 parentPos = it->second; 259 } 260 } 261 int writingPos = toBeUpdatedPtNodeParams->getHeadPos() 262 + DynamicPtWritingUtils::NODE_FLAG_FIELD_SIZE; 263 // Write updated parent offset. 264 if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, 265 parentPos, toBeUpdatedPtNodeParams->getHeadPos(), &writingPos)) { 266 return false; 267 } 268 269 // Updates children position. 270 int childrenPos = toBeUpdatedPtNodeParams->getChildrenPos(); 271 if (childrenPos != NOT_A_DICT_POS) { 272 PtNodeWriter::PtNodeArrayPositionRelocationMap::const_iterator it = 273 dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); 274 if (it != dictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { 275 childrenPos = it->second; 276 } 277 } 278 if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) { 279 return false; 280 } 281 282 // Counts bigram entries. 283 if (outBigramEntryCount) { 284 *outBigramEntryCount = mBigramPolicy->getBigramEntryConut( 285 toBeUpdatedPtNodeParams->getTerminalId()); 286 } 287 return true; 288} 289 290bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptNodeParams, 291 const int *const targetCodePoints, const int targetCodePointCount, 292 const int shortcutProbability) { 293 if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(), 294 targetCodePoints, targetCodePointCount, shortcutProbability)) { 295 AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId()); 296 return false; 297 } 298 return true; 299} 300 301bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( 302 const PtNodeParams *const ptNodeParams, int *const outTerminalId, 303 int *const ptNodeWritingPos) { 304 const int nodePos = *ptNodeWritingPos; 305 // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the 306 // PtNode writing. 307 if (!DynamicPtWritingUtils::writeFlagsAndAdvancePosition(mTrieBuffer, 308 0 /* nodeFlags */, ptNodeWritingPos)) { 309 return false; 310 } 311 // Calculate a parent offset and write the offset. 312 if (!DynamicPtWritingUtils::writeParentPosOffsetAndAdvancePosition(mTrieBuffer, 313 ptNodeParams->getParentPos(), nodePos, ptNodeWritingPos)) { 314 return false; 315 } 316 // Write code points 317 if (!DynamicPtWritingUtils::writeCodePointsAndAdvancePosition(mTrieBuffer, 318 ptNodeParams->getCodePoints(), ptNodeParams->getCodePointCount(), ptNodeWritingPos)) { 319 return false; 320 } 321 int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; 322 if (!ptNodeParams->willBecomeNonTerminal()) { 323 if (ptNodeParams->getTerminalId() != Ver4DictConstants::NOT_A_TERMINAL_ID) { 324 terminalId = ptNodeParams->getTerminalId(); 325 } else if (ptNodeParams->isTerminal()) { 326 // Write terminal information using a new terminal id. 327 // Get a new unused terminal id. 328 terminalId = mBuffers->getTerminalPositionLookupTable()->getNextTerminalId(); 329 } 330 } 331 const int isTerminal = terminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; 332 if (isTerminal) { 333 // Update the lookup table. 334 if (!mBuffers->getMutableTerminalPositionLookupTable()->setTerminalPtNodePosition( 335 terminalId, nodePos)) { 336 return false; 337 } 338 // Write terminal Id. 339 if (!mTrieBuffer->writeUintAndAdvancePosition(terminalId, 340 Ver4DictConstants::TERMINAL_ID_FIELD_SIZE, ptNodeWritingPos)) { 341 return false; 342 } 343 if (outTerminalId) { 344 *outTerminalId = terminalId; 345 } 346 } 347 // Write children position 348 if (!DynamicPtWritingUtils::writeChildrenPositionAndAdvancePosition(mTrieBuffer, 349 ptNodeParams->getChildrenPos(), ptNodeWritingPos)) { 350 return false; 351 } 352 return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), 353 isTerminal, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); 354} 355 356const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( 357 const ProbabilityEntry *const originalProbabilityEntry, 358 const UnigramProperty *const unigramProperty) const { 359 // TODO: Consolidate historical info and probability. 360 if (mHeaderPolicy->hasHistoricalInfoOfWords()) { 361 const HistoricalInfo historicalInfoForUpdate(unigramProperty->getTimestamp(), 362 unigramProperty->getLevel(), unigramProperty->getCount()); 363 const HistoricalInfo updatedHistoricalInfo = 364 ForgettingCurveUtils::createUpdatedHistoricalInfo( 365 originalProbabilityEntry->getHistoricalInfo(), 366 unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy); 367 return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo( 368 &updatedHistoricalInfo); 369 } else { 370 return originalProbabilityEntry->createEntryWithUpdatedProbability( 371 unigramProperty->getProbability()); 372 } 373} 374 375bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, 376 const bool isBlacklisted, const bool isNotAWord, const bool isTerminal, 377 const bool hasMultipleChars) { 378 // Create node flags and write them. 379 PatriciaTrieReadingUtils::NodeFlags nodeFlags = 380 PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal, 381 false /* hasShortcutTargets */, false /* hasBigrams */, hasMultipleChars, 382 CHILDREN_POSITION_FIELD_SIZE); 383 if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) { 384 AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos); 385 return false; 386 } 387 return true; 388} 389 390} // namespace latinime 391