1/* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17/* 18 * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! 19 * Do not edit this file other than updating policy's interface. 20 * 21 * This file was generated from 22 * dictionary/structure/v4/ver4_patricia_trie_policy.cpp 23 */ 24 25#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" 26 27#include <vector> 28 29#include "suggest/core/dicnode/dic_node.h" 30#include "suggest/core/dicnode/dic_node_vector.h" 31#include "dictionary/interface/ngram_listener.h" 32#include "dictionary/property/ngram_context.h" 33#include "dictionary/property/ngram_property.h" 34#include "dictionary/property/unigram_property.h" 35#include "dictionary/property/word_property.h" 36#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" 37#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" 38#include "dictionary/utils/forgetting_curve_utils.h" 39#include "dictionary/utils/multi_bigram_map.h" 40#include "dictionary/utils/probability_utils.h" 41 42namespace latinime { 43namespace backward { 44namespace v402 { 45 46// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and 47// BinaryDictionaryDecayingTests. 48const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; 49const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; 50const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; 51const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; 52const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; 53const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = 54 Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; 55const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1; 56 57void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, 58 DicNodeVector *const childDicNodes) const { 59 if (!dicNode->hasChildren()) { 60 return; 61 } 62 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 63 readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); 64 while (!readingHelper.isEnd()) { 65 const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); 66 if (!ptNodeParams.isValid()) { 67 break; 68 } 69 bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); 70 if (isTerminal && mHeaderPolicy->isDecayingDict()) { 71 // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose 72 // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a 73 // valid terminal DicNode. 74 isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; 75 } 76 readingHelper.readNextSiblingNode(ptNodeParams); 77 if (ptNodeParams.representsNonWordInfo()) { 78 // Skip PtNodes that represent non-word information. 79 continue; 80 } 81 const int wordId = isTerminal ? ptNodeParams.getHeadPos() : NOT_A_WORD_ID; 82 childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), 83 wordId, ptNodeParams.getCodePointArrayView()); 84 } 85 if (readingHelper.isError()) { 86 mIsCorrupted = true; 87 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 88 } 89} 90 91int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, 92 const int maxCodePointCount, int *const outCodePoints) const { 93 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 94 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); 95 readingHelper.initWithPtNodePos(ptNodePos); 96 const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount( 97 maxCodePointCount, outCodePoints); 98 if (readingHelper.isError()) { 99 mIsCorrupted = true; 100 AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); 101 } 102 return codePointCount; 103} 104 105int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, 106 const bool forceLowerCaseSearch) const { 107 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 108 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 109 const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), 110 wordCodePoints.size(), forceLowerCaseSearch); 111 if (readingHelper.isError()) { 112 mIsCorrupted = true; 113 AKLOGE("Dictionary reading error in getWordId()."); 114 } 115 return getWordIdFromTerminalPtNodePos(ptNodePos); 116} 117 118const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( 119 const WordIdArrayView prevWordIds, const int wordId, 120 MultiBigramMap *const multiBigramMap) const { 121 if (wordId == NOT_A_WORD_ID) { 122 return WordAttributes(); 123 } 124 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); 125 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 126 if (multiBigramMap) { 127 const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */, 128 prevWordIds, wordId, ptNodeParams.getProbability()); 129 return getWordAttributes(probability, ptNodeParams); 130 } 131 if (!prevWordIds.empty()) { 132 const int probability = getProbabilityOfWord(prevWordIds, wordId); 133 if (probability != NOT_A_PROBABILITY) { 134 return getWordAttributes(probability, ptNodeParams); 135 } 136 } 137 return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), 138 ptNodeParams); 139} 140 141const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability, 142 const PtNodeParams &ptNodeParams) const { 143 return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), 144 ptNodeParams.getProbability() == 0); 145} 146 147int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, 148 const int bigramProbability) const { 149 // In the v4 format, bigramProbability is a conditional probability. 150 const int bigramConditionalProbability = bigramProbability; 151 if (unigramProbability == NOT_A_PROBABILITY) { 152 return NOT_A_PROBABILITY; 153 } 154 if (bigramConditionalProbability == NOT_A_PROBABILITY) { 155 return ProbabilityUtils::backoff(unigramProbability); 156 } 157 return bigramConditionalProbability; 158} 159 160int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, 161 const int wordId) const { 162 if (wordId == NOT_A_WORD_ID) { 163 return NOT_A_PROBABILITY; 164 } 165 const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); 166 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 167 if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) { 168 return NOT_A_PROBABILITY; 169 } 170 if (prevWordIds.empty()) { 171 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); 172 } 173 if (prevWordIds[0] == NOT_A_WORD_ID) { 174 return NOT_A_PROBABILITY; 175 } 176 const PtNodeParams prevWordPtNodeParams = 177 mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]); 178 if (prevWordPtNodeParams.isDeleted()) { 179 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); 180 } 181 const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos( 182 prevWordPtNodeParams.getTerminalId()); 183 BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); 184 while (bigramsIt.hasNext()) { 185 bigramsIt.next(); 186 if (bigramsIt.getBigramPos() == ptNodePos 187 && bigramsIt.getProbability() != NOT_A_PROBABILITY) { 188 const int bigramConditionalProbability = getBigramConditionalProbability( 189 prevWordPtNodeParams.getProbability(), 190 prevWordPtNodeParams.representsBeginningOfSentence(), 191 bigramsIt.getProbability()); 192 return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability); 193 } 194 } 195 return NOT_A_PROBABILITY; 196} 197 198void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, 199 NgramListener *const listener) const { 200 if (prevWordIds.firstOrDefault(NOT_A_DICT_POS) == NOT_A_DICT_POS) { 201 return; 202 } 203 const PtNodeParams prevWordPtNodeParams = 204 mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]); 205 if (prevWordPtNodeParams.isDeleted()) { 206 return; 207 } 208 const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos( 209 prevWordPtNodeParams.getTerminalId()); 210 BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); 211 while (bigramsIt.hasNext()) { 212 bigramsIt.next(); 213 const int bigramConditionalProbability = getBigramConditionalProbability( 214 prevWordPtNodeParams.getProbability(), 215 prevWordPtNodeParams.representsBeginningOfSentence(), bigramsIt.getProbability()); 216 listener->onVisitEntry(bigramConditionalProbability, 217 getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos())); 218 } 219} 220 221int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability, 222 const bool isInBeginningOfSentenceContext, const int bigramProbability) const { 223 if (mHeaderPolicy->hasHistoricalInfoOfWords()) { 224 if (isInBeginningOfSentenceContext) { 225 return bigramProbability; 226 } 227 // Calculate conditional probability. 228 return std::min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability, 229 MAX_PROBABILITY); 230 } else { 231 // bigramProbability is a conditional probability. 232 return bigramProbability; 233 } 234} 235 236BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( 237 const int wordId) const { 238 const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId)); 239 return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos); 240} 241 242int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { 243 if (ptNodePos == NOT_A_DICT_POS) { 244 return NOT_A_DICT_POS; 245 } 246 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 247 if (ptNodeParams.isDeleted()) { 248 return NOT_A_DICT_POS; 249 } 250 return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( 251 ptNodeParams.getTerminalId()); 252} 253 254int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { 255 if (ptNodePos == NOT_A_DICT_POS) { 256 return NOT_A_DICT_POS; 257 } 258 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 259 if (ptNodeParams.isDeleted()) { 260 return NOT_A_DICT_POS; 261 } 262 return mBuffers->getBigramDictContent()->getBigramListHeadPos( 263 ptNodeParams.getTerminalId()); 264} 265 266bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, 267 const UnigramProperty *const unigramProperty) { 268 if (!mBuffers->isUpdatable()) { 269 AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); 270 return false; 271 } 272 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 273 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 274 mDictBuffer->getTailPosition()); 275 return false; 276 } 277 if (wordCodePoints.size() > MAX_WORD_LENGTH) { 278 AKLOGE("The word is too long to insert to the dictionary, length: %zd", 279 wordCodePoints.size()); 280 return false; 281 } 282 for (const auto &shortcut : unigramProperty->getShortcuts()) { 283 if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 284 AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", 285 shortcut.getTargetCodePoints()->size()); 286 return false; 287 } 288 } 289 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 290 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 291 bool addedNewUnigram = false; 292 int codePointsToAdd[MAX_WORD_LENGTH]; 293 int codePointCountToAdd = wordCodePoints.size(); 294 memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); 295 if (unigramProperty->representsBeginningOfSentence()) { 296 codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, 297 codePointCountToAdd, MAX_WORD_LENGTH); 298 } 299 if (codePointCountToAdd <= 0) { 300 return false; 301 } 302 const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); 303 if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, 304 &addedNewUnigram)) { 305 if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { 306 mEntryCounters.incrementNgramCount(NgramType::Unigram); 307 } 308 if (unigramProperty->getShortcuts().size() > 0) { 309 // Add shortcut target. 310 const int wordPos = getTerminalPtNodePosFromWordId( 311 getWordId(codePointArrayView, false /* forceLowerCaseSearch */)); 312 if (wordPos == NOT_A_DICT_POS) { 313 AKLOGE("Cannot find terminal PtNode position to add shortcut target."); 314 return false; 315 } 316 for (const auto &shortcut : unigramProperty->getShortcuts()) { 317 if (!mUpdatingHelper.addShortcutTarget(wordPos, 318 CodePointArrayView(*shortcut.getTargetCodePoints()), 319 shortcut.getProbability())) { 320 AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " 321 "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), 322 shortcut.getProbability()); 323 return false; 324 } 325 } 326 } 327 return true; 328 } else { 329 return false; 330 } 331} 332 333bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { 334 if (!mBuffers->isUpdatable()) { 335 AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); 336 return false; 337 } 338 const int ptNodePos = getTerminalPtNodePosFromWordId( 339 getWordId(wordCodePoints, false /* forceLowerCaseSearch */)); 340 if (ptNodePos == NOT_A_DICT_POS) { 341 return false; 342 } 343 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 344 return mNodeWriter.suppressUnigramEntry(&ptNodeParams); 345} 346 347bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { 348 if (!mBuffers->isUpdatable()) { 349 AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); 350 return false; 351 } 352 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 353 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 354 mDictBuffer->getTailPosition()); 355 return false; 356 } 357 const NgramContext *const ngramContext = ngramProperty->getNgramContext(); 358 if (!ngramContext->isValid()) { 359 AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); 360 return false; 361 } 362 if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 363 AKLOGE("The word is too long to insert the ngram to the dictionary. " 364 "length: %zd", ngramProperty->getTargetCodePoints()->size()); 365 return false; 366 } 367 WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; 368 const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, 369 false /* tryLowerCaseSearch */); 370 if (prevWordIds.empty()) { 371 return false; 372 } 373 if (prevWordIds[0] == NOT_A_WORD_ID) { 374 if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { 375 const UnigramProperty beginningOfSentenceUnigramProperty( 376 true /* representsBeginningOfSentence */, true /* isNotAWord */, 377 false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo()); 378 if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), 379 &beginningOfSentenceUnigramProperty)) { 380 AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); 381 return false; 382 } 383 // Refresh word ids. 384 ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); 385 } else { 386 return false; 387 } 388 } 389 const int wordPos = getTerminalPtNodePosFromWordId(getWordId( 390 CodePointArrayView(*ngramProperty->getTargetCodePoints()), 391 false /* forceLowerCaseSearch */)); 392 if (wordPos == NOT_A_DICT_POS) { 393 return false; 394 } 395 bool addedNewBigram = false; 396 const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); 397 if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos), 398 wordPos, ngramProperty, &addedNewBigram)) { 399 if (addedNewBigram) { 400 mEntryCounters.incrementNgramCount(NgramType::Bigram); 401 } 402 return true; 403 } else { 404 return false; 405 } 406} 407 408bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, 409 const CodePointArrayView wordCodePoints) { 410 if (!mBuffers->isUpdatable()) { 411 AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); 412 return false; 413 } 414 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 415 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 416 mDictBuffer->getTailPosition()); 417 return false; 418 } 419 if (!ngramContext->isValid()) { 420 AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); 421 return false; 422 } 423 if (wordCodePoints.size() > MAX_WORD_LENGTH) { 424 AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd", 425 wordCodePoints.size()); 426 } 427 WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; 428 const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, 429 false /* tryLowerCaseSerch */); 430 if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { 431 return false; 432 } 433 const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints, 434 false /* forceLowerCaseSearch */)); 435 if (wordPos == NOT_A_DICT_POS) { 436 return false; 437 } 438 const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); 439 if (mUpdatingHelper.removeNgramEntry( 440 PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) { 441 mEntryCounters.decrementNgramCount(NgramType::Bigram); 442 return true; 443 } else { 444 return false; 445 } 446} 447 448 449bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( 450 const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, 451 const bool isValidWord, const HistoricalInfo historicalInfo) { 452 if (!mBuffers->isUpdatable()) { 453 AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " 454 "dictionary."); 455 return false; 456 } 457 const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY; 458 const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, 459 false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo); 460 if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { 461 AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext()."); 462 return false; 463 } 464 const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) 465 ? NOT_A_PROBABILITY : probability; 466 const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram, 467 historicalInfo); 468 if (!addNgramEntry(&ngramProperty)) { 469 AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext()."); 470 return false; 471 } 472 return true; 473} 474 475bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { 476 if (!mBuffers->isUpdatable()) { 477 AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); 478 return false; 479 } 480 if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { 481 AKLOGE("Cannot flush the dictionary to file."); 482 mIsCorrupted = true; 483 return false; 484 } 485 return true; 486} 487 488bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { 489 if (!mBuffers->isUpdatable()) { 490 AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); 491 return false; 492 } 493 if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { 494 AKLOGE("Cannot flush the dictionary to file with GC."); 495 mIsCorrupted = true; 496 return false; 497 } 498 return true; 499} 500 501bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { 502 if (!mBuffers->isUpdatable()) { 503 AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); 504 return false; 505 } 506 if (mBuffers->isNearSizeLimit()) { 507 // Additional buffer size is near the limit. 508 return true; 509 } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() 510 > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { 511 // Total extended region size of the trie exceeds the limit. 512 return true; 513 } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS 514 && mDictBuffer->getUsedAdditionalBufferSize() > 0) { 515 // Needs to reduce dictionary size. 516 return true; 517 } else if (mHeaderPolicy->isDecayingDict()) { 518 return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), 519 mHeaderPolicy); 520 } 521 return false; 522} 523 524void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, 525 char *const outResult, const int maxResultLength) { 526 const int compareLength = queryLength + 1 /* terminator */; 527 if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { 528 snprintf(outResult, maxResultLength, "%d", 529 mEntryCounters.getNgramCount(NgramType::Unigram)); 530 } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { 531 snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); 532 } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { 533 snprintf(outResult, maxResultLength, "%d", 534 mHeaderPolicy->isDecayingDict() ? 535 ForgettingCurveUtils::getEntryCountHardLimit( 536 mHeaderPolicy->getMaxNgramCounts().getNgramCount( 537 NgramType::Unigram)) : 538 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 539 } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { 540 snprintf(outResult, maxResultLength, "%d", 541 mHeaderPolicy->isDecayingDict() ? 542 ForgettingCurveUtils::getEntryCountHardLimit( 543 mHeaderPolicy->getMaxNgramCounts().getNgramCount( 544 NgramType::Bigram)) : 545 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 546 } 547} 548 549const WordProperty Ver4PatriciaTriePolicy::getWordProperty( 550 const CodePointArrayView wordCodePoints) const { 551 const int ptNodePos = getTerminalPtNodePosFromWordId( 552 getWordId(wordCodePoints, false /* forceLowerCaseSearch */)); 553 if (ptNodePos == NOT_A_DICT_POS) { 554 AKLOGE("getWordProperty is called for invalid word."); 555 return WordProperty(); 556 } 557 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 558 const ProbabilityEntry probabilityEntry = 559 mBuffers->getProbabilityDictContent()->getProbabilityEntry( 560 ptNodeParams.getTerminalId()); 561 const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); 562 // Fetch bigram information. 563 std::vector<NgramProperty> ngrams; 564 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); 565 if (bigramListPos != NOT_A_DICT_POS) { 566 int bigramWord1CodePoints[MAX_WORD_LENGTH]; 567 const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent(); 568 const TerminalPositionLookupTable *const terminalPositionLookupTable = 569 mBuffers->getTerminalPositionLookupTable(); 570 bool hasNext = true; 571 int readingPos = bigramListPos; 572 while (hasNext) { 573 const BigramEntry bigramEntry = 574 bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); 575 hasNext = bigramEntry.hasNext(); 576 const int word1TerminalId = bigramEntry.getTargetTerminalId(); 577 const int word1TerminalPtNodePos = 578 terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); 579 if (word1TerminalPtNodePos == NOT_A_DICT_POS) { 580 continue; 581 } 582 const int codePointCount = getCodePointsAndReturnCodePointCount( 583 getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH, 584 bigramWord1CodePoints); 585 const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); 586 const int rawBigramProbability = bigramEntry.hasHistoricalInfo() 587 ? ForgettingCurveUtils::decodeProbability( 588 bigramEntry.getHistoricalInfo(), mHeaderPolicy) 589 : bigramEntry.getProbability(); 590 const int probability = getBigramConditionalProbability(ptNodeParams.getProbability(), 591 ptNodeParams.representsBeginningOfSentence(), rawBigramProbability); 592 ngrams.emplace_back( 593 NgramContext(wordCodePoints.data(), wordCodePoints.size(), 594 ptNodeParams.representsBeginningOfSentence()), 595 CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(), 596 probability, *historicalInfo); 597 } 598 } 599 // Fetch shortcut information. 600 std::vector<UnigramProperty::ShortcutProperty> shortcuts; 601 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); 602 if (shortcutPos != NOT_A_DICT_POS) { 603 int shortcutTarget[MAX_WORD_LENGTH]; 604 const ShortcutDictContent *const shortcutDictContent = 605 mBuffers->getShortcutDictContent(); 606 bool hasNext = true; 607 while (hasNext) { 608 int shortcutTargetLength = 0; 609 int shortcutProbability = NOT_A_PROBABILITY; 610 shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, 611 &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); 612 shortcuts.emplace_back( 613 CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(), 614 shortcutProbability); 615 } 616 } 617 const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), 618 ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(), 619 ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts)); 620 return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); 621} 622 623int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, 624 int *const outCodePointCount) { 625 *outCodePointCount = 0; 626 if (token == 0) { 627 mTerminalPtNodePositionsForIteratingWords.clear(); 628 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( 629 &mTerminalPtNodePositionsForIteratingWords); 630 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 631 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 632 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); 633 } 634 const int terminalPtNodePositionsVectorSize = 635 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); 636 if (token < 0 || token >= terminalPtNodePositionsVectorSize) { 637 AKLOGE("Given token %d is invalid.", token); 638 return 0; 639 } 640 const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; 641 *outCodePointCount = getCodePointsAndReturnCodePointCount( 642 getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints); 643 const int nextToken = token + 1; 644 if (nextToken >= terminalPtNodePositionsVectorSize) { 645 // All words have been iterated. 646 mTerminalPtNodePositionsForIteratingWords.clear(); 647 return 0; 648 } 649 return nextToken; 650} 651 652int Ver4PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const { 653 return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos; 654} 655 656int Ver4PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { 657 return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId; 658} 659 660} // namespace v402 661} // namespace backward 662} // namespace latinime 663