ver4_patricia_trie_policy.cpp revision bd1f59bda5ad0b7028ec06c2de078f1623e76cdd
1/* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17/* 18 * !!!!! DO NOT CHANGE THE LOGIC IN THIS FILE !!!!! 19 * Do not edit this file other than updating policy's interface. 20 * 21 * This file was generated from 22 * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp 23 */ 24 25#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" 26 27#include <vector> 28 29#include "suggest/core/dicnode/dic_node.h" 30#include "suggest/core/dicnode/dic_node_vector.h" 31#include "suggest/core/dictionary/ngram_listener.h" 32#include "suggest/core/dictionary/property/bigram_property.h" 33#include "suggest/core/dictionary/property/unigram_property.h" 34#include "suggest/core/dictionary/property/word_property.h" 35#include "suggest/core/session/prev_words_info.h" 36#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" 37#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" 38#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" 39#include "suggest/policyimpl/dictionary/utils/probability_utils.h" 40 41namespace latinime { 42namespace backward { 43namespace v402 { 44 45// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and 46// BinaryDictionaryDecayingTests. 47const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; 48const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; 49const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; 50const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; 51const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; 52const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = 53 Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; 54 55void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, 56 DicNodeVector *const childDicNodes) const { 57 if (!dicNode->hasChildren()) { 58 return; 59 } 60 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 61 readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); 62 while (!readingHelper.isEnd()) { 63 const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); 64 if (!ptNodeParams.isValid()) { 65 break; 66 } 67 bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); 68 if (isTerminal && mHeaderPolicy->isDecayingDict()) { 69 // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose 70 // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a 71 // valid terminal DicNode. 72 isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; 73 } 74 readingHelper.readNextSiblingNode(ptNodeParams); 75 if (ptNodeParams.representsNonWordInfo()) { 76 // Skip PtNodes that represent non-word information. 77 continue; 78 } 79 childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), 80 ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, 81 ptNodeParams.hasChildren(), 82 ptNodeParams.isBlacklisted() 83 || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, 84 ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); 85 } 86 if (readingHelper.isError()) { 87 mIsCorrupted = true; 88 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 89 } 90} 91 92int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( 93 const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, 94 int *const outUnigramProbability) const { 95 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 96 readingHelper.initWithPtNodePos(ptNodePos); 97 const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount( 98 maxCodePointCount, outCodePoints, outUnigramProbability); 99 if (readingHelper.isError()) { 100 mIsCorrupted = true; 101 AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); 102 } 103 return codePointCount; 104} 105 106int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, 107 const int length, const bool forceLowerCaseSearch) const { 108 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 109 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 110 const int ptNodePos = 111 readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch); 112 if (readingHelper.isError()) { 113 mIsCorrupted = true; 114 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 115 } 116 return ptNodePos; 117} 118 119int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, 120 const int bigramProbability) const { 121 if (mHeaderPolicy->isDecayingDict()) { 122 // Both probabilities are encoded. Decode them and get probability. 123 return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); 124 } else { 125 if (unigramProbability == NOT_A_PROBABILITY) { 126 return NOT_A_PROBABILITY; 127 } else if (bigramProbability == NOT_A_PROBABILITY) { 128 return ProbabilityUtils::backoff(unigramProbability); 129 } else { 130 return bigramProbability; 131 } 132 } 133} 134 135int Ver4PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodePos, 136 const int ptNodePos) const { 137 if (ptNodePos == NOT_A_DICT_POS) { 138 return NOT_A_PROBABILITY; 139 } 140 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 141 if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { 142 return NOT_A_PROBABILITY; 143 } 144 if (prevWordsPtNodePos) { 145 BinaryDictionaryBigramsIterator bigramsIt = 146 getBigramsIteratorOfPtNode(prevWordsPtNodePos[0]); 147 while (bigramsIt.hasNext()) { 148 bigramsIt.next(); 149 if (bigramsIt.getBigramPos() == ptNodePos 150 && bigramsIt.getProbability() != NOT_A_PROBABILITY) { 151 return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability()); 152 } 153 } 154 return NOT_A_PROBABILITY; 155 } 156 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); 157} 158 159void Ver4PatriciaTriePolicy::iterateNgramEntries(const int *const prevWordsPtNodePos, 160 NgramListener *const listener) const { 161 if (!prevWordsPtNodePos) { 162 return; 163 } 164 BinaryDictionaryBigramsIterator bigramsIt = getBigramsIteratorOfPtNode(prevWordsPtNodePos[0]); 165 while (bigramsIt.hasNext()) { 166 bigramsIt.next(); 167 listener->onVisitEntry(bigramsIt.getProbability(), bigramsIt.getBigramPos()); 168 } 169} 170 171int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { 172 if (ptNodePos == NOT_A_DICT_POS) { 173 return NOT_A_DICT_POS; 174 } 175 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 176 if (ptNodeParams.isDeleted()) { 177 return NOT_A_DICT_POS; 178 } 179 return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( 180 ptNodeParams.getTerminalId()); 181} 182 183BinaryDictionaryBigramsIterator Ver4PatriciaTriePolicy::getBigramsIteratorOfPtNode( 184 const int ptNodePos) const { 185 const int bigramsPosition = getBigramsPositionOfPtNode(ptNodePos); 186 return BinaryDictionaryBigramsIterator(&mBigramPolicy, bigramsPosition); 187} 188 189int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { 190 if (ptNodePos == NOT_A_DICT_POS) { 191 return NOT_A_DICT_POS; 192 } 193 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 194 if (ptNodeParams.isDeleted()) { 195 return NOT_A_DICT_POS; 196 } 197 return mBuffers->getBigramDictContent()->getBigramListHeadPos( 198 ptNodeParams.getTerminalId()); 199} 200 201bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length, 202 const UnigramProperty *const unigramProperty) { 203 if (!mBuffers->isUpdatable()) { 204 AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); 205 return false; 206 } 207 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 208 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 209 mDictBuffer->getTailPosition()); 210 return false; 211 } 212 if (length > MAX_WORD_LENGTH) { 213 AKLOGE("The word is too long to insert to the dictionary, length: %d", length); 214 return false; 215 } 216 for (const auto &shortcut : unigramProperty->getShortcuts()) { 217 if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 218 AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d", 219 shortcut.getTargetCodePoints()->size()); 220 return false; 221 } 222 } 223 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 224 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 225 bool addedNewUnigram = false; 226 int codePointsToAdd[MAX_WORD_LENGTH]; 227 int codePointCountToAdd = length; 228 memmove(codePointsToAdd, word, sizeof(int) * length); 229 if (unigramProperty->representsBeginningOfSentence()) { 230 codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, 231 codePointCountToAdd, MAX_WORD_LENGTH); 232 } 233 if (codePointCountToAdd <= 0) { 234 return false; 235 } 236 if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd, 237 unigramProperty, &addedNewUnigram)) { 238 if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { 239 mUnigramCount++; 240 } 241 if (unigramProperty->getShortcuts().size() > 0) { 242 // Add shortcut target. 243 const int wordPos = getTerminalPtNodePositionOfWord(word, length, 244 false /* forceLowerCaseSearch */); 245 if (wordPos == NOT_A_DICT_POS) { 246 AKLOGE("Cannot find terminal PtNode position to add shortcut target."); 247 return false; 248 } 249 for (const auto &shortcut : unigramProperty->getShortcuts()) { 250 if (!mUpdatingHelper.addShortcutTarget(wordPos, 251 shortcut.getTargetCodePoints()->data(), 252 shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) { 253 AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, " 254 "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), 255 shortcut.getProbability()); 256 return false; 257 } 258 } 259 } 260 return true; 261 } else { 262 return false; 263 } 264} 265 266bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, 267 const BigramProperty *const bigramProperty) { 268 if (!mBuffers->isUpdatable()) { 269 AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); 270 return false; 271 } 272 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 273 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 274 mDictBuffer->getTailPosition()); 275 return false; 276 } 277 if (!prevWordsInfo->isValid()) { 278 AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary."); 279 return false; 280 } 281 if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 282 AKLOGE("The word is too long to insert the ngram to the dictionary. " 283 "length: %d", bigramProperty->getTargetCodePoints()->size()); 284 return false; 285 } 286 int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 287 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 288 false /* tryLowerCaseSearch */); 289 // TODO: Support N-gram. 290 if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { 291 if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) { 292 const std::vector<UnigramProperty::ShortcutProperty> shortcuts; 293 const UnigramProperty beginningOfSentenceUnigramProperty( 294 true /* representsBeginningOfSentence */, true /* isNotAWord */, 295 false /* isBlacklisted */, MAX_PROBABILITY /* probability */, 296 NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); 297 if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */), 298 prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */), 299 &beginningOfSentenceUnigramProperty)) { 300 AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); 301 return false; 302 } 303 // Refresh Terminal PtNode positions. 304 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 305 false /* tryLowerCaseSearch */); 306 } else { 307 return false; 308 } 309 } 310 const int word1Pos = getTerminalPtNodePositionOfWord( 311 bigramProperty->getTargetCodePoints()->data(), 312 bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */); 313 if (word1Pos == NOT_A_DICT_POS) { 314 return false; 315 } 316 bool addedNewBigram = false; 317 if (mUpdatingHelper.addBigramWords(prevWordsPtNodePos[0], word1Pos, bigramProperty, 318 &addedNewBigram)) { 319 if (addedNewBigram) { 320 mBigramCount++; 321 } 322 return true; 323 } else { 324 return false; 325 } 326} 327 328bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, 329 const int *const word, const int length) { 330 if (!mBuffers->isUpdatable()) { 331 AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); 332 return false; 333 } 334 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 335 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 336 mDictBuffer->getTailPosition()); 337 return false; 338 } 339 if (!prevWordsInfo->isValid()) { 340 AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary."); 341 return false; 342 } 343 if (length > MAX_WORD_LENGTH) { 344 AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length); 345 } 346 int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 347 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 348 false /* tryLowerCaseSerch */); 349 // TODO: Support N-gram. 350 if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { 351 return false; 352 } 353 const int wordPos = getTerminalPtNodePositionOfWord(word, length, 354 false /* forceLowerCaseSearch */); 355 if (wordPos == NOT_A_DICT_POS) { 356 return false; 357 } 358 if (mUpdatingHelper.removeBigramWords(prevWordsPtNodePos[0], wordPos)) { 359 mBigramCount--; 360 return true; 361 } else { 362 return false; 363 } 364} 365 366bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { 367 if (!mBuffers->isUpdatable()) { 368 AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); 369 return false; 370 } 371 if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { 372 AKLOGE("Cannot flush the dictionary to file."); 373 mIsCorrupted = true; 374 return false; 375 } 376 return true; 377} 378 379bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { 380 if (!mBuffers->isUpdatable()) { 381 AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); 382 return false; 383 } 384 if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { 385 AKLOGE("Cannot flush the dictionary to file with GC."); 386 mIsCorrupted = true; 387 return false; 388 } 389 return true; 390} 391 392bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { 393 if (!mBuffers->isUpdatable()) { 394 AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); 395 return false; 396 } 397 if (mBuffers->isNearSizeLimit()) { 398 // Additional buffer size is near the limit. 399 return true; 400 } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() 401 > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { 402 // Total extended region size of the trie exceeds the limit. 403 return true; 404 } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS 405 && mDictBuffer->getUsedAdditionalBufferSize() > 0) { 406 // Needs to reduce dictionary size. 407 return true; 408 } else if (mHeaderPolicy->isDecayingDict()) { 409 return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, 410 mHeaderPolicy); 411 } 412 return false; 413} 414 415void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, 416 char *const outResult, const int maxResultLength) { 417 const int compareLength = queryLength + 1 /* terminator */; 418 if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { 419 snprintf(outResult, maxResultLength, "%d", mUnigramCount); 420 } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { 421 snprintf(outResult, maxResultLength, "%d", mBigramCount); 422 } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { 423 snprintf(outResult, maxResultLength, "%d", 424 mHeaderPolicy->isDecayingDict() ? 425 ForgettingCurveUtils::getUnigramCountHardLimit( 426 mHeaderPolicy->getMaxUnigramCount()) : 427 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 428 } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { 429 snprintf(outResult, maxResultLength, "%d", 430 mHeaderPolicy->isDecayingDict() ? 431 ForgettingCurveUtils::getBigramCountHardLimit( 432 mHeaderPolicy->getMaxBigramCount()) : 433 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 434 } 435} 436 437const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints, 438 const int codePointCount) const { 439 const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, 440 false /* forceLowerCaseSearch */); 441 if (ptNodePos == NOT_A_DICT_POS) { 442 AKLOGE("getWordProperty is called for invalid word."); 443 return WordProperty(); 444 } 445 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 446 std::vector<int> codePointVector(ptNodeParams.getCodePoints(), 447 ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); 448 const ProbabilityEntry probabilityEntry = 449 mBuffers->getProbabilityDictContent()->getProbabilityEntry( 450 ptNodeParams.getTerminalId()); 451 const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); 452 // Fetch bigram information. 453 std::vector<BigramProperty> bigrams; 454 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); 455 if (bigramListPos != NOT_A_DICT_POS) { 456 int bigramWord1CodePoints[MAX_WORD_LENGTH]; 457 const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent(); 458 const TerminalPositionLookupTable *const terminalPositionLookupTable = 459 mBuffers->getTerminalPositionLookupTable(); 460 bool hasNext = true; 461 int readingPos = bigramListPos; 462 while (hasNext) { 463 const BigramEntry bigramEntry = 464 bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); 465 hasNext = bigramEntry.hasNext(); 466 const int word1TerminalId = bigramEntry.getTargetTerminalId(); 467 const int word1TerminalPtNodePos = 468 terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); 469 if (word1TerminalPtNodePos == NOT_A_DICT_POS) { 470 continue; 471 } 472 // Word (unigram) probability 473 int word1Probability = NOT_A_PROBABILITY; 474 const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( 475 word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, 476 &word1Probability); 477 const std::vector<int> word1(bigramWord1CodePoints, 478 bigramWord1CodePoints + codePointCount); 479 const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); 480 const int probability = bigramEntry.hasHistoricalInfo() ? 481 ForgettingCurveUtils::decodeProbability( 482 bigramEntry.getHistoricalInfo(), mHeaderPolicy) : 483 bigramEntry.getProbability(); 484 bigrams.emplace_back(&word1, probability, 485 historicalInfo->getTimeStamp(), historicalInfo->getLevel(), 486 historicalInfo->getCount()); 487 } 488 } 489 // Fetch shortcut information. 490 std::vector<UnigramProperty::ShortcutProperty> shortcuts; 491 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); 492 if (shortcutPos != NOT_A_DICT_POS) { 493 int shortcutTarget[MAX_WORD_LENGTH]; 494 const ShortcutDictContent *const shortcutDictContent = 495 mBuffers->getShortcutDictContent(); 496 bool hasNext = true; 497 while (hasNext) { 498 int shortcutTargetLength = 0; 499 int shortcutProbability = NOT_A_PROBABILITY; 500 shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, 501 &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); 502 const std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength); 503 shortcuts.emplace_back(&target, shortcutProbability); 504 } 505 } 506 const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), 507 ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), 508 historicalInfo->getTimeStamp(), historicalInfo->getLevel(), 509 historicalInfo->getCount(), &shortcuts); 510 return WordProperty(&codePointVector, &unigramProperty, &bigrams); 511} 512 513int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, 514 int *const outCodePointCount) { 515 *outCodePointCount = 0; 516 if (token == 0) { 517 mTerminalPtNodePositionsForIteratingWords.clear(); 518 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( 519 &mTerminalPtNodePositionsForIteratingWords); 520 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 521 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 522 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); 523 } 524 const int terminalPtNodePositionsVectorSize = 525 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); 526 if (token < 0 || token >= terminalPtNodePositionsVectorSize) { 527 AKLOGE("Given token %d is invalid.", token); 528 return 0; 529 } 530 const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; 531 int unigramProbability = NOT_A_PROBABILITY; 532 *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( 533 terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); 534 const int nextToken = token + 1; 535 if (nextToken >= terminalPtNodePositionsVectorSize) { 536 // All words have been iterated. 537 mTerminalPtNodePositionsForIteratingWords.clear(); 538 return 0; 539 } 540 return nextToken; 541} 542 543} // namespace v402 544} // namespace backward 545} // namespace latinime 546