ver4_patricia_trie_policy.cpp revision bd1f59bda5ad0b7028ec06c2de078f1623e76cdd
1/* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h" 18 19#include <vector> 20 21#include "suggest/core/dicnode/dic_node.h" 22#include "suggest/core/dicnode/dic_node_vector.h" 23#include "suggest/core/dictionary/ngram_listener.h" 24#include "suggest/core/dictionary/property/bigram_property.h" 25#include "suggest/core/dictionary/property/unigram_property.h" 26#include "suggest/core/dictionary/property/word_property.h" 27#include "suggest/core/session/prev_words_info.h" 28#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" 29#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" 30#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" 31#include "suggest/policyimpl/dictionary/utils/probability_utils.h" 32 33namespace latinime { 34 35// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and 36// BinaryDictionaryDecayingTests. 37const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; 38const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; 39const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; 40const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; 41const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; 42const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = 43 Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; 44 45void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, 46 DicNodeVector *const childDicNodes) const { 47 if (!dicNode->hasChildren()) { 48 return; 49 } 50 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 51 readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); 52 while (!readingHelper.isEnd()) { 53 const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); 54 if (!ptNodeParams.isValid()) { 55 break; 56 } 57 bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); 58 if (isTerminal && mHeaderPolicy->isDecayingDict()) { 59 // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose 60 // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a 61 // valid terminal DicNode. 62 isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; 63 } 64 readingHelper.readNextSiblingNode(ptNodeParams); 65 if (ptNodeParams.representsNonWordInfo()) { 66 // Skip PtNodes that represent non-word information. 67 continue; 68 } 69 childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), 70 ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, 71 ptNodeParams.hasChildren(), 72 ptNodeParams.isBlacklisted() 73 || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, 74 ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); 75 } 76 if (readingHelper.isError()) { 77 mIsCorrupted = true; 78 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 79 } 80} 81 82int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( 83 const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, 84 int *const outUnigramProbability) const { 85 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 86 readingHelper.initWithPtNodePos(ptNodePos); 87 const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount( 88 maxCodePointCount, outCodePoints, outUnigramProbability); 89 if (readingHelper.isError()) { 90 mIsCorrupted = true; 91 AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); 92 } 93 return codePointCount; 94} 95 96int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, 97 const int length, const bool forceLowerCaseSearch) const { 98 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 99 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 100 const int ptNodePos = 101 readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch); 102 if (readingHelper.isError()) { 103 mIsCorrupted = true; 104 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 105 } 106 return ptNodePos; 107} 108 109int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, 110 const int bigramProbability) const { 111 if (mHeaderPolicy->isDecayingDict()) { 112 // Both probabilities are encoded. Decode them and get probability. 113 return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); 114 } else { 115 if (unigramProbability == NOT_A_PROBABILITY) { 116 return NOT_A_PROBABILITY; 117 } else if (bigramProbability == NOT_A_PROBABILITY) { 118 return ProbabilityUtils::backoff(unigramProbability); 119 } else { 120 return bigramProbability; 121 } 122 } 123} 124 125int Ver4PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodePos, 126 const int ptNodePos) const { 127 if (ptNodePos == NOT_A_DICT_POS) { 128 return NOT_A_PROBABILITY; 129 } 130 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 131 if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { 132 return NOT_A_PROBABILITY; 133 } 134 if (prevWordsPtNodePos) { 135 BinaryDictionaryBigramsIterator bigramsIt = 136 getBigramsIteratorOfPtNode(prevWordsPtNodePos[0]); 137 while (bigramsIt.hasNext()) { 138 bigramsIt.next(); 139 if (bigramsIt.getBigramPos() == ptNodePos 140 && bigramsIt.getProbability() != NOT_A_PROBABILITY) { 141 return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability()); 142 } 143 } 144 return NOT_A_PROBABILITY; 145 } 146 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); 147} 148 149void Ver4PatriciaTriePolicy::iterateNgramEntries(const int *const prevWordsPtNodePos, 150 NgramListener *const listener) const { 151 if (!prevWordsPtNodePos) { 152 return; 153 } 154 BinaryDictionaryBigramsIterator bigramsIt = getBigramsIteratorOfPtNode(prevWordsPtNodePos[0]); 155 while (bigramsIt.hasNext()) { 156 bigramsIt.next(); 157 listener->onVisitEntry(bigramsIt.getProbability(), bigramsIt.getBigramPos()); 158 } 159} 160 161int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { 162 if (ptNodePos == NOT_A_DICT_POS) { 163 return NOT_A_DICT_POS; 164 } 165 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 166 if (ptNodeParams.isDeleted()) { 167 return NOT_A_DICT_POS; 168 } 169 return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( 170 ptNodeParams.getTerminalId()); 171} 172 173BinaryDictionaryBigramsIterator Ver4PatriciaTriePolicy::getBigramsIteratorOfPtNode( 174 const int ptNodePos) const { 175 const int bigramsPosition = getBigramsPositionOfPtNode(ptNodePos); 176 return BinaryDictionaryBigramsIterator(&mBigramPolicy, bigramsPosition); 177} 178 179int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { 180 if (ptNodePos == NOT_A_DICT_POS) { 181 return NOT_A_DICT_POS; 182 } 183 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 184 if (ptNodeParams.isDeleted()) { 185 return NOT_A_DICT_POS; 186 } 187 return mBuffers->getBigramDictContent()->getBigramListHeadPos( 188 ptNodeParams.getTerminalId()); 189} 190 191bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length, 192 const UnigramProperty *const unigramProperty) { 193 if (!mBuffers->isUpdatable()) { 194 AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); 195 return false; 196 } 197 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 198 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 199 mDictBuffer->getTailPosition()); 200 return false; 201 } 202 if (length > MAX_WORD_LENGTH) { 203 AKLOGE("The word is too long to insert to the dictionary, length: %d", length); 204 return false; 205 } 206 for (const auto &shortcut : unigramProperty->getShortcuts()) { 207 if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 208 AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d", 209 shortcut.getTargetCodePoints()->size()); 210 return false; 211 } 212 } 213 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 214 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 215 bool addedNewUnigram = false; 216 int codePointsToAdd[MAX_WORD_LENGTH]; 217 int codePointCountToAdd = length; 218 memmove(codePointsToAdd, word, sizeof(int) * length); 219 if (unigramProperty->representsBeginningOfSentence()) { 220 codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, 221 codePointCountToAdd, MAX_WORD_LENGTH); 222 } 223 if (codePointCountToAdd <= 0) { 224 return false; 225 } 226 if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd, 227 unigramProperty, &addedNewUnigram)) { 228 if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { 229 mUnigramCount++; 230 } 231 if (unigramProperty->getShortcuts().size() > 0) { 232 // Add shortcut target. 233 const int wordPos = getTerminalPtNodePositionOfWord(word, length, 234 false /* forceLowerCaseSearch */); 235 if (wordPos == NOT_A_DICT_POS) { 236 AKLOGE("Cannot find terminal PtNode position to add shortcut target."); 237 return false; 238 } 239 for (const auto &shortcut : unigramProperty->getShortcuts()) { 240 if (!mUpdatingHelper.addShortcutTarget(wordPos, 241 shortcut.getTargetCodePoints()->data(), 242 shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) { 243 AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, " 244 "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), 245 shortcut.getProbability()); 246 return false; 247 } 248 } 249 } 250 return true; 251 } else { 252 return false; 253 } 254} 255 256bool Ver4PatriciaTriePolicy::removeUnigramEntry(const int *const word, const int length) { 257 if (!mBuffers->isUpdatable()) { 258 AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); 259 return false; 260 } 261 const int ptNodePos = getTerminalPtNodePositionOfWord(word, length, 262 false /* forceLowerCaseSearch */); 263 if (ptNodePos == NOT_A_DICT_POS) { 264 return false; 265 } 266 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 267 if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { 268 AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos); 269 return false; 270 } 271 if (!ptNodeParams.representsNonWordInfo()) { 272 mUnigramCount--; 273 } 274 return true; 275} 276 277bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, 278 const BigramProperty *const bigramProperty) { 279 if (!mBuffers->isUpdatable()) { 280 AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); 281 return false; 282 } 283 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 284 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 285 mDictBuffer->getTailPosition()); 286 return false; 287 } 288 if (!prevWordsInfo->isValid()) { 289 AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary."); 290 return false; 291 } 292 if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 293 AKLOGE("The word is too long to insert the ngram to the dictionary. " 294 "length: %d", bigramProperty->getTargetCodePoints()->size()); 295 return false; 296 } 297 int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 298 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 299 false /* tryLowerCaseSearch */); 300 // TODO: Support N-gram. 301 if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { 302 if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) { 303 const std::vector<UnigramProperty::ShortcutProperty> shortcuts; 304 const UnigramProperty beginningOfSentenceUnigramProperty( 305 true /* representsBeginningOfSentence */, true /* isNotAWord */, 306 false /* isBlacklisted */, MAX_PROBABILITY /* probability */, 307 NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); 308 if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */), 309 prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */), 310 &beginningOfSentenceUnigramProperty)) { 311 AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); 312 return false; 313 } 314 // Refresh Terminal PtNode positions. 315 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 316 false /* tryLowerCaseSearch */); 317 } else { 318 return false; 319 } 320 } 321 const int word1Pos = getTerminalPtNodePositionOfWord( 322 bigramProperty->getTargetCodePoints()->data(), 323 bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */); 324 if (word1Pos == NOT_A_DICT_POS) { 325 return false; 326 } 327 bool addedNewBigram = false; 328 if (mUpdatingHelper.addBigramWords(prevWordsPtNodePos[0], word1Pos, bigramProperty, 329 &addedNewBigram)) { 330 if (addedNewBigram) { 331 mBigramCount++; 332 } 333 return true; 334 } else { 335 return false; 336 } 337} 338 339bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, 340 const int *const word, const int length) { 341 if (!mBuffers->isUpdatable()) { 342 AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); 343 return false; 344 } 345 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 346 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 347 mDictBuffer->getTailPosition()); 348 return false; 349 } 350 if (!prevWordsInfo->isValid()) { 351 AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary."); 352 return false; 353 } 354 if (length > MAX_WORD_LENGTH) { 355 AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length); 356 } 357 int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 358 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 359 false /* tryLowerCaseSerch */); 360 // TODO: Support N-gram. 361 if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { 362 return false; 363 } 364 const int wordPos = getTerminalPtNodePositionOfWord(word, length, 365 false /* forceLowerCaseSearch */); 366 if (wordPos == NOT_A_DICT_POS) { 367 return false; 368 } 369 if (mUpdatingHelper.removeBigramWords(prevWordsPtNodePos[0], wordPos)) { 370 mBigramCount--; 371 return true; 372 } else { 373 return false; 374 } 375} 376 377bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { 378 if (!mBuffers->isUpdatable()) { 379 AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); 380 return false; 381 } 382 if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { 383 AKLOGE("Cannot flush the dictionary to file."); 384 mIsCorrupted = true; 385 return false; 386 } 387 return true; 388} 389 390bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { 391 if (!mBuffers->isUpdatable()) { 392 AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); 393 return false; 394 } 395 if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { 396 AKLOGE("Cannot flush the dictionary to file with GC."); 397 mIsCorrupted = true; 398 return false; 399 } 400 return true; 401} 402 403bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { 404 if (!mBuffers->isUpdatable()) { 405 AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); 406 return false; 407 } 408 if (mBuffers->isNearSizeLimit()) { 409 // Additional buffer size is near the limit. 410 return true; 411 } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() 412 > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { 413 // Total extended region size of the trie exceeds the limit. 414 return true; 415 } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS 416 && mDictBuffer->getUsedAdditionalBufferSize() > 0) { 417 // Needs to reduce dictionary size. 418 return true; 419 } else if (mHeaderPolicy->isDecayingDict()) { 420 return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, 421 mHeaderPolicy); 422 } 423 return false; 424} 425 426void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, 427 char *const outResult, const int maxResultLength) { 428 const int compareLength = queryLength + 1 /* terminator */; 429 if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { 430 snprintf(outResult, maxResultLength, "%d", mUnigramCount); 431 } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { 432 snprintf(outResult, maxResultLength, "%d", mBigramCount); 433 } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { 434 snprintf(outResult, maxResultLength, "%d", 435 mHeaderPolicy->isDecayingDict() ? 436 ForgettingCurveUtils::getUnigramCountHardLimit( 437 mHeaderPolicy->getMaxUnigramCount()) : 438 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 439 } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { 440 snprintf(outResult, maxResultLength, "%d", 441 mHeaderPolicy->isDecayingDict() ? 442 ForgettingCurveUtils::getBigramCountHardLimit( 443 mHeaderPolicy->getMaxBigramCount()) : 444 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 445 } 446} 447 448const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints, 449 const int codePointCount) const { 450 const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, 451 false /* forceLowerCaseSearch */); 452 if (ptNodePos == NOT_A_DICT_POS) { 453 AKLOGE("getWordProperty is called for invalid word."); 454 return WordProperty(); 455 } 456 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 457 std::vector<int> codePointVector(ptNodeParams.getCodePoints(), 458 ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); 459 const ProbabilityEntry probabilityEntry = 460 mBuffers->getProbabilityDictContent()->getProbabilityEntry( 461 ptNodeParams.getTerminalId()); 462 const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); 463 // Fetch bigram information. 464 std::vector<BigramProperty> bigrams; 465 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); 466 if (bigramListPos != NOT_A_DICT_POS) { 467 int bigramWord1CodePoints[MAX_WORD_LENGTH]; 468 const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent(); 469 const TerminalPositionLookupTable *const terminalPositionLookupTable = 470 mBuffers->getTerminalPositionLookupTable(); 471 bool hasNext = true; 472 int readingPos = bigramListPos; 473 while (hasNext) { 474 const BigramEntry bigramEntry = 475 bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); 476 hasNext = bigramEntry.hasNext(); 477 const int word1TerminalId = bigramEntry.getTargetTerminalId(); 478 const int word1TerminalPtNodePos = 479 terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); 480 if (word1TerminalPtNodePos == NOT_A_DICT_POS) { 481 continue; 482 } 483 // Word (unigram) probability 484 int word1Probability = NOT_A_PROBABILITY; 485 const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( 486 word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, 487 &word1Probability); 488 const std::vector<int> word1(bigramWord1CodePoints, 489 bigramWord1CodePoints + codePointCount); 490 const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); 491 const int probability = bigramEntry.hasHistoricalInfo() ? 492 ForgettingCurveUtils::decodeProbability( 493 bigramEntry.getHistoricalInfo(), mHeaderPolicy) : 494 bigramEntry.getProbability(); 495 bigrams.emplace_back(&word1, probability, 496 historicalInfo->getTimeStamp(), historicalInfo->getLevel(), 497 historicalInfo->getCount()); 498 } 499 } 500 // Fetch shortcut information. 501 std::vector<UnigramProperty::ShortcutProperty> shortcuts; 502 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); 503 if (shortcutPos != NOT_A_DICT_POS) { 504 int shortcutTarget[MAX_WORD_LENGTH]; 505 const ShortcutDictContent *const shortcutDictContent = 506 mBuffers->getShortcutDictContent(); 507 bool hasNext = true; 508 while (hasNext) { 509 int shortcutTargetLength = 0; 510 int shortcutProbability = NOT_A_PROBABILITY; 511 shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, 512 &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); 513 const std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength); 514 shortcuts.emplace_back(&target, shortcutProbability); 515 } 516 } 517 const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), 518 ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), 519 historicalInfo->getTimeStamp(), historicalInfo->getLevel(), 520 historicalInfo->getCount(), &shortcuts); 521 return WordProperty(&codePointVector, &unigramProperty, &bigrams); 522} 523 524int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, 525 int *const outCodePointCount) { 526 *outCodePointCount = 0; 527 if (token == 0) { 528 mTerminalPtNodePositionsForIteratingWords.clear(); 529 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( 530 &mTerminalPtNodePositionsForIteratingWords); 531 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 532 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 533 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); 534 } 535 const int terminalPtNodePositionsVectorSize = 536 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); 537 if (token < 0 || token >= terminalPtNodePositionsVectorSize) { 538 AKLOGE("Given token %d is invalid.", token); 539 return 0; 540 } 541 const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; 542 int unigramProbability = NOT_A_PROBABILITY; 543 *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( 544 terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); 545 const int nextToken = token + 1; 546 if (nextToken >= terminalPtNodePositionsVectorSize) { 547 // All words have been iterated. 548 mTerminalPtNodePositionsForIteratingWords.clear(); 549 return 0; 550 } 551 return nextToken; 552} 553 554} // namespace latinime 555