ver4_patricia_trie_policy.cpp revision 96d47fe7457ff1dbea4696a5e0edec2801610d47
1/* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h" 18 19#include <vector> 20 21#include "suggest/core/dicnode/dic_node.h" 22#include "suggest/core/dicnode/dic_node_vector.h" 23#include "suggest/core/dictionary/property/bigram_property.h" 24#include "suggest/core/dictionary/property/unigram_property.h" 25#include "suggest/core/dictionary/property/word_property.h" 26#include "suggest/core/session/prev_words_info.h" 27#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" 28#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" 29#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" 30#include "suggest/policyimpl/dictionary/utils/probability_utils.h" 31 32namespace latinime { 33 34// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and 35// BinaryDictionaryDecayingTests. 36const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; 37const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; 38const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; 39const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; 40const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; 41const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = 42 Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; 43 44void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, 45 DicNodeVector *const childDicNodes) const { 46 if (!dicNode->hasChildren()) { 47 return; 48 } 49 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 50 readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); 51 while (!readingHelper.isEnd()) { 52 const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); 53 if (!ptNodeParams.isValid()) { 54 break; 55 } 56 bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); 57 if (isTerminal && mHeaderPolicy->isDecayingDict()) { 58 // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose 59 // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a 60 // valid terminal DicNode. 61 isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; 62 } 63 readingHelper.readNextSiblingNode(ptNodeParams); 64 if (ptNodeParams.representsNonWordInfo()) { 65 // Skip PtNodes that represent non-word information. 66 continue; 67 } 68 childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), 69 ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, 70 ptNodeParams.hasChildren(), 71 ptNodeParams.isBlacklisted() 72 || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, 73 ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); 74 } 75 if (readingHelper.isError()) { 76 mIsCorrupted = true; 77 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 78 } 79} 80 81int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( 82 const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, 83 int *const outUnigramProbability) const { 84 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 85 readingHelper.initWithPtNodePos(ptNodePos); 86 const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount( 87 maxCodePointCount, outCodePoints, outUnigramProbability); 88 if (readingHelper.isError()) { 89 mIsCorrupted = true; 90 AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); 91 } 92 return codePointCount; 93} 94 95int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, 96 const int length, const bool forceLowerCaseSearch) const { 97 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 98 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 99 const int ptNodePos = 100 readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch); 101 if (readingHelper.isError()) { 102 mIsCorrupted = true; 103 AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); 104 } 105 return ptNodePos; 106} 107 108int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, 109 const int bigramProbability) const { 110 if (mHeaderPolicy->isDecayingDict()) { 111 // Both probabilities are encoded. Decode them and get probability. 112 return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); 113 } else { 114 if (unigramProbability == NOT_A_PROBABILITY) { 115 return NOT_A_PROBABILITY; 116 } else if (bigramProbability == NOT_A_PROBABILITY) { 117 return ProbabilityUtils::backoff(unigramProbability); 118 } else { 119 return bigramProbability; 120 } 121 } 122} 123 124int Ver4PatriciaTriePolicy::getProbabilityOfPtNode(const PrevWordsInfo *const prevWordsInfo, 125 const int ptNodePos) const { 126 if (prevWordsInfo) { 127 // TODO: Return probability using prevWordsInfo. 128 return NOT_A_PROBABILITY; 129 } 130 if (ptNodePos == NOT_A_DICT_POS) { 131 return NOT_A_PROBABILITY; 132 } 133 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 134 if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { 135 return NOT_A_PROBABILITY; 136 } 137 return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); 138} 139 140int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { 141 if (ptNodePos == NOT_A_DICT_POS) { 142 return NOT_A_DICT_POS; 143 } 144 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 145 if (ptNodeParams.isDeleted()) { 146 return NOT_A_DICT_POS; 147 } 148 return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( 149 ptNodeParams.getTerminalId()); 150} 151 152BinaryDictionaryBigramsIterator Ver4PatriciaTriePolicy::getBigramsIteratorOfPtNode( 153 const int ptNodePos) const { 154 const int bigramsPosition = getBigramsPositionOfPtNode(ptNodePos); 155 return BinaryDictionaryBigramsIterator(&mBigramPolicy, bigramsPosition); 156} 157 158int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { 159 if (ptNodePos == NOT_A_DICT_POS) { 160 return NOT_A_DICT_POS; 161 } 162 const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); 163 if (ptNodeParams.isDeleted()) { 164 return NOT_A_DICT_POS; 165 } 166 return mBuffers->getBigramDictContent()->getBigramListHeadPos( 167 ptNodeParams.getTerminalId()); 168} 169 170bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length, 171 const UnigramProperty *const unigramProperty) { 172 if (!mBuffers->isUpdatable()) { 173 AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); 174 return false; 175 } 176 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 177 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 178 mDictBuffer->getTailPosition()); 179 return false; 180 } 181 if (length > MAX_WORD_LENGTH) { 182 AKLOGE("The word is too long to insert to the dictionary, length: %d", length); 183 return false; 184 } 185 for (const auto &shortcut : unigramProperty->getShortcuts()) { 186 if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 187 AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d", 188 shortcut.getTargetCodePoints()->size()); 189 return false; 190 } 191 } 192 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 193 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 194 bool addedNewUnigram = false; 195 int codePointsToAdd[MAX_WORD_LENGTH]; 196 int codePointCountToAdd = length; 197 memmove(codePointsToAdd, word, sizeof(int) * length); 198 if (unigramProperty->representsBeginningOfSentence()) { 199 codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, 200 codePointCountToAdd, MAX_WORD_LENGTH); 201 } 202 if (codePointCountToAdd <= 0) { 203 return false; 204 } 205 if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd, 206 unigramProperty, &addedNewUnigram)) { 207 if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { 208 mUnigramCount++; 209 } 210 if (unigramProperty->getShortcuts().size() > 0) { 211 // Add shortcut target. 212 const int wordPos = getTerminalPtNodePositionOfWord(word, length, 213 false /* forceLowerCaseSearch */); 214 if (wordPos == NOT_A_DICT_POS) { 215 AKLOGE("Cannot find terminal PtNode position to add shortcut target."); 216 return false; 217 } 218 for (const auto &shortcut : unigramProperty->getShortcuts()) { 219 if (!mUpdatingHelper.addShortcutTarget(wordPos, 220 shortcut.getTargetCodePoints()->data(), 221 shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) { 222 AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, " 223 "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), 224 shortcut.getProbability()); 225 return false; 226 } 227 } 228 } 229 return true; 230 } else { 231 return false; 232 } 233} 234 235bool Ver4PatriciaTriePolicy::removeUnigramEntry(const int *const word, const int length) { 236 if (!mBuffers->isUpdatable()) { 237 AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); 238 return false; 239 } 240 const int ptNodePos = getTerminalPtNodePositionOfWord(word, length, 241 false /* forceLowerCaseSearch */); 242 if (ptNodePos == NOT_A_DICT_POS) { 243 return false; 244 } 245 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 246 if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { 247 AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos); 248 return false; 249 } 250 if (!ptNodeParams.representsNonWordInfo()) { 251 mUnigramCount--; 252 } 253 return true; 254} 255 256bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, 257 const BigramProperty *const bigramProperty) { 258 if (!mBuffers->isUpdatable()) { 259 AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); 260 return false; 261 } 262 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 263 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 264 mDictBuffer->getTailPosition()); 265 return false; 266 } 267 if (!prevWordsInfo->isValid()) { 268 AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary."); 269 return false; 270 } 271 if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { 272 AKLOGE("The word is too long to insert the ngram to the dictionary. " 273 "length: %d", bigramProperty->getTargetCodePoints()->size()); 274 return false; 275 } 276 int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 277 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 278 false /* tryLowerCaseSearch */); 279 // TODO: Support N-gram. 280 if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { 281 if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) { 282 const std::vector<UnigramProperty::ShortcutProperty> shortcuts; 283 const UnigramProperty beginningOfSentenceUnigramProperty( 284 true /* representsBeginningOfSentence */, true /* isNotAWord */, 285 false /* isBlacklisted */, MAX_PROBABILITY /* probability */, 286 NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); 287 if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */), 288 prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */), 289 &beginningOfSentenceUnigramProperty)) { 290 AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); 291 return false; 292 } 293 // Refresh Terminal PtNode positions. 294 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 295 false /* tryLowerCaseSearch */); 296 } else { 297 return false; 298 } 299 } 300 const int word1Pos = getTerminalPtNodePositionOfWord( 301 bigramProperty->getTargetCodePoints()->data(), 302 bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */); 303 if (word1Pos == NOT_A_DICT_POS) { 304 return false; 305 } 306 bool addedNewBigram = false; 307 if (mUpdatingHelper.addBigramWords(prevWordsPtNodePos[0], word1Pos, bigramProperty, 308 &addedNewBigram)) { 309 if (addedNewBigram) { 310 mBigramCount++; 311 } 312 return true; 313 } else { 314 return false; 315 } 316} 317 318bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, 319 const int *const word, const int length) { 320 if (!mBuffers->isUpdatable()) { 321 AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); 322 return false; 323 } 324 if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { 325 AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", 326 mDictBuffer->getTailPosition()); 327 return false; 328 } 329 if (!prevWordsInfo->isValid()) { 330 AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary."); 331 return false; 332 } 333 if (length > MAX_WORD_LENGTH) { 334 AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length); 335 } 336 int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; 337 prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, 338 false /* tryLowerCaseSerch */); 339 // TODO: Support N-gram. 340 if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { 341 return false; 342 } 343 const int wordPos = getTerminalPtNodePositionOfWord(word, length, 344 false /* forceLowerCaseSearch */); 345 if (wordPos == NOT_A_DICT_POS) { 346 return false; 347 } 348 if (mUpdatingHelper.removeBigramWords(prevWordsPtNodePos[0], wordPos)) { 349 mBigramCount--; 350 return true; 351 } else { 352 return false; 353 } 354} 355 356bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { 357 if (!mBuffers->isUpdatable()) { 358 AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); 359 return false; 360 } 361 if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { 362 AKLOGE("Cannot flush the dictionary to file."); 363 mIsCorrupted = true; 364 return false; 365 } 366 return true; 367} 368 369bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { 370 if (!mBuffers->isUpdatable()) { 371 AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); 372 return false; 373 } 374 if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { 375 AKLOGE("Cannot flush the dictionary to file with GC."); 376 mIsCorrupted = true; 377 return false; 378 } 379 return true; 380} 381 382bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { 383 if (!mBuffers->isUpdatable()) { 384 AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); 385 return false; 386 } 387 if (mBuffers->isNearSizeLimit()) { 388 // Additional buffer size is near the limit. 389 return true; 390 } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() 391 > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { 392 // Total extended region size of the trie exceeds the limit. 393 return true; 394 } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS 395 && mDictBuffer->getUsedAdditionalBufferSize() > 0) { 396 // Needs to reduce dictionary size. 397 return true; 398 } else if (mHeaderPolicy->isDecayingDict()) { 399 return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, 400 mHeaderPolicy); 401 } 402 return false; 403} 404 405void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, 406 char *const outResult, const int maxResultLength) { 407 const int compareLength = queryLength + 1 /* terminator */; 408 if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { 409 snprintf(outResult, maxResultLength, "%d", mUnigramCount); 410 } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { 411 snprintf(outResult, maxResultLength, "%d", mBigramCount); 412 } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { 413 snprintf(outResult, maxResultLength, "%d", 414 mHeaderPolicy->isDecayingDict() ? 415 ForgettingCurveUtils::getUnigramCountHardLimit( 416 mHeaderPolicy->getMaxUnigramCount()) : 417 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 418 } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { 419 snprintf(outResult, maxResultLength, "%d", 420 mHeaderPolicy->isDecayingDict() ? 421 ForgettingCurveUtils::getBigramCountHardLimit( 422 mHeaderPolicy->getMaxBigramCount()) : 423 static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); 424 } 425} 426 427const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints, 428 const int codePointCount) const { 429 const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, 430 false /* forceLowerCaseSearch */); 431 if (ptNodePos == NOT_A_DICT_POS) { 432 AKLOGE("getWordProperty is called for invalid word."); 433 return WordProperty(); 434 } 435 const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); 436 std::vector<int> codePointVector(ptNodeParams.getCodePoints(), 437 ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); 438 const ProbabilityEntry probabilityEntry = 439 mBuffers->getProbabilityDictContent()->getProbabilityEntry( 440 ptNodeParams.getTerminalId()); 441 const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); 442 // Fetch bigram information. 443 std::vector<BigramProperty> bigrams; 444 const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); 445 if (bigramListPos != NOT_A_DICT_POS) { 446 int bigramWord1CodePoints[MAX_WORD_LENGTH]; 447 const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent(); 448 const TerminalPositionLookupTable *const terminalPositionLookupTable = 449 mBuffers->getTerminalPositionLookupTable(); 450 bool hasNext = true; 451 int readingPos = bigramListPos; 452 while (hasNext) { 453 const BigramEntry bigramEntry = 454 bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); 455 hasNext = bigramEntry.hasNext(); 456 const int word1TerminalId = bigramEntry.getTargetTerminalId(); 457 const int word1TerminalPtNodePos = 458 terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); 459 if (word1TerminalPtNodePos == NOT_A_DICT_POS) { 460 continue; 461 } 462 // Word (unigram) probability 463 int word1Probability = NOT_A_PROBABILITY; 464 const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( 465 word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, 466 &word1Probability); 467 const std::vector<int> word1(bigramWord1CodePoints, 468 bigramWord1CodePoints + codePointCount); 469 const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); 470 const int probability = bigramEntry.hasHistoricalInfo() ? 471 ForgettingCurveUtils::decodeProbability( 472 bigramEntry.getHistoricalInfo(), mHeaderPolicy) : 473 bigramEntry.getProbability(); 474 bigrams.emplace_back(&word1, probability, 475 historicalInfo->getTimeStamp(), historicalInfo->getLevel(), 476 historicalInfo->getCount()); 477 } 478 } 479 // Fetch shortcut information. 480 std::vector<UnigramProperty::ShortcutProperty> shortcuts; 481 int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); 482 if (shortcutPos != NOT_A_DICT_POS) { 483 int shortcutTarget[MAX_WORD_LENGTH]; 484 const ShortcutDictContent *const shortcutDictContent = 485 mBuffers->getShortcutDictContent(); 486 bool hasNext = true; 487 while (hasNext) { 488 int shortcutTargetLength = 0; 489 int shortcutProbability = NOT_A_PROBABILITY; 490 shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, 491 &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); 492 const std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength); 493 shortcuts.emplace_back(&target, shortcutProbability); 494 } 495 } 496 const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), 497 ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), 498 historicalInfo->getTimeStamp(), historicalInfo->getLevel(), 499 historicalInfo->getCount(), &shortcuts); 500 return WordProperty(&codePointVector, &unigramProperty, &bigrams); 501} 502 503int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, 504 int *const outCodePointCount) { 505 *outCodePointCount = 0; 506 if (token == 0) { 507 mTerminalPtNodePositionsForIteratingWords.clear(); 508 DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( 509 &mTerminalPtNodePositionsForIteratingWords); 510 DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); 511 readingHelper.initWithPtNodeArrayPos(getRootPosition()); 512 readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); 513 } 514 const int terminalPtNodePositionsVectorSize = 515 static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); 516 if (token < 0 || token >= terminalPtNodePositionsVectorSize) { 517 AKLOGE("Given token %d is invalid.", token); 518 return 0; 519 } 520 const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; 521 int unigramProbability = NOT_A_PROBABILITY; 522 *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( 523 terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); 524 const int nextToken = token + 1; 525 if (nextToken >= terminalPtNodePositionsVectorSize) { 526 // All words have been iterated. 527 mTerminalPtNodePositionsForIteratingWords.clear(); 528 return 0; 529 } 530 return nextToken; 531} 532 533} // namespace latinime 534