ver4_patricia_trie_policy.h revision cb4f544198e0592e3e4bb96f1592bc0bd2beb6ed
12fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa/*
22fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Copyright (C) 2013, The Android Open Source Project
32fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *
42fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Licensed under the Apache License, Version 2.0 (the "License");
52fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * you may not use this file except in compliance with the License.
62fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * You may obtain a copy of the License at
72fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *
82fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *     http://www.apache.org/licenses/LICENSE-2.0
92fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *
102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Unless required by applicable law or agreed to in writing, software
112fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * distributed under the License is distributed on an "AS IS" BASIS,
122fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
132fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * See the License for the specific language governing permissions and
142fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * limitations under the License.
152fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa */
162fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
172fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H
192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
20941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi#include <vector>
21941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi
222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "defines.h"
232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
242fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/header/header_policy.h"
252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
266e4b674f83e0c287e00bfe6546db2a1f93daf5f0Keisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
311d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h"
322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
336ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi#include "utils/int_array_view.h"
342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasanamespace latinime {
362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNode;
382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNodeVector;
392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
407d475003572c9c2902f5918bad524f4ac233e629Keisuke Kuroyanagi// Word id = Artificial id that is stored in the PtNode looked up by the word.
412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa public:
434ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi    Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
444ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi            : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()),
454ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi              mDictBuffer(mBuffers->getWritableTrieBuffer()),
464ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi              mShortcutPolicy(mBuffers->getMutableShortcutDictContent(),
474ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi                      mBuffers->getTerminalPositionLookupTable()),
48cb4f544198e0592e3e4bb96f1592bc0bd2beb6edKeisuke Kuroyanagi              mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer),
4957816c7a8bac1a47913da7a503ece2b5dd7cc0fcKeisuke Kuroyanagi              mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader,
50fe395232d69df0887863c1cbabe63def2586d29eKeisuke Kuroyanagi                      &mPtNodeArrayReader, &mShortcutPolicy),
512fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa              mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
522fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa              mWritingHelper(mBuffers.get()),
532fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa              mUnigramCount(mHeaderPolicy->getUnigramCount()),
54941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi              mBigramCount(mHeaderPolicy->getBigramCount()),
55b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi              mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {};
562fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
572fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    AK_FORCE_INLINE int getRootPosition() const {
582fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return 0;
592fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
602fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
612fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    void createAndGetAllChildDicNodes(const DicNode *const dicNode,
622fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            DicNodeVector *const childDicNodes) const;
632fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
6465a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi    int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
6565a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi            int *const outCodePoints) const;
662fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
6789a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi    int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
69537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
70537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi            const int wordId, MultiBigramMap *const multiBigramMap) const;
719f8da0f833c7aab226ed0b93ab6c546380b068bbKeisuke Kuroyanagi
72537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    // TODO: Remove
73537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    int getProbability(const int unigramProbability, const int bigramProbability) const {
74537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi        // Not used.
75537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi        return NOT_A_PROBABILITY;
76537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    }
772fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
78537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
792fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
80537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    void iterateNgramEntries(const WordIdArrayView prevWordIds,
81537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi            NgramListener *const listener) const;
822d57b3339ad5b4bbf0939858c36c7daf5e38a4cbKeisuke Kuroyanagi
83ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi    BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
842fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
852fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
862fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return mHeaderPolicy;
872fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
882fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
896ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    bool addUnigramEntry(const CodePointArrayView wordCodePoints,
90793124855de9dabb9e85b1e06619716649f087c5Keisuke Kuroyanagi            const UnigramProperty *const unigramProperty);
912fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
926ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
93f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi
949f8c9a0161924f515c5ff9617db2317cdc1d01e2Keisuke Kuroyanagi    bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
95620a05ae59ec9f7be39557094fc306c51c712ca1Keisuke Kuroyanagi            const BigramProperty *const bigramProperty);
962fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
976ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
986ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi            const CodePointArrayView wordCodePoints);
992fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
100dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi    bool flush(const char *const filePath);
1012fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
102dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi    bool flushWithGC(const char *const filePath);
1032fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1042fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    bool needsToRunGC(const bool mindsBlockByGC) const;
1052fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1062fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    void getProperty(const char *const query, const int queryLength, char *const outResult,
1072fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            const int maxResultLength);
1082fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1096ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
1102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
111f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi    int getNextWordAndNextToken(const int token, int *const outCodePoints,
112f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi            int *const outCodePointCount);
11338f341a2a53a04ce4195a0cb99fcb6e71203dec0Keisuke Kuroyanagi
114b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    bool isCorrupted() const {
115b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi        return mIsCorrupted;
116b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    }
117b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi
1182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa private:
1192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy);
1202fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1212fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const UNIGRAM_COUNT_QUERY;
1222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const BIGRAM_COUNT_QUERY;
1232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const MAX_UNIGRAM_COUNT_QUERY;
1242fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const MAX_BIGRAM_COUNT_QUERY;
1252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    // When the dictionary size is near the maximum size, we have to refuse dynamic operations to
1262fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    // prevent the dictionary from overflowing.
1272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
1282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
1292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1304ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi    const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
1312fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const HeaderPolicy *const mHeaderPolicy;
1322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    BufferWithExtendableBuffer *const mDictBuffer;
1332fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4ShortcutListPolicy mShortcutPolicy;
1342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4PatriciaTrieNodeReader mNodeReader;
1351d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi    Ver4PtNodeArrayReader mPtNodeArrayReader;
1362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4PatriciaTrieNodeWriter mNodeWriter;
1372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    DynamicPtUpdatingHelper mUpdatingHelper;
1382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4PatriciaTrieWritingHelper mWritingHelper;
1392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    int mUnigramCount;
1402fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    int mBigramCount;
141941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi    std::vector<int> mTerminalPtNodePositionsForIteratingWords;
142b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    mutable bool mIsCorrupted;
143847a026cd81f52f7aee0d5fd75c14a3f8619ebafKeisuke Kuroyanagi
144ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi    int getShortcutPositionOfWord(const int wordId) const;
1452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa};
1462fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} // namespace latinime
1472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
148