12fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa/*
22fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Copyright (C) 2013, The Android Open Source Project
32fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *
42fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Licensed under the Apache License, Version 2.0 (the "License");
52fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * you may not use this file except in compliance with the License.
62fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * You may obtain a copy of the License at
72fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *
82fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *     http://www.apache.org/licenses/LICENSE-2.0
92fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa *
102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Unless required by applicable law or agreed to in writing, software
112fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * distributed under the License is distributed on an "AS IS" BASIS,
122fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
132fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * See the License for the specific language governing permissions and
142fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * limitations under the License.
152fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa */
162fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
172fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#ifndef LATINIME_VER4_PATRICIA_TRIE_POLICY_H
182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#define LATINIME_VER4_PATRICIA_TRIE_POLICY_H
192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
20941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi#include <vector>
21941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi
222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "defines.h"
2388bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/header/header_policy.h"
2488bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
2588bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
2688bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
2788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_dict_buffers.h"
2888bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
2988bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
3088bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
3188bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
3288bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/buffer_with_extendable_buffer.h"
3388bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/utils/entry_counters.h"
346ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi#include "utils/int_array_view.h"
352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasanamespace latinime {
372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNode;
392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass DicNodeVector;
402fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
417d475003572c9c2902f5918bad524f4ac233e629Keisuke Kuroyanagi// Word id = Artificial id that is stored in the PtNode looked up by the word.
422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
432fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa public:
444ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi    Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
454ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi            : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()),
464ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi              mDictBuffer(mBuffers->getWritableTrieBuffer()),
474ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi              mShortcutPolicy(mBuffers->getMutableShortcutDictContent(),
484ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi                      mBuffers->getTerminalPositionLookupTable()),
49cb4f544198e0592e3e4bb96f1592bc0bd2beb6edKeisuke Kuroyanagi              mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer),
505400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi              mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader,
515400701908262c929a77141cc84567646053d032Keisuke Kuroyanagi                      &mShortcutPolicy),
522fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa              mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
532fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa              mWritingHelper(mBuffers.get()),
5478212a6d3de2c1fdaa394c58a16cbdee3ad5d046Keisuke Kuroyanagi              mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()),
55b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi              mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {};
562fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
572fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    AK_FORCE_INLINE int getRootPosition() const {
582fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return 0;
592fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
602fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
612fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    void createAndGetAllChildDicNodes(const DicNode *const dicNode,
622fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            DicNodeVector *const childDicNodes) const;
632fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
6465a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi    int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
6565a7ccfa003f898064f09c059eb236cf8f82cd89Keisuke Kuroyanagi            int *const outCodePoints) const;
662fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
6789a003b12b5e2408b908a8afed498b0425e2c1c8Keisuke Kuroyanagi    int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
69537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
70537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi            const int wordId, MultiBigramMap *const multiBigramMap) const;
719f8da0f833c7aab226ed0b93ab6c546380b068bbKeisuke Kuroyanagi
72537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    // TODO: Remove
73537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    int getProbability(const int unigramProbability, const int bigramProbability) const {
74537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi        // Not used.
75537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi        return NOT_A_PROBABILITY;
76537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    }
772fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
78537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
792fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
80537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi    void iterateNgramEntries(const WordIdArrayView prevWordIds,
81537f6eea8a8d56fe532913a37f4dbff4b3d490afKeisuke Kuroyanagi            NgramListener *const listener) const;
822d57b3339ad5b4bbf0939858c36c7daf5e38a4cbKeisuke Kuroyanagi
83ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi    BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
842fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
852fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
862fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return mHeaderPolicy;
872fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
882fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
896ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    bool addUnigramEntry(const CodePointArrayView wordCodePoints,
90793124855de9dabb9e85b1e06619716649f087c5Keisuke Kuroyanagi            const UnigramProperty *const unigramProperty);
912fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
926ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
93f12985245c962779f0b422b3072cffe533b4edfbKeisuke Kuroyanagi
9488bb28c132d87f15a52e9a0b8a45950f39eb19adKeisuke Kuroyanagi    bool addNgramEntry(const NgramProperty *const ngramProperty);
952fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
9672e2383d11cf09735b378dcedd20c9fc43da1f12Keisuke Kuroyanagi    bool removeNgramEntry(const NgramContext *const ngramContext,
976ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi            const CodePointArrayView wordCodePoints);
982fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
9972e2383d11cf09735b378dcedd20c9fc43da1f12Keisuke Kuroyanagi    bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
10029777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi            const CodePointArrayView wordCodePoints, const bool isValidWord,
10129777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi            const HistoricalInfo historicalInfo);
10229777e3a8a419c7c897637372c908566c6490e90Keisuke Kuroyanagi
103dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi    bool flush(const char *const filePath);
1042fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
105dfca51726e9dc9a35f462dee39331823eafa07c9Keisuke Kuroyanagi    bool flushWithGC(const char *const filePath);
1062fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1072fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    bool needsToRunGC(const bool mindsBlockByGC) const;
1082fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1092fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    void getProperty(const char *const query, const int queryLength, char *const outResult,
1102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            const int maxResultLength);
1112fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1126ae4d79d81aa7aea5529d95bb3eb960a273ef411Keisuke Kuroyanagi    const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
1132fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
114f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi    int getNextWordAndNextToken(const int token, int *const outCodePoints,
115f7322b166b88f72b19509d8416700d4ec8ea7753Keisuke Kuroyanagi            int *const outCodePointCount);
11638f341a2a53a04ce4195a0cb99fcb6e71203dec0Keisuke Kuroyanagi
117b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    bool isCorrupted() const {
118b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi        return mIsCorrupted;
119b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    }
120b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi
1212fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa private:
1222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy);
1232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1242fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const UNIGRAM_COUNT_QUERY;
1252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const BIGRAM_COUNT_QUERY;
1262fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const MAX_UNIGRAM_COUNT_QUERY;
1272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const char *const MAX_BIGRAM_COUNT_QUERY;
1282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    // When the dictionary size is near the maximum size, we have to refuse dynamic operations to
1292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    // prevent the dictionary from overflowing.
1302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
1312fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
1322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1334ce480d5ce2d47f607448ce439aaf2cefba1bdd8Keisuke Kuroyanagi    const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
1342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const HeaderPolicy *const mHeaderPolicy;
1352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    BufferWithExtendableBuffer *const mDictBuffer;
1362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4ShortcutListPolicy mShortcutPolicy;
1372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4PatriciaTrieNodeReader mNodeReader;
1381d6afa179cd31010efe28f1c3e17698d6be79cabKeisuke Kuroyanagi    Ver4PtNodeArrayReader mPtNodeArrayReader;
1392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4PatriciaTrieNodeWriter mNodeWriter;
1402fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    DynamicPtUpdatingHelper mUpdatingHelper;
1412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    Ver4PatriciaTrieWritingHelper mWritingHelper;
142e8750d970eed61b9239d8b2fa19648b8457696c1Keisuke Kuroyanagi    MutableEntryCounters mEntryCounters;
143941734695b9eeb59135db737e4b153c45e88247aKeisuke Kuroyanagi    std::vector<int> mTerminalPtNodePositionsForIteratingWords;
144b96012acef7c7add578b95efa17f3b6132220fd6Keisuke Kuroyanagi    mutable bool mIsCorrupted;
145847a026cd81f52f7aee0d5fd75c14a3f8619ebafKeisuke Kuroyanagi
146ac983b13a9ee97d9a1a0ecd4a12dff5b2e2552e5Keisuke Kuroyanagi    int getShortcutPositionOfWord(const int wordId) const;
1472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa};
1482fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} // namespace latinime
1492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
150