language_model_dict_content.h revision 063f86d40f2cb0d250b2166af8e1cf98ab135f8c
1dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi/*
2dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * Copyright (C) 2014, The Android Open Source Project
3dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi *
4dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License");
5dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * you may not use this file except in compliance with the License.
6dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * You may obtain a copy of the License at
7dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi *
8dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi *     http://www.apache.org/licenses/LICENSE-2.0
9dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi *
10dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software
11dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS,
12dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * See the License for the specific language governing permissions and
14dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi * limitations under the License.
15dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi */
16dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi
17dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H
18dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H
19dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi
20c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi#include <cstdio>
21063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi#include <vector>
22c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi
23dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi#include "defines.h"
2408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
2508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
2608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
27c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi#include "suggest/policyimpl/dictionary/utils/trie_map.h"
28c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi#include "utils/byte_array_view.h"
2908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include "utils/int_array_view.h"
30dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi
31dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanaginamespace latinime {
32dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi
339aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagiclass HeaderPolicy;
349aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi
3508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi/**
3608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi * Class representing language model.
3708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi *
3808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi * This class provides methods to get and store unigram/n-gram probability information and flags.
3908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi */
40dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagiclass LanguageModelDictContent {
41dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi public:
42c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi    LanguageModelDictContent(const ReadWriteByteArrayView trieMapBuffer,
43c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi            const bool hasHistoricalInfo)
4408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            : mTrieMap(trieMapBuffer), mHasHistoricalInfo(hasHistoricalInfo) {}
45c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi
4608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    explicit LanguageModelDictContent(const bool hasHistoricalInfo)
4708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            : mTrieMap(), mHasHistoricalInfo(hasHistoricalInfo) {}
4808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
4908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    bool isNearSizeLimit() const {
5008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        return mTrieMap.isNearSizeLimit();
5108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    }
52c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi
53c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi    bool save(FILE *const file) const;
54dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi
5508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
5608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            const LanguageModelDictContent *const originalContent,
5708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            int *const outNgramCount);
5808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
59851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi    ProbabilityEntry getProbabilityEntry(const int wordId) const {
60851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi        return getNgramProbabilityEntry(WordIdArrayView(), wordId);
61851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi    }
62851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi
63851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi    bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) {
64851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi        return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry);
65851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi    }
66851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi
67b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi    bool removeProbabilityEntry(const int wordId) {
68b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi        return removeNgramProbabilityEntry(WordIdArrayView(), wordId);
69b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi    }
70b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi
71851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi    ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds,
72851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi            const int wordId) const;
7308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
74851e0458fe460526b1f953e39a1e406a21ab4647Keisuke Kuroyanagi    bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId,
7508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            const ProbabilityEntry *const probabilityEntry);
7608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
77b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi    bool removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId);
78b4531d861ea740f1bf8e718f312150eb682e3f7bKeisuke Kuroyanagi
799aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi    bool updateAllProbabilityEntries(const HeaderPolicy *const headerPolicy,
809aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi            int *const outEntryCounts) {
81063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        for (int i = 0; i <= MAX_PREV_WORD_COUNT_FOR_N_GRAM; ++i) {
82063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            outEntryCounts[i] = 0;
83063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        }
849aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi        return updateAllProbabilityEntriesInner(mTrieMap.getRootBitmapEntryIndex(), 0 /* level */,
859aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi                headerPolicy, outEntryCounts);
869aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi    }
879aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi
88063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    // entryCounts should be created by updateAllProbabilityEntries.
89063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    bool truncateEntries(const int *const entryCounts, const int *const maxEntryCounts,
90063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            const HeaderPolicy *const headerPolicy);
91063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
92dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi private:
93dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi    DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent);
94c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi
95063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    class EntryInfoToTurncate {
96063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi     public:
97063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        class Comparator {
98063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi         public:
99063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            bool operator()(const EntryInfoToTurncate &left,
100063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi                    const EntryInfoToTurncate &right) const;
101063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi         private:
102063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            DISALLOW_ASSIGNMENT_OPERATOR(Comparator);
103063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        };
104063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
105063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        EntryInfoToTurncate(const int probability, const int timestamp, const int key,
106063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi                const int entryLevel, const int *const prevWordIds);
107063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
108063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        int mProbability;
109063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        int mTimestamp;
110063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        int mKey;
111063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        int mEntryLevel;
112063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
113063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
114063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi     private:
115063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi        DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate);
116063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    };
117063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi
118c4696b2eb6b25eea4d5c869683104ab99aec0421Keisuke Kuroyanagi    TrieMap mTrieMap;
11908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    const bool mHasHistoricalInfo;
12008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
12108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    bool runGCInner(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
12208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex,
12308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            int *const outNgramCount);
1249a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi    int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds);
12503dc44f543795040a092723085fac1209103b7bdKeisuke Kuroyanagi    int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const;
1269aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi    bool updateAllProbabilityEntriesInner(const int bitmapEntryIndex, const int level,
1279aa6699107de4da356b8eb89fb3ca38100e19c9dKeisuke Kuroyanagi            const HeaderPolicy *const headerPolicy, int *const outEntryCounts);
128063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy,
129063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            const int maxEntryCount, const int targetLevel);
130063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi    bool getEntryInfo(const HeaderPolicy *const headerPolicy, const int targetLevel,
131063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            const int bitmapEntryIndex, std::vector<int> *const prevWordIds,
132063f86d40f2cb0d250b2166af8e1cf98ab135f8cKeisuke Kuroyanagi            std::vector<EntryInfoToTurncate> *const outEntryInfo) const;
133dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi};
134dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi} // namespace latinime
135dc3856d7589aa3cf3dcfdee8360fa48a85983273Keisuke Kuroyanagi#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H */
136