1/*
2 * Copyright (C) 2013, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef LATINIME_HEADER_POLICY_H
18#define LATINIME_HEADER_POLICY_H
19
20#include <cstdint>
21
22#include "defines.h"
23#include "dictionary/header/header_read_write_utils.h"
24#include "dictionary/interface/dictionary_header_structure_policy.h"
25#include "dictionary/utils/entry_counters.h"
26#include "dictionary/utils/format_utils.h"
27#include "utils/char_utils.h"
28#include "utils/time_keeper.h"
29
30namespace latinime {
31
32class HeaderPolicy : public DictionaryHeaderStructurePolicy {
33 public:
34    // Reads information from existing dictionary buffer.
35    HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion)
36            : mDictFormatVersion(formatVersion),
37              mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)),
38              mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
39              mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
40              mLocale(readLocale()),
41              mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
42              mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
43              mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
44                      IS_DECAYING_DICT_KEY, false /* defaultValue */)),
45              mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
46                      DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
47              mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
48                      LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
49              mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
50              mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
51                      EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
52              mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
53                      &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
54              mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
55                      &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
56                      DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
57              mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
58
59    // Constructs header information using an attribute map.
60    HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
61            const std::vector<int> &locale,
62            const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap)
63            : mDictFormatVersion(dictFormatVersion),
64              mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
65                      attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale),
66              mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
67              mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
68              mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
69                      IS_DECAYING_DICT_KEY, false /* defaultValue */)),
70              mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
71                      DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
72              mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
73                      DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
74              mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
75              mExtendedRegionSize(0),
76              mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
77                      &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
78              mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
79                      &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
80                      DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
81              mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
82
83    // Copy header information
84    HeaderPolicy(const HeaderPolicy *const headerPolicy)
85            : mDictFormatVersion(headerPolicy->mDictFormatVersion),
86              mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize),
87              mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale),
88              mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier),
89              mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing),
90              mIsDecayingDict(headerPolicy->mIsDecayingDict),
91              mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime),
92              mNgramCounts(headerPolicy->mNgramCounts),
93              mMaxNgramCounts(headerPolicy->mMaxNgramCounts),
94              mExtendedRegionSize(headerPolicy->mExtendedRegionSize),
95              mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords),
96              mForgettingCurveProbabilityValuesTableId(
97                      headerPolicy->mForgettingCurveProbabilityValuesTableId),
98              mCodePointTable(headerPolicy->mCodePointTable) {}
99
100    // Temporary dummy header.
101    HeaderPolicy()
102            : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0),
103              mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f),
104              mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
105              mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(),
106              mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
107              mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {}
108
109    ~HeaderPolicy() {}
110
111    virtual int getFormatVersionNumber() const {
112        // Conceptually this converts the symbolic value we use in the code into the
113        // hardcoded of the bytes in the file. But we want the constants to be the
114        // same so we use them for both here.
115        switch (mDictFormatVersion) {
116            case FormatUtils::VERSION_2:
117            case FormatUtils::VERSION_201:
118                AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
119                return FormatUtils::UNKNOWN_VERSION;
120            case FormatUtils::VERSION_202:
121                return FormatUtils::VERSION_202;
122            case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
123                return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
124            case FormatUtils::VERSION_402:
125                return FormatUtils::VERSION_402;
126            case FormatUtils::VERSION_403:
127                return FormatUtils::VERSION_403;
128            default:
129                return FormatUtils::UNKNOWN_VERSION;
130        }
131    }
132
133    AK_FORCE_INLINE bool isValid() const {
134        // Decaying dictionary must have historical information.
135        if (!mIsDecayingDict) {
136            return true;
137        }
138        if (mHasHistoricalInfoOfWords) {
139            return true;
140        } else {
141            return false;
142        }
143    }
144
145    AK_FORCE_INLINE int getSize() const {
146        return mSize;
147    }
148
149    AK_FORCE_INLINE float getMultiWordCostMultiplier() const {
150        return mMultiWordCostMultiplier;
151    }
152
153    AK_FORCE_INLINE bool isDecayingDict() const {
154        return mIsDecayingDict;
155    }
156
157    AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const {
158        return mRequiresGermanUmlautProcessing;
159    }
160
161    AK_FORCE_INLINE int getDate() const {
162        return mDate;
163    }
164
165    AK_FORCE_INLINE int getLastDecayedTime() const {
166        return mLastDecayedTime;
167    }
168
169    AK_FORCE_INLINE const EntryCounts &getNgramCounts() const {
170        return mNgramCounts;
171    }
172
173    AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const {
174        return mMaxNgramCounts;
175    }
176
177    AK_FORCE_INLINE int getExtendedRegionSize() const {
178        return mExtendedRegionSize;
179    }
180
181    AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const {
182        return mHasHistoricalInfoOfWords;
183    }
184
185    AK_FORCE_INLINE bool shouldBoostExactMatches() const {
186        // TODO: Investigate better ways to handle exact matches for personalized dictionaries.
187        return !isDecayingDict();
188    }
189
190    const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const {
191        return &mAttributeMap;
192    }
193
194    AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const {
195        return mForgettingCurveProbabilityValuesTableId;
196    }
197
198    void readHeaderValueOrQuestionMark(const char *const key,
199            int *outValue, int outValueSize) const;
200
201    bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
202            const EntryCounts &entryCounts, const int extendedRegionSize,
203            BufferWithExtendableBuffer *const outBuffer) const;
204
205    void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts,
206            const int extendedRegionSize,
207            DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const;
208
209    AK_FORCE_INLINE const std::vector<int> *getLocale() const {
210        return &mLocale;
211    }
212
213    bool supportsBeginningOfSentence() const {
214        return mDictFormatVersion >= FormatUtils::VERSION_402;
215    }
216
217    const int *getCodePointTable() const {
218        return mCodePointTable;
219    }
220
221 private:
222    DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
223
224    static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
225    static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
226    static const char *const IS_DECAYING_DICT_KEY;
227    static const char *const DATE_KEY;
228    static const char *const LAST_DECAYED_TIME_KEY;
229    static const char *const NGRAM_COUNT_KEYS[];
230    static const char *const MAX_NGRAM_COUNT_KEYS[];
231    static const int DEFAULT_MAX_NGRAM_COUNTS[];
232    static const char *const EXTENDED_REGION_SIZE_KEY;
233    static const char *const HAS_HISTORICAL_INFO_KEY;
234    static const char *const LOCALE_KEY;
235    static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
236    static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
237    static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
238    static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
239    static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
240    static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
241
242    const FormatUtils::FORMAT_VERSION mDictFormatVersion;
243    const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
244    const int mSize;
245    DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap;
246    const std::vector<int> mLocale;
247    const float mMultiWordCostMultiplier;
248    const bool mRequiresGermanUmlautProcessing;
249    const bool mIsDecayingDict;
250    const int mDate;
251    const int mLastDecayedTime;
252    const EntryCounts mNgramCounts;
253    const EntryCounts mMaxNgramCounts;
254    const int mExtendedRegionSize;
255    const bool mHasHistoricalInfoOfWords;
256    const int mForgettingCurveProbabilityValuesTableId;
257    const int *const mCodePointTable;
258
259    const std::vector<int> readLocale() const;
260    float readMultipleWordCostMultiplier() const;
261    bool readRequiresGermanUmlautProcessing() const;
262    const EntryCounts readNgramCounts() const;
263    const EntryCounts readMaxNgramCounts() const;
264    static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes(
265            const uint8_t *const dictBuf);
266};
267} // namespace latinime
268#endif /* LATINIME_HEADER_POLICY_H */
269