/*
 * Copyright (C) 2013, The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
162fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
172fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#ifndef LATINIME_PROBABILITY_ENTRY_H
182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#define LATINIME_PROBABILITY_ENTRY_H
192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
2008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include <climits>
2108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include <cstdint>
2208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "defines.h"
2488bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/property/historical_info.h"
2588bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/property/ngram_property.h"
2688bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/property/unigram_property.h"
2788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_dict_constants.h"
282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasanamespace latinime {
302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
312fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass ProbabilityEntry {
322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa public:
332fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    ProbabilityEntry(const ProbabilityEntry &probabilityEntry)
342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability),
352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa              mHistoricalInfo(probabilityEntry.mHistoricalInfo) {}
362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    // Dummy entry
382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    ProbabilityEntry()
394926b90ec530ba1e247b7a0f6edd719b2b01870bKeisuke Kuroyanagi            : mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY),
404926b90ec530ba1e247b7a0f6edd719b2b01870bKeisuke Kuroyanagi              mHistoricalInfo() {}
412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    // Entry without historical information
432fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    ProbabilityEntry(const int flags, const int probability)
442fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            : mFlags(flags), mProbability(probability), mHistoricalInfo() {}
452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
462fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    // Entry with historical information.
47623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi    ProbabilityEntry(const int flags, const HistoricalInfo *const historicalInfo)
48623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi            : mFlags(flags), mProbability(NOT_A_PROBABILITY), mHistoricalInfo(*historicalInfo) {}
492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
509a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi    // Create from unigram property.
519a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi    ProbabilityEntry(const UnigramProperty *const unigramProperty)
527d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi            : mFlags(createFlags(unigramProperty->representsBeginningOfSentence(),
537d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi                    unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
547d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi                    unigramProperty->isPossiblyOffensive())),
55623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi              mProbability(unigramProperty->getProbability()),
56287e155e44b4e937f2a62d010805702bc813c43bKeisuke Kuroyanagi              mHistoricalInfo(unigramProperty->getHistoricalInfo()) {}
579a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi
5879bb37d499ed6fcabe981153d5ff0b5b69509933Keisuke Kuroyanagi    // Create from ngram property.
599a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi    // TODO: Set flags.
6079bb37d499ed6fcabe981153d5ff0b5b69509933Keisuke Kuroyanagi    ProbabilityEntry(const NgramProperty *const ngramProperty)
6179bb37d499ed6fcabe981153d5ff0b5b69509933Keisuke Kuroyanagi            : mFlags(0), mProbability(ngramProperty->getProbability()),
62287e155e44b4e937f2a62d010805702bc813c43bKeisuke Kuroyanagi              mHistoricalInfo(ngramProperty->getHistoricalInfo()) {}
639a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi
649a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi    bool isValid() const {
654926b90ec530ba1e247b7a0f6edd719b2b01870bKeisuke Kuroyanagi        return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0;
669a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi    }
679a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi
682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    bool hasHistoricalInfo() const {
692fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return mHistoricalInfo.isValid();
702fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
712fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
72623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi    uint8_t getFlags() const {
732fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return mFlags;
742fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
752fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
762fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    int getProbability() const {
772fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return mProbability;
782fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
792fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
802fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const HistoricalInfo *getHistoricalInfo() const {
812fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return &mHistoricalInfo;
822fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
832fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
84623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi    bool representsBeginningOfSentence() const {
85623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi        return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0;
86623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi    }
87623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi
887d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi    bool isNotAWord() const {
897d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        return (mFlags & Ver4DictConstants::FLAG_NOT_A_WORD) != 0;
907d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi    }
917d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi
927d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi    bool isBlacklisted() const {
937d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        return (mFlags & Ver4DictConstants::FLAG_BLACKLISTED) != 0;
947d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi    }
957d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi
967d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi    bool isPossiblyOffensive() const {
977d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        return (mFlags & Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE) != 0;
987d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi    }
997d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi
10008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    uint64_t encode(const bool hasHistoricalInfo) const {
10172d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi        uint64_t encodedEntry = static_cast<uint8_t>(mFlags);
10208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        if (hasHistoricalInfo) {
10308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            encodedEntry = (encodedEntry << (Ver4DictConstants::TIME_STAMP_FIELD_SIZE * CHAR_BIT))
10472d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi                    | static_cast<uint32_t>(mHistoricalInfo.getTimestamp());
10508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_LEVEL_FIELD_SIZE * CHAR_BIT))
10672d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi                    | static_cast<uint8_t>(mHistoricalInfo.getLevel());
10708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT))
1082383575d2d695efcca093e69ed2daa88aec58862Keisuke Kuroyanagi                    | static_cast<uint16_t>(mHistoricalInfo.getCount());
10908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        } else {
11008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            encodedEntry = (encodedEntry << (Ver4DictConstants::PROBABILITY_SIZE * CHAR_BIT))
11172d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi                    | static_cast<uint8_t>(mProbability);
11208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        }
11308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        return encodedEntry;
11408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    }
11508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
11608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    static ProbabilityEntry decode(const uint64_t encodedEntry, const bool hasHistoricalInfo) {
11708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        if (hasHistoricalInfo) {
11808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            const int flags = readFromEncodedEntry(encodedEntry,
1199a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi                    Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE,
12008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                    Ver4DictConstants::TIME_STAMP_FIELD_SIZE
12108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                            + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
12208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                            + Ver4DictConstants::WORD_COUNT_FIELD_SIZE);
12308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            const int timestamp = readFromEncodedEntry(encodedEntry,
12408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                    Ver4DictConstants::TIME_STAMP_FIELD_SIZE,
12508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                    Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
12608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                            + Ver4DictConstants::WORD_COUNT_FIELD_SIZE);
12708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            const int level = readFromEncodedEntry(encodedEntry,
12808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                    Ver4DictConstants::WORD_LEVEL_FIELD_SIZE,
12908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                    Ver4DictConstants::WORD_COUNT_FIELD_SIZE);
13008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            const int count = readFromEncodedEntry(encodedEntry,
13108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                    Ver4DictConstants::WORD_COUNT_FIELD_SIZE, 0 /* pos */);
13208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            const HistoricalInfo historicalInfo(timestamp, level, count);
133623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi            return ProbabilityEntry(flags, &historicalInfo);
13408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        } else {
13508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            const int flags = readFromEncodedEntry(encodedEntry,
1369a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi                    Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE,
13708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                    Ver4DictConstants::PROBABILITY_SIZE);
13808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            const int probability = readFromEncodedEntry(encodedEntry,
13908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                    Ver4DictConstants::PROBABILITY_SIZE, 0 /* pos */);
14008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi            return ProbabilityEntry(flags, probability);
14108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        }
14208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    }
14308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
1442fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa private:
1452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    // Copy constructor is public to use this class as a type of return value.
1462fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry);
1472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
148623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi    const uint8_t mFlags;
1492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const int mProbability;
1502fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const HistoricalInfo mHistoricalInfo;
15108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi
15208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    static int readFromEncodedEntry(const uint64_t encodedEntry, const int size, const int pos) {
15308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi        return static_cast<int>(
15408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi                (encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1));
15508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi    }
156623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi
1577d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi    static uint8_t createFlags(const bool representsBeginningOfSentence,
1587d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi            const bool isNotAWord, const bool isBlacklisted, const bool isPossiblyOffensive) {
159623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi        uint8_t flags = 0;
160623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi        if (representsBeginningOfSentence) {
1617d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi            flags |= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
1627d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        }
1637d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        if (isNotAWord) {
1647d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi            flags |= Ver4DictConstants::FLAG_NOT_A_WORD;
1657d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        }
1667d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        if (isBlacklisted) {
1677d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi            flags |= Ver4DictConstants::FLAG_BLACKLISTED;
1687d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        }
1697d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi        if (isPossiblyOffensive) {
1707d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi            flags |= Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE;
171623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi        }
172623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi        return flags;
173623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi    }
1742fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa};
1752fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} // namespace latinime
1762fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#endif /* LATINIME_PROBABILITY_ENTRY_H */
177