12fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa/* 22fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Copyright (C) 2013, The Android Open Source Project 32fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 42fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Licensed under the Apache License, Version 2.0 (the "License"); 52fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * you may not use this file except in compliance with the License. 62fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * You may obtain a copy of the License at 72fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 82fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * http://www.apache.org/licenses/LICENSE-2.0 92fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 102fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Unless required by applicable law or agreed to in writing, software 112fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * distributed under the License is distributed on an "AS IS" BASIS, 122fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 132fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * See the License for the specific language governing permissions and 142fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * limitations under the License. 152fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa */ 162fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 172fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#ifndef LATINIME_PROBABILITY_ENTRY_H 182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#define LATINIME_PROBABILITY_ENTRY_H 192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 2008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include <climits> 2108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi#include <cstdint> 2208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#include "defines.h" 2488bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/property/historical_info.h" 2588bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/property/ngram_property.h" 2688bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/property/unigram_property.h" 2788bc312ad34321fb3e81be2dc939a889d065f4a7Keisuke Kuroyanagi#include "dictionary/structure/v4/ver4_dict_constants.h" 282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasanamespace latinime { 302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 312fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaclass ProbabilityEntry { 322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa public: 332fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa ProbabilityEntry(const ProbabilityEntry &probabilityEntry) 342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa : mFlags(probabilityEntry.mFlags), mProbability(probabilityEntry.mProbability), 352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mHistoricalInfo(probabilityEntry.mHistoricalInfo) {} 362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // Dummy entry 382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa ProbabilityEntry() 394926b90ec530ba1e247b7a0f6edd719b2b01870bKeisuke Kuroyanagi : mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY), 404926b90ec530ba1e247b7a0f6edd719b2b01870bKeisuke Kuroyanagi mHistoricalInfo() {} 412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // Entry without historical information 432fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa ProbabilityEntry(const int flags, const int probability) 442fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa : mFlags(flags), mProbability(probability), mHistoricalInfo() {} 452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 462fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // Entry with historical information. 47623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi ProbabilityEntry(const int flags, const HistoricalInfo *const historicalInfo) 48623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi : mFlags(flags), mProbability(NOT_A_PROBABILITY), mHistoricalInfo(*historicalInfo) {} 492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 509a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi // Create from unigram property. 519a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi ProbabilityEntry(const UnigramProperty *const unigramProperty) 527d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi : mFlags(createFlags(unigramProperty->representsBeginningOfSentence(), 537d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), 547d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi unigramProperty->isPossiblyOffensive())), 55623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi mProbability(unigramProperty->getProbability()), 56287e155e44b4e937f2a62d010805702bc813c43bKeisuke Kuroyanagi mHistoricalInfo(unigramProperty->getHistoricalInfo()) {} 579a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi 5879bb37d499ed6fcabe981153d5ff0b5b69509933Keisuke Kuroyanagi // Create from ngram property. 599a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi // TODO: Set flags. 6079bb37d499ed6fcabe981153d5ff0b5b69509933Keisuke Kuroyanagi ProbabilityEntry(const NgramProperty *const ngramProperty) 6179bb37d499ed6fcabe981153d5ff0b5b69509933Keisuke Kuroyanagi : mFlags(0), mProbability(ngramProperty->getProbability()), 62287e155e44b4e937f2a62d010805702bc813c43bKeisuke Kuroyanagi mHistoricalInfo(ngramProperty->getHistoricalInfo()) {} 639a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi 649a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi bool isValid() const { 654926b90ec530ba1e247b7a0f6edd719b2b01870bKeisuke Kuroyanagi return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0; 669a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi } 679a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi 682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bool hasHistoricalInfo() const { 692fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return mHistoricalInfo.isValid(); 702fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 712fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 72623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi uint8_t getFlags() const { 732fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return mFlags; 742fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 752fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 762fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa int getProbability() const { 772fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return mProbability; 782fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 792fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 802fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const HistoricalInfo *getHistoricalInfo() const { 812fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return &mHistoricalInfo; 822fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 832fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 84623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi bool representsBeginningOfSentence() const { 85623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0; 86623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi } 87623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi 887d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi bool isNotAWord() const { 897d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi return (mFlags & Ver4DictConstants::FLAG_NOT_A_WORD) != 0; 907d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi } 917d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi 927d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi bool isBlacklisted() const { 937d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi return (mFlags & Ver4DictConstants::FLAG_BLACKLISTED) != 0; 947d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi } 957d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi 967d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi bool isPossiblyOffensive() const { 977d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi return (mFlags & Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE) != 0; 987d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi } 997d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi 10008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi uint64_t encode(const bool hasHistoricalInfo) const { 10172d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi uint64_t encodedEntry = static_cast<uint8_t>(mFlags); 10208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi if (hasHistoricalInfo) { 10308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi encodedEntry = (encodedEntry << (Ver4DictConstants::TIME_STAMP_FIELD_SIZE * CHAR_BIT)) 10472d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi | static_cast<uint32_t>(mHistoricalInfo.getTimestamp()); 10508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_LEVEL_FIELD_SIZE * CHAR_BIT)) 10672d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi | static_cast<uint8_t>(mHistoricalInfo.getLevel()); 10708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) 1082383575d2d695efcca093e69ed2daa88aec58862Keisuke Kuroyanagi | static_cast<uint16_t>(mHistoricalInfo.getCount()); 10908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } else { 11008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi encodedEntry = (encodedEntry << (Ver4DictConstants::PROBABILITY_SIZE * CHAR_BIT)) 11172d17d920914c7846c3bc498554696aab6e0e5c5Keisuke Kuroyanagi | static_cast<uint8_t>(mProbability); 11208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 11308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return encodedEntry; 11408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 11508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 11608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi static ProbabilityEntry decode(const uint64_t encodedEntry, const bool hasHistoricalInfo) { 11708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi if (hasHistoricalInfo) { 11808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const int flags = readFromEncodedEntry(encodedEntry, 1199a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE, 12008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi Ver4DictConstants::TIME_STAMP_FIELD_SIZE 12108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE 12208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); 12308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const int timestamp = readFromEncodedEntry(encodedEntry, 12408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi Ver4DictConstants::TIME_STAMP_FIELD_SIZE, 12508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi Ver4DictConstants::WORD_LEVEL_FIELD_SIZE 12608894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); 12708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const int level = readFromEncodedEntry(encodedEntry, 12808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, 12908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi Ver4DictConstants::WORD_COUNT_FIELD_SIZE); 13008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const int count = readFromEncodedEntry(encodedEntry, 13108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi Ver4DictConstants::WORD_COUNT_FIELD_SIZE, 0 /* pos */); 13208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const HistoricalInfo historicalInfo(timestamp, level, count); 133623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi return ProbabilityEntry(flags, &historicalInfo); 13408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } else { 13508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const int flags = readFromEncodedEntry(encodedEntry, 1369a23f0fba25137760a60e9bfaf6bf20a5889648cKeisuke Kuroyanagi Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE, 13708894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi Ver4DictConstants::PROBABILITY_SIZE); 13808894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi const int probability = readFromEncodedEntry(encodedEntry, 13908894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi Ver4DictConstants::PROBABILITY_SIZE, 0 /* pos */); 14008894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return ProbabilityEntry(flags, probability); 14108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 14208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 14308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 1442fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa private: 1452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // Copy constructor is public to use this class as a type of return value. 1462fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry); 1472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 148623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi const uint8_t mFlags; 1492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const int mProbability; 1502fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const HistoricalInfo mHistoricalInfo; 15108894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi 15208894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi static int readFromEncodedEntry(const uint64_t encodedEntry, const int size, const int pos) { 15308894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi return static_cast<int>( 15408894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi (encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1)); 15508894842662eff666a713a7f4deb79204a322f8cKeisuke Kuroyanagi } 156623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi 1577d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi static uint8_t createFlags(const bool representsBeginningOfSentence, 1587d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi const bool isNotAWord, const bool isBlacklisted, const bool isPossiblyOffensive) { 159623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi uint8_t flags = 0; 160623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi if (representsBeginningOfSentence) { 1617d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi flags |= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; 1627d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi } 1637d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi if (isNotAWord) { 1647d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi flags |= Ver4DictConstants::FLAG_NOT_A_WORD; 1657d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi } 1667d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi if (isBlacklisted) { 1677d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi flags |= Ver4DictConstants::FLAG_BLACKLISTED; 1687d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi } 1697d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi if (isPossiblyOffensive) { 1707d911d6f91af56586fbca40672bfb77b494ee871Keisuke Kuroyanagi flags |= Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE; 171623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi } 172623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi return flags; 173623067a183caf62fbe33223675430a246b5ae13dKeisuke Kuroyanagi } 1742fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}; 1752fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} // namespace latinime 1762fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa#endif /* LATINIME_PROBABILITY_ENTRY_H */ 177