11ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi/* 21ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Copyright (C) 2013, The Android Open Source Project 31ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * 41ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Licensed under the Apache License, Version 2.0 (the "License"); 51ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * you may not use this file except in compliance with the License. 61ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * You may obtain a copy of the License at 71ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * 81ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * http://www.apache.org/licenses/LICENSE-2.0 91ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * 101ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Unless required by applicable law or agreed to in writing, software 111ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * distributed under the License is distributed on an "AS IS" BASIS, 121ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 131ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * See the License for the specific language governing permissions and 141ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * limitations under the License. 151ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi */ 161ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 171ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi#include "suggest/core/dictionary/multi_bigram_map.h" 181ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 191ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi#include <cstddef> 208ca9be17db2f1845c7c7a3b584507cf60c9ca53dKen Wakasa#include <unordered_map> 211ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 221ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynaginamespace latinime { 231ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 241ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// Max number of bigram maps (previous word contexts) to be cached. Increasing this number 251ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory 261ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// usage. Also, there are diminishing returns since the most frequently used bigrams are 271ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// typically near the beginning of the input and are thus the first ones to be cached. Note 281ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// that these bigrams are reset for each new composing word. 291ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagiconst size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25; 301ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 311ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// Most common previous word contexts currently have 100 bigrams 321ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagiconst int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100; 331ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa// Look up the bigram probability for the given word pair from the cached bigram maps. 352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa// Also caches the bigrams if there is space remaining and they have not been cached already. 362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::getBigramProbability( 372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const DictionaryStructureWithBufferPolicy *const structurePolicy, 3835c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const int *const prevWordsPtNodePos, const int nextWordPosition, 3935c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const int unigramProbability) { 4035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi if (!prevWordsPtNodePos || prevWordsPtNodePos[0] == NOT_A_DICT_POS) { 4135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); 4235c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi } 438ca9be17db2f1845c7c7a3b584507cf60c9ca53dKen Wakasa std::unordered_map<int, BigramMap>::const_iterator mapPosition = 4435c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi mBigramMaps.find(prevWordsPtNodePos[0]); 452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (mapPosition != mBigramMaps.end()) { 462fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition, 472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa unigramProbability); 482fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { 5035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi addBigramsForWordPosition(structurePolicy, prevWordsPtNodePos); 5135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi return mBigramMaps[prevWordsPtNodePos[0]].getBigramProbability(structurePolicy, 522fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa nextWordPosition, unigramProbability); 532fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 5435c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordsPtNodePos, 552fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa nextWordPosition, unigramProbability); 562fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} 572fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 582fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasavoid MultiBigramMap::BigramMap::init( 5935c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const structurePolicy, 6035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const int *const prevWordsPtNodePos) { 6135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi structurePolicy->iterateNgramEntries(prevWordsPtNodePos, this /* listener */); 622fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} 632fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 642fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::BigramMap::getBigramProbability( 652fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const DictionaryStructureWithBufferPolicy *const structurePolicy, 662fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const int nextWordPosition, const int unigramProbability) const { 672fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa int bigramProbability = NOT_A_PROBABILITY; 682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (mBloomFilter.isInFilter(nextWordPosition)) { 698ca9be17db2f1845c7c7a3b584507cf60c9ca53dKen Wakasa const std::unordered_map<int, int>::const_iterator bigramProbabilityIt = 702fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mBigramMap.find(nextWordPosition); 712fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (bigramProbabilityIt != mBigramMap.end()) { 722fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bigramProbability = bigramProbabilityIt->second; 732fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 742fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 752fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return structurePolicy->getProbability(unigramProbability, bigramProbability); 762fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} 772fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 7835c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagivoid MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, 7935c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const int targetPtNodePos) { 8035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi if (targetPtNodePos == NOT_A_DICT_POS) { 8135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi return; 8235c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi } 8335c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi mBigramMap[targetPtNodePos] = ngramProbability; 8435c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi mBloomFilter.setInFilter(targetPtNodePos); 8535c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi} 8635c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi 872fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasavoid MultiBigramMap::addBigramsForWordPosition( 8835c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const structurePolicy, 8935c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const int *const prevWordsPtNodePos) { 9035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi if (prevWordsPtNodePos) { 9135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi mBigramMaps[prevWordsPtNodePos[0]].init(structurePolicy, prevWordsPtNodePos); 9235c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi } 932fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} 942fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 952fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::readBigramProbabilityFromBinaryDictionary( 9635c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const structurePolicy, 9735c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const int *const prevWordsPtNodePos, const int nextWordPosition, 9835c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const int unigramProbability) { 9935c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi const int bigramProbability = structurePolicy->getProbabilityOfPtNode(prevWordsPtNodePos, 10035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi nextWordPosition); 10135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi if (bigramProbability != NOT_A_PROBABILITY) { 10235c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi return bigramProbability; 1032fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 10435c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); 1052fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} 1062fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1071ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi} // namespace latinime 108