multi_bigram_map.cpp revision 2fa3693c264a4c150ac307d9bb7f6f8f18cc4ffc
11ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi/* 21ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Copyright (C) 2013, The Android Open Source Project 31ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * 41ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Licensed under the Apache License, Version 2.0 (the "License"); 51ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * you may not use this file except in compliance with the License. 61ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * You may obtain a copy of the License at 71ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * 81ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * http://www.apache.org/licenses/LICENSE-2.0 91ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * 101ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Unless required by applicable law or agreed to in writing, software 111ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * distributed under the License is distributed on an "AS IS" BASIS, 121ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 131ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * See the License for the specific language governing permissions and 141ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * limitations under the License. 151ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi */ 161ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 171ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi#include "suggest/core/dictionary/multi_bigram_map.h" 181ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 191ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi#include <cstddef> 201ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 211ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynaginamespace latinime { 221ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 231ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// Max number of bigram maps (previous word contexts) to be cached. Increasing this number 241ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory 251ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// usage. Also, there are diminishing returns since the most frequently used bigrams are 261ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// typically near the beginning of the input and are thus the first ones to be cached. Note 271ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// that these bigrams are reset for each new composing word. 281ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagiconst size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25; 291ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 301ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// Most common previous word contexts currently have 100 bigrams 311ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagiconst int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100; 321ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi 332fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa// Look up the bigram probability for the given word pair from the cached bigram maps. 342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa// Also caches the bigrams if there is space remaining and they have not been cached already. 352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::getBigramProbability( 362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const DictionaryStructureWithBufferPolicy *const structurePolicy, 372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const int wordPosition, const int nextWordPosition, const int unigramProbability) { 382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa hash_map_compat<int, BigramMap>::const_iterator mapPosition = 392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mBigramMaps.find(wordPosition); 402fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (mapPosition != mBigramMaps.end()) { 412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition, 422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa unigramProbability); 432fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 442fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { 452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa addBigramsForWordPosition(structurePolicy, wordPosition); 462fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return mBigramMaps[wordPosition].getBigramProbability(structurePolicy, 472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa nextWordPosition, unigramProbability); 482fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition, 502fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa nextWordPosition, unigramProbability); 512fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} 522fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 532fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasavoid MultiBigramMap::BigramMap::init( 542fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos) { 552fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos); 562fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), 572fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bigramsListPos); 582fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa while (bigramsIt.hasNext()) { 592fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bigramsIt.next(); 602fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) { 612fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa continue; 622fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 632fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability(); 642fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mBloomFilter.setInFilter(bigramsIt.getBigramPos()); 652fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 662fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} 672fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::BigramMap::getBigramProbability( 692fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const DictionaryStructureWithBufferPolicy *const structurePolicy, 702fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const int nextWordPosition, const int unigramProbability) const { 712fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa int bigramProbability = NOT_A_PROBABILITY; 722fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (mBloomFilter.isInFilter(nextWordPosition)) { 732fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const hash_map_compat<int, int>::const_iterator bigramProbabilityIt = 742fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mBigramMap.find(nextWordPosition); 752fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (bigramProbabilityIt != mBigramMap.end()) { 762fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bigramProbability = bigramProbabilityIt->second; 772fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 782fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 792fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return structurePolicy->getProbability(unigramProbability, bigramProbability); 802fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} 812fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 822fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasavoid MultiBigramMap::addBigramsForWordPosition( 832fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const DictionaryStructureWithBufferPolicy *const structurePolicy, const int position) { 842fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa mBigramMaps[position].init(structurePolicy, position); 852fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} 862fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 872fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::readBigramProbabilityFromBinaryDictionary( 882fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos, 892fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const int nextWordPosition, const int unigramProbability) { 902fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa int bigramProbability = NOT_A_PROBABILITY; 912fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos); 922fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), 932fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bigramsListPos); 942fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa while (bigramsIt.hasNext()) { 952fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bigramsIt.next(); 962fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (bigramsIt.getBigramPos() == nextWordPosition) { 972fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa bigramProbability = bigramsIt.getProbability(); 982fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa break; 992fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 1002fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 1012fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return structurePolicy->getProbability(unigramProbability, bigramProbability); 1022fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa} 1032fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 1041ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi} // namespace latinime 105