11ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi/*
21ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Copyright (C) 2013, The Android Open Source Project
31ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi *
41ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Licensed under the Apache License, Version 2.0 (the "License");
51ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * you may not use this file except in compliance with the License.
61ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * You may obtain a copy of the License at
71ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi *
81ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi *     http://www.apache.org/licenses/LICENSE-2.0
91ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi *
101ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Unless required by applicable law or agreed to in writing, software
111ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * distributed under the License is distributed on an "AS IS" BASIS,
121ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
131ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * See the License for the specific language governing permissions and
141ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * limitations under the License.
151ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi */
161ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
171ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi#include "suggest/core/dictionary/multi_bigram_map.h"
181ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
191ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi#include <cstddef>
208ca9be17db2f1845c7c7a3b584507cf60c9ca53dKen Wakasa#include <unordered_map>
211ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
221ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynaginamespace latinime {
231ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
241ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// Max number of bigram maps (previous word contexts) to be cached. Increasing this number
251ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory
261ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// usage. Also, there are diminishing returns since the most frequently used bigrams are
271ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// typically near the beginning of the input and are thus the first ones to be cached. Note
281ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// that these bigrams are reset for each new composing word.
291ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagiconst size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25;
301ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
311ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// Most common previous word contexts currently have 100 bigrams
321ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagiconst int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100;
331ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa// Look up the bigram probability for the given word pair from the cached bigram maps.
352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa// Also caches the bigrams if there is space remaining and they have not been cached already.
362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::getBigramProbability(
372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const DictionaryStructureWithBufferPolicy *const structurePolicy,
3835c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        const int *const prevWordsPtNodePos, const int nextWordPosition,
3935c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        const int unigramProbability) {
4035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    if (!prevWordsPtNodePos || prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
4135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
4235c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    }
438ca9be17db2f1845c7c7a3b584507cf60c9ca53dKen Wakasa    std::unordered_map<int, BigramMap>::const_iterator mapPosition =
4435c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi            mBigramMaps.find(prevWordsPtNodePos[0]);
452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    if (mapPosition != mBigramMaps.end()) {
462fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition,
472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa                unigramProbability);
482fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
5035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        addBigramsForWordPosition(structurePolicy, prevWordsPtNodePos);
5135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        return mBigramMaps[prevWordsPtNodePos[0]].getBigramProbability(structurePolicy,
522fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa                nextWordPosition, unigramProbability);
532fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
5435c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordsPtNodePos,
552fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            nextWordPosition, unigramProbability);
562fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}
572fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
582fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasavoid MultiBigramMap::BigramMap::init(
5935c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        const DictionaryStructureWithBufferPolicy *const structurePolicy,
6035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        const int *const prevWordsPtNodePos) {
6135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    structurePolicy->iterateNgramEntries(prevWordsPtNodePos, this /* listener */);
622fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}
632fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
642fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::BigramMap::getBigramProbability(
652fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const DictionaryStructureWithBufferPolicy *const structurePolicy,
662fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const int nextWordPosition, const int unigramProbability) const {
672fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    int bigramProbability = NOT_A_PROBABILITY;
682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    if (mBloomFilter.isInFilter(nextWordPosition)) {
698ca9be17db2f1845c7c7a3b584507cf60c9ca53dKen Wakasa        const std::unordered_map<int, int>::const_iterator bigramProbabilityIt =
702fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa                mBigramMap.find(nextWordPosition);
712fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        if (bigramProbabilityIt != mBigramMap.end()) {
722fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            bigramProbability = bigramProbabilityIt->second;
732fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        }
742fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
752fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    return structurePolicy->getProbability(unigramProbability, bigramProbability);
762fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}
772fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
7835c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagivoid MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability,
7935c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        const int targetPtNodePos) {
8035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    if (targetPtNodePos == NOT_A_DICT_POS) {
8135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        return;
8235c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    }
8335c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    mBigramMap[targetPtNodePos] = ngramProbability;
8435c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    mBloomFilter.setInFilter(targetPtNodePos);
8535c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi}
8635c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi
872fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasavoid MultiBigramMap::addBigramsForWordPosition(
8835c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        const DictionaryStructureWithBufferPolicy *const structurePolicy,
8935c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        const int *const prevWordsPtNodePos) {
9035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    if (prevWordsPtNodePos) {
9135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        mBigramMaps[prevWordsPtNodePos[0]].init(structurePolicy, prevWordsPtNodePos);
9235c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    }
932fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}
942fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
952fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::readBigramProbabilityFromBinaryDictionary(
9635c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        const DictionaryStructureWithBufferPolicy *const structurePolicy,
9735c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        const int *const prevWordsPtNodePos, const int nextWordPosition,
9835c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        const int unigramProbability) {
9935c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    const int bigramProbability = structurePolicy->getProbabilityOfPtNode(prevWordsPtNodePos,
10035c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi            nextWordPosition);
10135c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    if (bigramProbability != NOT_A_PROBABILITY) {
10235c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi        return bigramProbability;
1032fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
10435c62b2cc99761e97f57060ad5e3cdfad926aea7Keisuke Kuroyanagi    return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
1052fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}
1062fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1071ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi} // namespace latinime
108