multi_bigram_map.cpp revision 2fa3693c264a4c150ac307d9bb7f6f8f18cc4ffc
11ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi/*
21ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Copyright (C) 2013, The Android Open Source Project
31ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi *
41ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Licensed under the Apache License, Version 2.0 (the "License");
51ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * you may not use this file except in compliance with the License.
61ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * You may obtain a copy of the License at
71ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi *
81ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi *     http://www.apache.org/licenses/LICENSE-2.0
91ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi *
101ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * Unless required by applicable law or agreed to in writing, software
111ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * distributed under the License is distributed on an "AS IS" BASIS,
121ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
131ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * See the License for the specific language governing permissions and
141ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi * limitations under the License.
151ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi */
161ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
171ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi#include "suggest/core/dictionary/multi_bigram_map.h"
181ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
191ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi#include <cstddef>
201ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
211ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynaginamespace latinime {
221ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
231ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// Max number of bigram maps (previous word contexts) to be cached. Increasing this number
241ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory
251ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// usage. Also, there are diminishing returns since the most frequently used bigrams are
261ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// typically near the beginning of the input and are thus the first ones to be cached. Note
271ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// that these bigrams are reset for each new composing word.
281ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagiconst size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25;
291ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
301ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi// Most common previous word contexts currently have 100 bigrams
311ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagiconst int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100;
321ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi
332fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa// Look up the bigram probability for the given word pair from the cached bigram maps.
342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa// Also caches the bigrams if there is space remaining and they have not been cached already.
352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::getBigramProbability(
362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const DictionaryStructureWithBufferPolicy *const structurePolicy,
372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const int wordPosition, const int nextWordPosition, const int unigramProbability) {
382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    hash_map_compat<int, BigramMap>::const_iterator mapPosition =
392fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            mBigramMaps.find(wordPosition);
402fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    if (mapPosition != mBigramMaps.end()) {
412fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition,
422fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa                unigramProbability);
432fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
442fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
452fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        addBigramsForWordPosition(structurePolicy, wordPosition);
462fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return mBigramMaps[wordPosition].getBigramProbability(structurePolicy,
472fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa                nextWordPosition, unigramProbability);
482fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
492fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition,
502fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            nextWordPosition, unigramProbability);
512fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}
522fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
532fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasavoid MultiBigramMap::BigramMap::init(
542fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos) {
552fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos);
562fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
572fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            bigramsListPos);
582fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    while (bigramsIt.hasNext()) {
592fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        bigramsIt.next();
602fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) {
612fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            continue;
622fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        }
632fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability();
642fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        mBloomFilter.setInFilter(bigramsIt.getBigramPos());
652fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
662fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}
672fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
682fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::BigramMap::getBigramProbability(
692fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const DictionaryStructureWithBufferPolicy *const structurePolicy,
702fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const int nextWordPosition, const int unigramProbability) const {
712fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    int bigramProbability = NOT_A_PROBABILITY;
722fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    if (mBloomFilter.isInFilter(nextWordPosition)) {
732fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const hash_map_compat<int, int>::const_iterator bigramProbabilityIt =
742fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa                mBigramMap.find(nextWordPosition);
752fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        if (bigramProbabilityIt != mBigramMap.end()) {
762fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            bigramProbability = bigramProbabilityIt->second;
772fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        }
782fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
792fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    return structurePolicy->getProbability(unigramProbability, bigramProbability);
802fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}
812fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
822fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasavoid MultiBigramMap::addBigramsForWordPosition(
832fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const DictionaryStructureWithBufferPolicy *const structurePolicy, const int position) {
842fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    mBigramMaps[position].init(structurePolicy, position);
852fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}
862fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
872fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasaint MultiBigramMap::readBigramProbabilityFromBinaryDictionary(
882fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos,
892fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        const int nextWordPosition, const int unigramProbability) {
902fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    int bigramProbability = NOT_A_PROBABILITY;
912fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos);
922fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(),
932fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            bigramsListPos);
942fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    while (bigramsIt.hasNext()) {
952fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        bigramsIt.next();
962fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        if (bigramsIt.getBigramPos() == nextWordPosition) {
972fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            bigramProbability = bigramsIt.getProbability();
982fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            break;
992fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        }
1002fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
1012fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    return structurePolicy->getProbability(unigramProbability, bigramProbability);
1022fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa}
1032fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
1041ff81e889045d35ff8420b266398e73239bd15c9Keisuke Kuroynagi} // namespace latinime
105