130088259480130e5bac5c2028e2c7c3e6d4c51a2satok/*
25460ea389d83722ac98abaef8a2bb9900fb571e7Ken Wakasa * Copyright (C) 2010, The Android Open Source Project
30bbb917d12358e0264796e75dea888f244761b64Ken Wakasa *
40bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * Licensed under the Apache License, Version 2.0 (the "License");
50bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * you may not use this file except in compliance with the License.
60bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * You may obtain a copy of the License at
70bbb917d12358e0264796e75dea888f244761b64Ken Wakasa *
80bbb917d12358e0264796e75dea888f244761b64Ken Wakasa *     http://www.apache.org/licenses/LICENSE-2.0
90bbb917d12358e0264796e75dea888f244761b64Ken Wakasa *
100bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * Unless required by applicable law or agreed to in writing, software
110bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * distributed under the License is distributed on an "AS IS" BASIS,
120bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
130bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * See the License for the specific language governing permissions and
140bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * limitations under the License.
150bbb917d12358e0264796e75dea888f244761b64Ken Wakasa */
1630088259480130e5bac5c2028e2c7c3e6d4c51a2satok
17f1008c550168e50f930ea1e043000b395ce0f129Ken Wakasa#include <cstring>
1818c28f431eadc1b451ca25d14fd683db4b234838satok
19e808e436cbd6f1aeadb5d61f354d03c3c50872a7satok#define LOG_TAG "LatinIME: bigram_dictionary.cpp"
20e808e436cbd6f1aeadb5d61f354d03c3c50872a7satok
2130088259480130e5bac5c2028e2c7c3e6d4c51a2satok#include "bigram_dictionary.h"
22a65c267b1f1207e54c6f821148c600e3899b7f9cKen Wakasa
233b088a2f365a9ce06f58243c83cb961ea2920b7eKen Wakasa#include "defines.h"
24a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
25a65c267b1f1207e54c6f821148c600e3899b7f9cKen Wakasa#include "suggest/core/dictionary/dictionary.h"
26d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
27addea83bad5751308fef508d79c6989b8872f050Ken Wakasa#include "utils/char_utils.h"
2830088259480130e5bac5c2028e2c7c3e6d4c51a2satok
2930088259480130e5bac5c2028e2c7c3e6d4c51a2satoknamespace latinime {
3030088259480130e5bac5c2028e2c7c3e6d4c51a2satok
31d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke KuroyanagiBigramDictionary::BigramDictionary(
32d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi        const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy)
33d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi        : mDictionaryStructurePolicy(dictionaryStructurePolicy) {
34de3070a71b39742c3ac7b613f45af88cc95c1205Ken Wakasa    if (DEBUG_DICT) {
359fb6f47a6a11f62d134d4d6259181ac987fc1ad3satok        AKLOGI("BigramDictionary - constructor");
36de3070a71b39742c3ac7b613f45af88cc95c1205Ken Wakasa    }
3718c28f431eadc1b451ca25d14fd683db4b234838satok}
3818c28f431eadc1b451ca25d14fd683db4b234838satok
3918c28f431eadc1b451ca25d14fd683db4b234838satokBigramDictionary::~BigramDictionary() {
4018c28f431eadc1b451ca25d14fd683db4b234838satok}
4118c28f431eadc1b451ca25d14fd683db4b234838satok
42e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataokavoid BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability,
431e61493c50082264caaef862df02b1ccc84dc396Ken Wakasa        int *bigramCodePoints, int *outputTypes) const {
4418c28f431eadc1b451ca25d14fd683db4b234838satok    word[length] = 0;
45e320789a62e2e1161673657241b664e9cbf31f7fSatoshi Kataoka    if (DEBUG_DICT_FULL) {
46ce9efbff53ba04bd719c3c15d8a5a501ff12714fDoug Kwan#ifdef FLAG_DBG
4718c28f431eadc1b451ca25d14fd683db4b234838satok        char s[length + 1];
481e61493c50082264caaef862df02b1ccc84dc396Ken Wakasa        for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]);
49e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka        AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability);
50787945bf1ef2e5449b5df16dfe15beeb0fd7cb71satok#endif
5118c28f431eadc1b451ca25d14fd683db4b234838satok    }
5218c28f431eadc1b451ca25d14fd683db4b234838satok
5318c28f431eadc1b451ca25d14fd683db4b234838satok    // Find the right insertion point
5418c28f431eadc1b451ca25d14fd683db4b234838satok    int insertAt = 0;
55f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa    while (insertAt < MAX_RESULTS) {
56e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka        if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability
57464d3ba43257da34ab165da8ba0af11e928aae5cKen Wakasa                && length < CharUtils::getCodePointCount(MAX_WORD_LENGTH,
581e61493c50082264caaef862df02b1ccc84dc396Ken Wakasa                        bigramCodePoints + insertAt * MAX_WORD_LENGTH))) {
5918c28f431eadc1b451ca25d14fd683db4b234838satok            break;
6018c28f431eadc1b451ca25d14fd683db4b234838satok        }
6118c28f431eadc1b451ca25d14fd683db4b234838satok        insertAt++;
6218c28f431eadc1b451ca25d14fd683db4b234838satok    }
63e320789a62e2e1161673657241b664e9cbf31f7fSatoshi Kataoka    if (DEBUG_DICT_FULL) {
64f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa        AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS);
65de3070a71b39742c3ac7b613f45af88cc95c1205Ken Wakasa    }
66f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa    if (insertAt >= MAX_RESULTS) {
67f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa        return;
68f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa    }
69e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka    memmove(bigramProbability + (insertAt + 1),
70e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka            bigramProbability + insertAt,
71e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka            (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0]));
72e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka    bigramProbability[insertAt] = probability;
73f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa    outputTypes[insertAt] = Dictionary::KIND_PREDICTION;
74f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa    memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH,
75f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa            bigramCodePoints + insertAt * MAX_WORD_LENGTH,
76f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa            (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH);
77f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa    int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH;
78f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa    while (length--) {
79f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa        *dest++ = *word++;
80f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa    }
81f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa    *dest = 0; // NULL terminate
82e320789a62e2e1161673657241b664e9cbf31f7fSatoshi Kataoka    if (DEBUG_DICT_FULL) {
83f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa        AKLOGI("Bigram: Added word at %d", insertAt);
8418c28f431eadc1b451ca25d14fd683db4b234838satok    }
8518c28f431eadc1b451ca25d14fd683db4b234838satok}
8618c28f431eadc1b451ca25d14fd683db4b234838satok
87588e2f296451a8eb074af9140d018b828105237fJean Chalard/* Parameters :
88588e2f296451a8eb074af9140d018b828105237fJean Chalard * prevWord: the word before, the one for which we need to look up bigrams.
89588e2f296451a8eb074af9140d018b828105237fJean Chalard * prevWordLength: its length.
902a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi * outBigramCodePoints: an array for output, at the same format as outwords for getSuggestions.
912a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi * outBigramProbability: an array to output frequencies.
926931df9c17aaeb04288f937cabf956c1b9eb0cc9Jean Chalard * outputTypes: an array to output types.
93588e2f296451a8eb074af9140d018b828105237fJean Chalard * This method returns the number of bigrams this word has, for backward compatibility.
94588e2f296451a8eb074af9140d018b828105237fJean Chalard */
952a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagiint BigramDictionary::getPredictions(const int *prevWord, const int prevWordLength,
962a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi        int *const outBigramCodePoints, int *const outBigramProbability,
972a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi        int *const outputTypes) const {
98588e2f296451a8eb074af9140d018b828105237fJean Chalard    // TODO: remove unused arguments, and refrain from storing stuff in members of this class
99588e2f296451a8eb074af9140d018b828105237fJean Chalard    // TODO: have "in" arguments before "out" ones, and make out args explicit in the name
10018c28f431eadc1b451ca25d14fd683db4b234838satok
101e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard    int pos = getBigramListPositionForWord(prevWord, prevWordLength,
102e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard            false /* forceLowerCaseSearch */);
103351864b38a2a19a3b591efe3ed58a5998bb4c79dJean Chalard    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
104c38ec475962ad6b7f14abe35b950545ebcdbe3c5Keisuke Kuroynagi    if (NOT_A_DICT_POS == pos) {
105e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard        // If no bigrams for this exact word, search again in lower case.
106e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard        pos = getBigramListPositionForWord(prevWord, prevWordLength,
107e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard                true /* forceLowerCaseSearch */);
108e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard    }
109e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard    // If still no bigrams, we really don't have them!
110c38ec475962ad6b7f14abe35b950545ebcdbe3c5Keisuke Kuroynagi    if (NOT_A_DICT_POS == pos) return 0;
111a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi
112588e2f296451a8eb074af9140d018b828105237fJean Chalard    int bigramCount = 0;
113a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi    int unigramProbability = 0;
114a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi    int bigramBuffer[MAX_WORD_LENGTH];
115d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi    BinaryDictionaryBigramsIterator bigramsIt(
116d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi            mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
1175b7688bbb5ed01b534570e86a91ae1c724e23100Keisuke Kuroynagi    while (bigramsIt.hasNext()) {
118a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi        bigramsIt.next();
119cb816e5e16f086d98c8d05a0a5805c1cdfaf1c02Keisuke Kuroyanagi        if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) {
120202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi            continue;
121202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi        }
122202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi        const int codePointCount = mDictionaryStructurePolicy->
123e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi                getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(),
124e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi                        MAX_WORD_LENGTH, bigramBuffer, &unigramProbability);
125202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi        if (codePointCount <= 0) {
126202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi            continue;
127202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi        }
1282a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi        // Due to space constraints, the probability for bigrams is approximate - the lower the
1292a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi        // unigram probability, the worse the precision. The theoritical maximum error in
1302a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi        // resulting probability is 8 - although in the practice it's never bigger than 3 or 4
1312a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi        // in very bad cases. This means that sometimes, we'll see some bigrams interverted
1322a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi        // here, but it can't get too bad.
13365d19946bebd1cc6299e2789cc0fc097d1898e80Keisuke Kuroyanagi        const int probability = mDictionaryStructurePolicy->getProbability(
1342a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi                unigramProbability, bigramsIt.getProbability());
135202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi        addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability,
136202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi                outBigramCodePoints, outputTypes);
1372a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi        ++bigramCount;
138a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi    }
139f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa    return min(bigramCount, MAX_RESULTS);
14030088259480130e5bac5c2028e2c7c3e6d4c51a2satok}
14118c28f431eadc1b451ca25d14fd683db4b234838satok
142ee396df162b31cff9763dd10a7da2b47aef10c01Jean Chalard// Returns a pointer to the start of the bigram list.
1435b7688bbb5ed01b534570e86a91ae1c724e23100Keisuke Kuroynagi// If the word is not found or has no bigrams, this function returns NOT_A_DICT_POS.
144aa5a3e84ad330f55edda3087a9498c5ee16b9cbaKen Wakasaint BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength,
145aa5a3e84ad330f55edda3087a9498c5ee16b9cbaKen Wakasa        const bool forceLowerCaseSearch) const {
1465b7688bbb5ed01b534570e86a91ae1c724e23100Keisuke Kuroynagi    if (0 >= prevWordLength) return NOT_A_DICT_POS;
147d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi    int pos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(prevWord, prevWordLength,
148668870be431d17ee4ceb5ce161aee1189063af18Keisuke Kuroyanagi            forceLowerCaseSearch);
149cb816e5e16f086d98c8d05a0a5805c1cdfaf1c02Keisuke Kuroyanagi    if (NOT_A_DICT_POS == pos) return NOT_A_DICT_POS;
15077ef75cbe6722d1eb45115c1ad82f963444d71cdKeisuke Kuroyanagi    return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(pos);
1519c2a96aa6cb6d8c1f7a559dbd7051302cfc6150bJean Chalard}
1529c2a96aa6cb6d8c1f7a559dbd7051302cfc6150bJean Chalard
1534d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagiint BigramDictionary::getBigramProbability(const int *word0, int length0, const int *word1,
1545bf1be71629607e7206e6203489cf742d2f8ed79Keisuke Kuroynagi        int length1) const {
1555bf1be71629607e7206e6203489cf742d2f8ed79Keisuke Kuroynagi    int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */);
1564d289d39aeae21064f63d958974816ceee3e9fdeTom Ouyang    // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams
1574d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi    if (NOT_A_DICT_POS == pos) return NOT_A_PROBABILITY;
158d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi    int nextWordPos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(word1, length1,
159668870be431d17ee4ceb5ce161aee1189063af18Keisuke Kuroyanagi            false /* forceLowerCaseSearch */);
1604d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi    if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY;
161a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi
162d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi    BinaryDictionaryBigramsIterator bigramsIt(
163d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi            mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos);
1645b7688bbb5ed01b534570e86a91ae1c724e23100Keisuke Kuroynagi    while (bigramsIt.hasNext()) {
165a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi        bigramsIt.next();
166a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi        if (bigramsIt.getBigramPos() == nextWordPos) {
1674d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi            return mDictionaryStructurePolicy->getProbability(
1684d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi                    mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos),
1694d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi                    bigramsIt.getProbability());
1704d289d39aeae21064f63d958974816ceee3e9fdeTom Ouyang        }
171a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi    }
1724d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi    return NOT_A_PROBABILITY;
1734d289d39aeae21064f63d958974816ceee3e9fdeTom Ouyang}
1744d289d39aeae21064f63d958974816ceee3e9fdeTom Ouyang
17530088259480130e5bac5c2028e2c7c3e6d4c51a2satok// TODO: Move functions related to bigram to here
17630088259480130e5bac5c2028e2c7c3e6d4c51a2satok} // namespace latinime
177