130088259480130e5bac5c2028e2c7c3e6d4c51a2satok/* 25460ea389d83722ac98abaef8a2bb9900fb571e7Ken Wakasa * Copyright (C) 2010, The Android Open Source Project 30bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * 40bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * Licensed under the Apache License, Version 2.0 (the "License"); 50bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * you may not use this file except in compliance with the License. 60bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * You may obtain a copy of the License at 70bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * 80bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * http://www.apache.org/licenses/LICENSE-2.0 90bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * 100bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * Unless required by applicable law or agreed to in writing, software 110bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * distributed under the License is distributed on an "AS IS" BASIS, 120bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 130bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * See the License for the specific language governing permissions and 140bbb917d12358e0264796e75dea888f244761b64Ken Wakasa * limitations under the License. 150bbb917d12358e0264796e75dea888f244761b64Ken Wakasa */ 1630088259480130e5bac5c2028e2c7c3e6d4c51a2satok 17f1008c550168e50f930ea1e043000b395ce0f129Ken Wakasa#include <cstring> 1818c28f431eadc1b451ca25d14fd683db4b234838satok 19e808e436cbd6f1aeadb5d61f354d03c3c50872a7satok#define LOG_TAG "LatinIME: bigram_dictionary.cpp" 20e808e436cbd6f1aeadb5d61f354d03c3c50872a7satok 2130088259480130e5bac5c2028e2c7c3e6d4c51a2satok#include "bigram_dictionary.h" 22a65c267b1f1207e54c6f821148c600e3899b7f9cKen Wakasa 233b088a2f365a9ce06f58243c83cb961ea2920b7eKen Wakasa#include "defines.h" 24a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" 25a65c267b1f1207e54c6f821148c600e3899b7f9cKen Wakasa#include "suggest/core/dictionary/dictionary.h" 26d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" 27addea83bad5751308fef508d79c6989b8872f050Ken Wakasa#include "utils/char_utils.h" 2830088259480130e5bac5c2028e2c7c3e6d4c51a2satok 2930088259480130e5bac5c2028e2c7c3e6d4c51a2satoknamespace latinime { 3030088259480130e5bac5c2028e2c7c3e6d4c51a2satok 31d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke KuroyanagiBigramDictionary::BigramDictionary( 32d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy) 33d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi : mDictionaryStructurePolicy(dictionaryStructurePolicy) { 34de3070a71b39742c3ac7b613f45af88cc95c1205Ken Wakasa if (DEBUG_DICT) { 359fb6f47a6a11f62d134d4d6259181ac987fc1ad3satok AKLOGI("BigramDictionary - constructor"); 36de3070a71b39742c3ac7b613f45af88cc95c1205Ken Wakasa } 3718c28f431eadc1b451ca25d14fd683db4b234838satok} 3818c28f431eadc1b451ca25d14fd683db4b234838satok 3918c28f431eadc1b451ca25d14fd683db4b234838satokBigramDictionary::~BigramDictionary() { 4018c28f431eadc1b451ca25d14fd683db4b234838satok} 4118c28f431eadc1b451ca25d14fd683db4b234838satok 42e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataokavoid BigramDictionary::addWordBigram(int *word, int length, int probability, int *bigramProbability, 431e61493c50082264caaef862df02b1ccc84dc396Ken Wakasa int *bigramCodePoints, int *outputTypes) const { 4418c28f431eadc1b451ca25d14fd683db4b234838satok word[length] = 0; 45e320789a62e2e1161673657241b664e9cbf31f7fSatoshi Kataoka if (DEBUG_DICT_FULL) { 46ce9efbff53ba04bd719c3c15d8a5a501ff12714fDoug Kwan#ifdef FLAG_DBG 4718c28f431eadc1b451ca25d14fd683db4b234838satok char s[length + 1]; 481e61493c50082264caaef862df02b1ccc84dc396Ken Wakasa for (int i = 0; i <= length; i++) s[i] = static_cast<char>(word[i]); 49e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka AKLOGI("Bigram: Found word = %s, freq = %d :", s, probability); 50787945bf1ef2e5449b5df16dfe15beeb0fd7cb71satok#endif 5118c28f431eadc1b451ca25d14fd683db4b234838satok } 5218c28f431eadc1b451ca25d14fd683db4b234838satok 5318c28f431eadc1b451ca25d14fd683db4b234838satok // Find the right insertion point 5418c28f431eadc1b451ca25d14fd683db4b234838satok int insertAt = 0; 55f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa while (insertAt < MAX_RESULTS) { 56e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka if (probability > bigramProbability[insertAt] || (bigramProbability[insertAt] == probability 57464d3ba43257da34ab165da8ba0af11e928aae5cKen Wakasa && length < CharUtils::getCodePointCount(MAX_WORD_LENGTH, 581e61493c50082264caaef862df02b1ccc84dc396Ken Wakasa bigramCodePoints + insertAt * MAX_WORD_LENGTH))) { 5918c28f431eadc1b451ca25d14fd683db4b234838satok break; 6018c28f431eadc1b451ca25d14fd683db4b234838satok } 6118c28f431eadc1b451ca25d14fd683db4b234838satok insertAt++; 6218c28f431eadc1b451ca25d14fd683db4b234838satok } 63e320789a62e2e1161673657241b664e9cbf31f7fSatoshi Kataoka if (DEBUG_DICT_FULL) { 64f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa AKLOGI("Bigram: InsertAt -> %d MAX_RESULTS: %d", insertAt, MAX_RESULTS); 65de3070a71b39742c3ac7b613f45af88cc95c1205Ken Wakasa } 66f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa if (insertAt >= MAX_RESULTS) { 67f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa return; 68f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa } 69e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka memmove(bigramProbability + (insertAt + 1), 70e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka bigramProbability + insertAt, 71e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka (MAX_RESULTS - insertAt - 1) * sizeof(bigramProbability[0])); 72e0e67373735918c78eaeaf24f127e1d28816aa29Satoshi Kataoka bigramProbability[insertAt] = probability; 73f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa outputTypes[insertAt] = Dictionary::KIND_PREDICTION; 74f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa memmove(bigramCodePoints + (insertAt + 1) * MAX_WORD_LENGTH, 75f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa bigramCodePoints + insertAt * MAX_WORD_LENGTH, 76f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa (MAX_RESULTS - insertAt - 1) * sizeof(bigramCodePoints[0]) * MAX_WORD_LENGTH); 77f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa int *dest = bigramCodePoints + insertAt * MAX_WORD_LENGTH; 78f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa while (length--) { 79f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa *dest++ = *word++; 80f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa } 81f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa *dest = 0; // NULL terminate 82e320789a62e2e1161673657241b664e9cbf31f7fSatoshi Kataoka if (DEBUG_DICT_FULL) { 83f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa AKLOGI("Bigram: Added word at %d", insertAt); 8418c28f431eadc1b451ca25d14fd683db4b234838satok } 8518c28f431eadc1b451ca25d14fd683db4b234838satok} 8618c28f431eadc1b451ca25d14fd683db4b234838satok 87588e2f296451a8eb074af9140d018b828105237fJean Chalard/* Parameters : 88588e2f296451a8eb074af9140d018b828105237fJean Chalard * prevWord: the word before, the one for which we need to look up bigrams. 89588e2f296451a8eb074af9140d018b828105237fJean Chalard * prevWordLength: its length. 902a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi * outBigramCodePoints: an array for output, at the same format as outwords for getSuggestions. 912a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi * outBigramProbability: an array to output frequencies. 926931df9c17aaeb04288f937cabf956c1b9eb0cc9Jean Chalard * outputTypes: an array to output types. 93588e2f296451a8eb074af9140d018b828105237fJean Chalard * This method returns the number of bigrams this word has, for backward compatibility. 94588e2f296451a8eb074af9140d018b828105237fJean Chalard */ 952a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagiint BigramDictionary::getPredictions(const int *prevWord, const int prevWordLength, 962a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi int *const outBigramCodePoints, int *const outBigramProbability, 972a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi int *const outputTypes) const { 98588e2f296451a8eb074af9140d018b828105237fJean Chalard // TODO: remove unused arguments, and refrain from storing stuff in members of this class 99588e2f296451a8eb074af9140d018b828105237fJean Chalard // TODO: have "in" arguments before "out" ones, and make out args explicit in the name 10018c28f431eadc1b451ca25d14fd683db4b234838satok 101e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard int pos = getBigramListPositionForWord(prevWord, prevWordLength, 102e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard false /* forceLowerCaseSearch */); 103351864b38a2a19a3b591efe3ed58a5998bb4c79dJean Chalard // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams 104c38ec475962ad6b7f14abe35b950545ebcdbe3c5Keisuke Kuroynagi if (NOT_A_DICT_POS == pos) { 105e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard // If no bigrams for this exact word, search again in lower case. 106e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard pos = getBigramListPositionForWord(prevWord, prevWordLength, 107e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard true /* forceLowerCaseSearch */); 108e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard } 109e9a86e2cdb58dd8d5601138294521e966d164520Jean Chalard // If still no bigrams, we really don't have them! 110c38ec475962ad6b7f14abe35b950545ebcdbe3c5Keisuke Kuroynagi if (NOT_A_DICT_POS == pos) return 0; 111a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi 112588e2f296451a8eb074af9140d018b828105237fJean Chalard int bigramCount = 0; 113a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi int unigramProbability = 0; 114a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi int bigramBuffer[MAX_WORD_LENGTH]; 115d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi BinaryDictionaryBigramsIterator bigramsIt( 116d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); 1175b7688bbb5ed01b534570e86a91ae1c724e23100Keisuke Kuroynagi while (bigramsIt.hasNext()) { 118a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi bigramsIt.next(); 119cb816e5e16f086d98c8d05a0a5805c1cdfaf1c02Keisuke Kuroyanagi if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) { 120202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi continue; 121202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi } 122202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi const int codePointCount = mDictionaryStructurePolicy-> 123e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(), 124e1ebef6124241ef51d5ed17884e6299a330d496bKeisuke Kuroyanagi MAX_WORD_LENGTH, bigramBuffer, &unigramProbability); 125202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi if (codePointCount <= 0) { 126202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi continue; 127202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi } 1282a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi // Due to space constraints, the probability for bigrams is approximate - the lower the 1292a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi // unigram probability, the worse the precision. The theoritical maximum error in 1302a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 1312a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi // in very bad cases. This means that sometimes, we'll see some bigrams interverted 1322a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi // here, but it can't get too bad. 13365d19946bebd1cc6299e2789cc0fc097d1898e80Keisuke Kuroyanagi const int probability = mDictionaryStructurePolicy->getProbability( 1342a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi unigramProbability, bigramsIt.getProbability()); 135202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability, 136202e416b51ef4cf3553afeb305ca4b14dd6105e5Keisuke Kuroyanagi outBigramCodePoints, outputTypes); 1372a2aac2568e3f2da3efc8aeaa392696471d63417Keisuke Kuroynagi ++bigramCount; 138a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi } 139f6870cc82ddf394e94155322fcc7e4e2256bea66Ken Wakasa return min(bigramCount, MAX_RESULTS); 14030088259480130e5bac5c2028e2c7c3e6d4c51a2satok} 14118c28f431eadc1b451ca25d14fd683db4b234838satok 142ee396df162b31cff9763dd10a7da2b47aef10c01Jean Chalard// Returns a pointer to the start of the bigram list. 1435b7688bbb5ed01b534570e86a91ae1c724e23100Keisuke Kuroynagi// If the word is not found or has no bigrams, this function returns NOT_A_DICT_POS. 144aa5a3e84ad330f55edda3087a9498c5ee16b9cbaKen Wakasaint BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, 145aa5a3e84ad330f55edda3087a9498c5ee16b9cbaKen Wakasa const bool forceLowerCaseSearch) const { 1465b7688bbb5ed01b534570e86a91ae1c724e23100Keisuke Kuroynagi if (0 >= prevWordLength) return NOT_A_DICT_POS; 147d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi int pos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(prevWord, prevWordLength, 148668870be431d17ee4ceb5ce161aee1189063af18Keisuke Kuroyanagi forceLowerCaseSearch); 149cb816e5e16f086d98c8d05a0a5805c1cdfaf1c02Keisuke Kuroyanagi if (NOT_A_DICT_POS == pos) return NOT_A_DICT_POS; 15077ef75cbe6722d1eb45115c1ad82f963444d71cdKeisuke Kuroyanagi return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(pos); 1519c2a96aa6cb6d8c1f7a559dbd7051302cfc6150bJean Chalard} 1529c2a96aa6cb6d8c1f7a559dbd7051302cfc6150bJean Chalard 1534d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagiint BigramDictionary::getBigramProbability(const int *word0, int length0, const int *word1, 1545bf1be71629607e7206e6203489cf742d2f8ed79Keisuke Kuroynagi int length1) const { 1555bf1be71629607e7206e6203489cf742d2f8ed79Keisuke Kuroynagi int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */); 1564d289d39aeae21064f63d958974816ceee3e9fdeTom Ouyang // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams 1574d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi if (NOT_A_DICT_POS == pos) return NOT_A_PROBABILITY; 158d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi int nextWordPos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(word1, length1, 159668870be431d17ee4ceb5ce161aee1189063af18Keisuke Kuroyanagi false /* forceLowerCaseSearch */); 1604d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY; 161a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi 162d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi BinaryDictionaryBigramsIterator bigramsIt( 163d81654cd61bd10f7cb56bfa4c89b34e9cfb18598Keisuke Kuroyanagi mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); 1645b7688bbb5ed01b534570e86a91ae1c724e23100Keisuke Kuroynagi while (bigramsIt.hasNext()) { 165a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi bigramsIt.next(); 166a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi if (bigramsIt.getBigramPos() == nextWordPos) { 1674d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi return mDictionaryStructurePolicy->getProbability( 1684d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos), 1694d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi bigramsIt.getProbability()); 1704d289d39aeae21064f63d958974816ceee3e9fdeTom Ouyang } 171a71ed8caa27c4a0174f25750171282980bc26880Keisuke Kuroynagi } 1724d02a2d44db94985c9f079cdd58c7c51d3e557eeKeisuke Kuroyanagi return NOT_A_PROBABILITY; 1734d289d39aeae21064f63d958974816ceee3e9fdeTom Ouyang} 1744d289d39aeae21064f63d958974816ceee3e9fdeTom Ouyang 17530088259480130e5bac5c2028e2c7c3e6d4c51a2satok// TODO: Move functions related to bigram to here 17630088259480130e5bac5c2028e2c7c3e6d4c51a2satok} // namespace latinime 177