17898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project/* 27898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * Copyright (C) 2009 The Android Open Source Project 37898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * 47898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * Licensed under the Apache License, Version 2.0 (the "License"); 57898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * you may not use this file except in compliance with the License. 67898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * You may obtain a copy of the License at 77898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * 87898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * http://www.apache.org/licenses/LICENSE-2.0 97898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * 107898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * Unless required by applicable law or agreed to in writing, software 117898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * distributed under the License is distributed on an "AS IS" BASIS, 127898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 137898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * See the License for the specific language governing permissions and 147898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * limitations under the License. 157898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project */ 167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <assert.h> 187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <math.h> 197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <stdio.h> 207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <string.h> 217898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <time.h> 227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include "../include/mystdlib.h" 237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include "../include/ngram.h" 247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 257898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectnamespace ime_pinyin { 267898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 277898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#define ADD_COUNT 0.3 287898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 297898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectint comp_double(const void *p1, const void *p2) { 307898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2)) 317898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return -1; 327898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2)) 337898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return 1; 347898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return 0; 357898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 367898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 377898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectinline double distance(double freq, double code) { 387898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project // return fabs(freq - code); 397898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return freq * fabs(log(freq) - log(code)); 407898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 417898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 427898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// Find the index of the code value which is nearest to the given freq 437898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectint qsearch_nearest(double code_book[], double freq, int start, int end) { 447898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (start == end) 457898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return start; 467898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 477898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (start + 1 == end) { 487898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (distance(freq, code_book[end]) > distance(freq, code_book[start])) 497898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return start; 507898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return end; 517898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 527898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 537898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project int mid = (start + end) / 2; 547898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 557898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (code_book[mid] > freq) 567898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return qsearch_nearest(code_book, freq, start, mid); 577898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project else 587898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return qsearch_nearest(code_book, freq, mid, end); 597898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 607898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 617898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectsize_t update_code_idx(double freqs[], size_t num, double code_book[], 627898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project CODEBOOK_TYPE *code_idx) { 637898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project size_t changed = 0; 647898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project for (size_t pos = 0; pos < num; pos++) { 657898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project CODEBOOK_TYPE idx; 667898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1); 677898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (idx != code_idx[pos]) 687898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project changed++; 697898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project code_idx[pos] = idx; 707898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 717898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return changed; 727898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 737898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 747898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectdouble recalculate_kernel(double freqs[], size_t num, double code_book[], 757898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project CODEBOOK_TYPE *code_idx) { 767898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project double ret = 0; 777898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 787898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project size_t *item_num = new size_t[kCodeBookSize]; 797898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project assert(item_num); 807898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project memset(item_num, 0, sizeof(size_t) * kCodeBookSize); 817898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 827898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project double *cb_new = new double[kCodeBookSize]; 837898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project assert(cb_new); 847898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project memset(cb_new, 0, sizeof(double) * kCodeBookSize); 857898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 867898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project for (size_t pos = 0; pos < num; pos++) { 877898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project ret += distance(freqs[pos], code_book[code_idx[pos]]); 887898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 897898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project cb_new[code_idx[pos]] += freqs[pos]; 907898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project item_num[code_idx[pos]] += 1; 917898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 927898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 937898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project for (size_t code = 0; code < kCodeBookSize; code++) { 947898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project assert(item_num[code] > 0); 957898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project code_book[code] = cb_new[code] / item_num[code]; 967898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 977898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 987898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project delete [] item_num; 997898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project delete [] cb_new; 1007898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1017898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return ret; 1027898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 1037898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1047898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectvoid iterate_codes(double freqs[], size_t num, double code_book[], 1057898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project CODEBOOK_TYPE *code_idx) { 1067898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project size_t iter_num = 0; 1077898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project double delta_last = 0; 1087898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project do { 1097898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project size_t changed = update_code_idx(freqs, num, code_book, code_idx); 1107898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1117898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project double delta = recalculate_kernel(freqs, num, code_book, code_idx); 1127898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1137898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (kPrintDebug0) { 1147898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project printf("---Unigram codebook iteration: %d : %d, %.9f\n", 1157898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project iter_num, changed, delta); 1167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 1177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project iter_num++; 1187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (iter_num > 1 && 1207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001)) 1217898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project break; 1227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project delta_last = delta; 1237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } while (true); 1247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 1257898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1267898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1277898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source ProjectNGram* NGram::instance_ = NULL; 1287898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1297898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source ProjectNGram::NGram() { 1307898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project initialized_ = false; 1317898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project idx_num_ = 0; 1327898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project lma_freq_idx_ = NULL; 1337898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project sys_score_compensation_ = 0; 1347898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1357898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#ifdef ___BUILD_MODEL___ 1367898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freq_codes_df_ = NULL; 1377898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#endif 1387898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freq_codes_ = NULL; 1397898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 1407898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1417898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source ProjectNGram::~NGram() { 1427898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL != lma_freq_idx_) 1437898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project free(lma_freq_idx_); 1447898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1457898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#ifdef ___BUILD_MODEL___ 1467898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL != freq_codes_df_) 1477898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project free(freq_codes_df_); 1487898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#endif 1497898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1507898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL != freq_codes_) 1517898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project free(freq_codes_); 1527898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 1537898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1547898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source ProjectNGram& NGram::get_instance() { 1557898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL == instance_) 1567898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project instance_ = new NGram(); 1577898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return *instance_; 1587898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 1597898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1607898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectbool NGram::save_ngram(FILE *fp) { 1617898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (!initialized_ || NULL == fp) 1627898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 1637898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1647898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (0 == idx_num_ || NULL == freq_codes_ || NULL == lma_freq_idx_) 1657898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 1667898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1677898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 1) 1687898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 1697898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1707898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != 1717898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project kCodeBookSize) 1727898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 1737898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1747898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) 1757898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 1767898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1777898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return true; 1787898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 1797898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1807898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectbool NGram::load_ngram(FILE *fp) { 1817898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL == fp) 1827898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 1837898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1847898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project initialized_ = false; 1857898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1867898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 ) 1877898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 1887898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1897898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL != lma_freq_idx_) 1907898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project free(lma_freq_idx_); 1917898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1927898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL != freq_codes_) 1937898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project free(freq_codes_); 1947898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 1957898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project lma_freq_idx_ = static_cast<CODEBOOK_TYPE*> 1967898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project (malloc(idx_num_ * sizeof(CODEBOOK_TYPE))); 1977898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freq_codes_ = static_cast<LmaScoreType*> 1987898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project (malloc(kCodeBookSize * sizeof(LmaScoreType))); 1997898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2007898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL == lma_freq_idx_ || NULL == freq_codes_) 2017898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 2027898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2037898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != 2047898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project kCodeBookSize) 2057898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 2067898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2077898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) 2087898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 2097898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2107898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project initialized_ = true; 2117898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2127898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project total_freq_none_sys_ = 0; 2137898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return true; 2147898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 2157898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectvoid NGram::set_total_freq_none_sys(size_t freq_none_sys) { 2177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project total_freq_none_sys_ = freq_none_sys; 2187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (0 == total_freq_none_sys_) { 2197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project sys_score_compensation_ = 0; 2207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } else { 2217ebaf2381c91649d700097491af171c7e486bf6fGenqing Wu double factor = static_cast<double>(kSysDictTotalFreq) / ( 2227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project kSysDictTotalFreq + total_freq_none_sys_); 2237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project sys_score_compensation_ = static_cast<float>( 2247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project log(factor) * kLogValueAmplifier); 2257898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 2267898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 2277898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2287898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// The caller makes sure this oject is initialized. 2297898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectfloat NGram::get_uni_psb(LemmaIdType lma_id) { 2307898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) + 2317898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project sys_score_compensation_; 2327898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 2337898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2347898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectfloat NGram::convert_psb_to_score(double psb) { 2357898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project float score = static_cast<float>( 2367898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project log(psb) * static_cast<double>(kLogValueAmplifier)); 2377898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (score > static_cast<float>(kMaxScore)) { 2387898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project score = static_cast<float>(kMaxScore); 2397898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 2407898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return score; 2417898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 2427898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2437898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#ifdef ___BUILD_MODEL___ 2447898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectbool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num, 2457898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project LemmaIdType next_idx_unused) { 2467898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1) 2477898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 2487898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2497898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project double total_freq = 0; 2507898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project double *freqs = new double[next_idx_unused]; 2517898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL == freqs) 2527898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return false; 2537898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2547898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freqs[0] = ADD_COUNT; 2557898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project total_freq += freqs[0]; 2567898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project LemmaIdType idx_now = 0; 2577898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project for (size_t pos = 0; pos < lemma_num; pos++) { 2587898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (lemma_arr[pos].idx_by_hz == idx_now) 2597898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project continue; 2607898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project idx_now++; 2617898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2627898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project assert(lemma_arr[pos].idx_by_hz == idx_now); 2637898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2647898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freqs[idx_now] = lemma_arr[pos].freq; 2657898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (freqs[idx_now] <= 0) 2667898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freqs[idx_now] = 0.3; 2677898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2687898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project total_freq += freqs[idx_now]; 2697898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 2707898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2717898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project double max_freq = 0; 2727898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project idx_num_ = idx_now + 1; 2737898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project assert(idx_now + 1 == next_idx_unused); 2747898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2757898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project for (size_t pos = 0; pos < idx_num_; pos++) { 2767898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freqs[pos] = freqs[pos] / total_freq; 2777898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project assert(freqs[pos] > 0); 2787898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (freqs[pos] > max_freq) 2797898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project max_freq = freqs[pos]; 2807898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 2817898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2827898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project // calculate the code book 2837898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL == freq_codes_df_) 2847898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freq_codes_df_ = new double[kCodeBookSize]; 2857898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project assert(freq_codes_df_); 2867898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize); 2877898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2887898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL == freq_codes_) 2897898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freq_codes_ = new LmaScoreType[kCodeBookSize]; 2907898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project assert(freq_codes_); 2917898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize); 2927898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2937898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project size_t freq_pos = 0; 2947898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { 2957898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project bool found = true; 2967898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 2977898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project while (found) { 2987898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project found = false; 2997898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project double cand = freqs[freq_pos]; 3007898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project for (size_t i = 0; i < code_pos; i++) 3017898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (freq_codes_df_[i] == cand) { 3027898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project found = true; 3037898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project break; 3047898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 3057898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (found) 3067898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freq_pos++; 3077898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 3087898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 3097898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freq_codes_df_[code_pos] = freqs[freq_pos]; 3107898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freq_pos++; 3117898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 3127898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 3137898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double); 3147898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 3157898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (NULL == lma_freq_idx_) 3167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_]; 3177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project assert(lma_freq_idx_); 3187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 3197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_); 3207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 3217898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project delete [] freqs; 3227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 3237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (kPrintDebug0) { 3247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project printf("\n------Language Model Unigram Codebook------\n"); 3257898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 3267898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 3277898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { 3287898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project double log_score = log(freq_codes_df_[code_pos]); 3297898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project float final_score = convert_psb_to_score(freq_codes_df_[code_pos]); 3307898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project if (kPrintDebug0) { 3317898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n", 3327898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project code_pos, freq_codes_df_[code_pos], log_score, final_score); 3337898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 3347898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score); 3357898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project } 3367898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 3377898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project initialized_ = true; 3387898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project return true; 3397898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} 3407898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#endif 3417898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project 3427898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} // namespace ime_pinyin 343