17898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project/*
27898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * Copyright (C) 2009 The Android Open Source Project
37898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project *
47898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * Licensed under the Apache License, Version 2.0 (the "License");
57898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * you may not use this file except in compliance with the License.
67898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * You may obtain a copy of the License at
77898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project *
87898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project *      http://www.apache.org/licenses/LICENSE-2.0
97898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project *
107898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * Unless required by applicable law or agreed to in writing, software
117898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * distributed under the License is distributed on an "AS IS" BASIS,
127898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
137898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * See the License for the specific language governing permissions and
147898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * limitations under the License.
157898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project */
167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <assert.h>
187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <math.h>
197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <stdio.h>
207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <string.h>
217898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <time.h>
227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include "../include/mystdlib.h"
237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include "../include/ngram.h"
247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
257898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectnamespace ime_pinyin {
267898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
277898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#define ADD_COUNT 0.3
287898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
// qsort-style comparator for two doubles. Returns -1 when the value behind p1
// is smaller than the one behind p2, 1 when it is larger, and 0 otherwise.
int comp_double(const void *p1, const void *p2) {
  const double lhs = *static_cast<const double*>(p1);
  const double rhs = *static_cast<const double*>(p2);
  if (lhs < rhs)
    return -1;
  return (lhs > rhs) ? 1 : 0;
}
367898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
// Distance between a frequency and a code value, measured on a log scale and
// weighted by the frequency itself: freq * |ln(freq) - ln(code)|.
inline double distance(double freq, double code) {
  const double log_gap = log(freq) - log(code);
  return freq * fabs(log_gap);
}
417898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
427898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// Find the index of the code value which is nearest to the given freq
437898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectint qsearch_nearest(double code_book[], double freq, int start, int end) {
447898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (start == end)
457898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return start;
467898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
477898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (start + 1 == end) {
487898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    if (distance(freq, code_book[end]) > distance(freq, code_book[start]))
497898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      return start;
507898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return end;
517898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
527898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
537898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  int mid = (start + end) / 2;
547898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
557898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (code_book[mid] > freq)
567898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return qsearch_nearest(code_book, freq, start, mid);
577898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  else
587898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return qsearch_nearest(code_book, freq, mid, end);
597898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
607898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
617898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectsize_t update_code_idx(double freqs[], size_t num, double code_book[],
627898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project                       CODEBOOK_TYPE *code_idx) {
637898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  size_t changed = 0;
647898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  for (size_t pos = 0; pos < num; pos++) {
657898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    CODEBOOK_TYPE idx;
667898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1);
677898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    if (idx != code_idx[pos])
687898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      changed++;
697898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    code_idx[pos] = idx;
707898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
717898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  return changed;
727898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
737898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
747898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectdouble recalculate_kernel(double freqs[], size_t num, double code_book[],
757898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project                          CODEBOOK_TYPE *code_idx) {
767898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  double ret = 0;
777898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
787898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  size_t *item_num =  new size_t[kCodeBookSize];
797898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  assert(item_num);
807898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  memset(item_num, 0, sizeof(size_t) * kCodeBookSize);
817898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
827898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  double *cb_new = new double[kCodeBookSize];
837898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  assert(cb_new);
847898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  memset(cb_new, 0, sizeof(double) * kCodeBookSize);
857898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
867898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  for (size_t pos = 0; pos < num; pos++) {
877898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    ret += distance(freqs[pos], code_book[code_idx[pos]]);
887898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
897898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    cb_new[code_idx[pos]] += freqs[pos];
907898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    item_num[code_idx[pos]] += 1;
917898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
927898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
937898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  for (size_t code = 0; code < kCodeBookSize; code++) {
947898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    assert(item_num[code] > 0);
957898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    code_book[code] = cb_new[code] / item_num[code];
967898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
977898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
987898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  delete [] item_num;
997898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  delete [] cb_new;
1007898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1017898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  return ret;
1027898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
1037898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1047898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectvoid iterate_codes(double freqs[], size_t num, double code_book[],
1057898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project                   CODEBOOK_TYPE *code_idx) {
1067898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  size_t iter_num = 0;
1077898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  double delta_last = 0;
1087898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  do {
1097898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    size_t changed = update_code_idx(freqs, num, code_book, code_idx);
1107898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1117898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    double delta = recalculate_kernel(freqs, num, code_book, code_idx);
1127898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1137898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    if (kPrintDebug0) {
1147898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      printf("---Unigram codebook iteration: %d : %d, %.9f\n",
1157898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project             iter_num, changed, delta);
1167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    }
1177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    iter_num++;
1187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    if (iter_num > 1 &&
1207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project        (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001))
1217898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      break;
1227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    delta_last = delta;
1237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  } while (true);
1247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
1257898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1267898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1277898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source ProjectNGram* NGram::instance_ = NULL;
1287898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1297898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source ProjectNGram::NGram() {
1307898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  initialized_ = false;
1317898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  idx_num_ = 0;
1327898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  lma_freq_idx_ = NULL;
1337898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  sys_score_compensation_ = 0;
1347898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1357898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#ifdef ___BUILD_MODEL___
1367898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  freq_codes_df_ = NULL;
1377898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#endif
1387898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  freq_codes_ = NULL;
1397898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
1407898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1417898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source ProjectNGram::~NGram() {
1427898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL != lma_freq_idx_)
1437898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    free(lma_freq_idx_);
1447898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1457898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#ifdef ___BUILD_MODEL___
1467898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL != freq_codes_df_)
1477898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    free(freq_codes_df_);
1487898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#endif
1497898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1507898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL != freq_codes_)
1517898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    free(freq_codes_);
1527898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
1537898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1547898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source ProjectNGram& NGram::get_instance() {
1557898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL == instance_)
1567898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    instance_ = new NGram();
1577898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  return *instance_;
1587898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
1597898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1607898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectbool NGram::save_ngram(FILE *fp) {
1617898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (!initialized_ || NULL == fp)
1627898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
1637898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1647898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (0 == idx_num_ || NULL == freq_codes_ ||  NULL == lma_freq_idx_)
1657898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
1667898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1677898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 1)
1687898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
1697898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1707898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
1717898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      kCodeBookSize)
1727898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
1737898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1747898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
1757898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
1767898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1777898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  return true;
1787898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
1797898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1807898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectbool NGram::load_ngram(FILE *fp) {
1817898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL == fp)
1827898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
1837898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1847898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  initialized_ = false;
1857898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1867898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 )
1877898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
1887898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1897898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL != lma_freq_idx_)
1907898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    free(lma_freq_idx_);
1917898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1927898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL != freq_codes_)
1937898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    free(freq_codes_);
1947898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1957898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  lma_freq_idx_ = static_cast<CODEBOOK_TYPE*>
1967898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project                  (malloc(idx_num_ * sizeof(CODEBOOK_TYPE)));
1977898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  freq_codes_ = static_cast<LmaScoreType*>
1987898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      (malloc(kCodeBookSize * sizeof(LmaScoreType)));
1997898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2007898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL == lma_freq_idx_ || NULL == freq_codes_)
2017898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
2027898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2037898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
2047898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      kCodeBookSize)
2057898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
2067898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2077898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
2087898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
2097898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2107898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  initialized_ = true;
2117898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2127898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  total_freq_none_sys_ = 0;
2137898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  return true;
2147898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
2157898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectvoid NGram::set_total_freq_none_sys(size_t freq_none_sys) {
2177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  total_freq_none_sys_ = freq_none_sys;
2187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (0 == total_freq_none_sys_) {
2197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    sys_score_compensation_ = 0;
2207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  } else {
2217ebaf2381c91649d700097491af171c7e486bf6fGenqing Wu    double factor = static_cast<double>(kSysDictTotalFreq) / (
2227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project        kSysDictTotalFreq + total_freq_none_sys_);
2237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    sys_score_compensation_ = static_cast<float>(
2247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project        log(factor) * kLogValueAmplifier);
2257898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
2267898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
2277898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2287898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// The caller makes sure this oject is initialized.
2297898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectfloat NGram::get_uni_psb(LemmaIdType lma_id) {
2307898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  return  static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) +
2317898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      sys_score_compensation_;
2327898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
2337898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2347898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectfloat NGram::convert_psb_to_score(double psb) {
2357898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  float score = static_cast<float>(
2367898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      log(psb) * static_cast<double>(kLogValueAmplifier));
2377898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (score > static_cast<float>(kMaxScore)) {
2387898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    score = static_cast<float>(kMaxScore);
2397898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
2407898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  return score;
2417898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
2427898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2437898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#ifdef ___BUILD_MODEL___
2447898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectbool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num,
2457898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project                          LemmaIdType next_idx_unused) {
2467898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1)
2477898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
2487898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2497898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  double total_freq = 0;
2507898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  double *freqs = new double[next_idx_unused];
2517898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL == freqs)
2527898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    return false;
2537898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2547898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  freqs[0] = ADD_COUNT;
2557898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  total_freq += freqs[0];
2567898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  LemmaIdType idx_now = 0;
2577898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  for (size_t pos = 0; pos < lemma_num; pos++) {
2587898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    if (lemma_arr[pos].idx_by_hz == idx_now)
2597898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      continue;
2607898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    idx_now++;
2617898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2627898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    assert(lemma_arr[pos].idx_by_hz == idx_now);
2637898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2647898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    freqs[idx_now] = lemma_arr[pos].freq;
2657898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    if (freqs[idx_now] <= 0)
2667898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      freqs[idx_now] = 0.3;
2677898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2687898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    total_freq += freqs[idx_now];
2697898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
2707898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2717898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  double max_freq = 0;
2727898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  idx_num_ = idx_now + 1;
2737898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  assert(idx_now + 1 == next_idx_unused);
2747898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2757898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  for (size_t pos = 0; pos < idx_num_; pos++) {
2767898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    freqs[pos] = freqs[pos] / total_freq;
2777898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    assert(freqs[pos] > 0);
2787898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    if (freqs[pos] > max_freq)
2797898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      max_freq = freqs[pos];
2807898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
2817898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2827898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // calculate the code book
2837898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL == freq_codes_df_)
2847898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    freq_codes_df_ = new double[kCodeBookSize];
2857898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  assert(freq_codes_df_);
2867898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize);
2877898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2887898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL == freq_codes_)
2897898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    freq_codes_ = new LmaScoreType[kCodeBookSize];
2907898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  assert(freq_codes_);
2917898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize);
2927898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2937898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  size_t freq_pos = 0;
2947898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
2957898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    bool found = true;
2967898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
2977898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    while (found) {
2987898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      found = false;
2997898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      double cand = freqs[freq_pos];
3007898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      for (size_t i = 0; i < code_pos; i++)
3017898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project        if (freq_codes_df_[i] == cand) {
3027898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project          found = true;
3037898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project          break;
3047898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project        }
3057898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      if (found)
3067898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project        freq_pos++;
3077898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    }
3087898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
3097898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    freq_codes_df_[code_pos] = freqs[freq_pos];
3107898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    freq_pos++;
3117898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
3127898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
3137898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);
3147898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
3157898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (NULL == lma_freq_idx_)
3167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_];
3177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  assert(lma_freq_idx_);
3187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
3197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_);
3207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
3217898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  delete [] freqs;
3227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
3237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  if (kPrintDebug0) {
3247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    printf("\n------Language Model Unigram Codebook------\n");
3257898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
3267898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
3277898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
3287898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    double log_score = log(freq_codes_df_[code_pos]);
3297898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
3307898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    if (kPrintDebug0) {
3317898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project      printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n",
3327898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project             code_pos, freq_codes_df_[code_pos], log_score, final_score);
3337898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    }
3347898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project    freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
3357898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  }
3367898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
3377898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  initialized_ = true;
3387898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  return true;
3397898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
3407898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#endif
3417898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
3427898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}  // namespace ime_pinyin
343