1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <assert.h>
18#include <math.h>
19#include <stdio.h>
20#include <string.h>
21#include <time.h>
22#include "../include/mystdlib.h"
23#include "../include/ngram.h"
24
25namespace ime_pinyin {
26
27#define ADD_COUNT 0.3
28
29int comp_double(const void *p1, const void *p2) {
30  if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2))
31    return -1;
32  if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2))
33    return 1;
34  return 0;
35}
36
37inline double distance(double freq, double code) {
38  // return fabs(freq - code);
39  return freq * fabs(log(freq) - log(code));
40}
41
42// Find the index of the code value which is nearest to the given freq
43int qsearch_nearest(double code_book[], double freq, int start, int end) {
44  if (start == end)
45    return start;
46
47  if (start + 1 == end) {
48    if (distance(freq, code_book[end]) > distance(freq, code_book[start]))
49      return start;
50    return end;
51  }
52
53  int mid = (start + end) / 2;
54
55  if (code_book[mid] > freq)
56    return qsearch_nearest(code_book, freq, start, mid);
57  else
58    return qsearch_nearest(code_book, freq, mid, end);
59}
60
61size_t update_code_idx(double freqs[], size_t num, double code_book[],
62                       CODEBOOK_TYPE *code_idx) {
63  size_t changed = 0;
64  for (size_t pos = 0; pos < num; pos++) {
65    CODEBOOK_TYPE idx;
66    idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1);
67    if (idx != code_idx[pos])
68      changed++;
69    code_idx[pos] = idx;
70  }
71  return changed;
72}
73
74double recalculate_kernel(double freqs[], size_t num, double code_book[],
75                          CODEBOOK_TYPE *code_idx) {
76  double ret = 0;
77
78  size_t *item_num =  new size_t[kCodeBookSize];
79  assert(item_num);
80  memset(item_num, 0, sizeof(size_t) * kCodeBookSize);
81
82  double *cb_new = new double[kCodeBookSize];
83  assert(cb_new);
84  memset(cb_new, 0, sizeof(double) * kCodeBookSize);
85
86  for (size_t pos = 0; pos < num; pos++) {
87    ret += distance(freqs[pos], code_book[code_idx[pos]]);
88
89    cb_new[code_idx[pos]] += freqs[pos];
90    item_num[code_idx[pos]] += 1;
91  }
92
93  for (size_t code = 0; code < kCodeBookSize; code++) {
94    assert(item_num[code] > 0);
95    code_book[code] = cb_new[code] / item_num[code];
96  }
97
98  delete [] item_num;
99  delete [] cb_new;
100
101  return ret;
102}
103
104void iterate_codes(double freqs[], size_t num, double code_book[],
105                   CODEBOOK_TYPE *code_idx) {
106  size_t iter_num = 0;
107  double delta_last = 0;
108  do {
109    size_t changed = update_code_idx(freqs, num, code_book, code_idx);
110
111    double delta = recalculate_kernel(freqs, num, code_book, code_idx);
112
113    if (kPrintDebug0) {
114      printf("---Unigram codebook iteration: %d : %d, %.9f\n",
115             iter_num, changed, delta);
116    }
117    iter_num++;
118
119    if (iter_num > 1 &&
120        (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001))
121      break;
122    delta_last = delta;
123  } while (true);
124}
125
126
127NGram* NGram::instance_ = NULL;
128
129NGram::NGram() {
130  initialized_ = false;
131  idx_num_ = 0;
132  lma_freq_idx_ = NULL;
133  sys_score_compensation_ = 0;
134
135#ifdef ___BUILD_MODEL___
136  freq_codes_df_ = NULL;
137#endif
138  freq_codes_ = NULL;
139}
140
141NGram::~NGram() {
142  if (NULL != lma_freq_idx_)
143    free(lma_freq_idx_);
144
145#ifdef ___BUILD_MODEL___
146  if (NULL != freq_codes_df_)
147    free(freq_codes_df_);
148#endif
149
150  if (NULL != freq_codes_)
151    free(freq_codes_);
152}
153
154NGram& NGram::get_instance() {
155  if (NULL == instance_)
156    instance_ = new NGram();
157  return *instance_;
158}
159
160bool NGram::save_ngram(FILE *fp) {
161  if (!initialized_ || NULL == fp)
162    return false;
163
164  if (0 == idx_num_ || NULL == freq_codes_ ||  NULL == lma_freq_idx_)
165    return false;
166
167  if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 1)
168    return false;
169
170  if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
171      kCodeBookSize)
172    return false;
173
174  if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
175    return false;
176
177  return true;
178}
179
180bool NGram::load_ngram(FILE *fp) {
181  if (NULL == fp)
182    return false;
183
184  initialized_ = false;
185
186  if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 )
187    return false;
188
189  if (NULL != lma_freq_idx_)
190    free(lma_freq_idx_);
191
192  if (NULL != freq_codes_)
193    free(freq_codes_);
194
195  lma_freq_idx_ = static_cast<CODEBOOK_TYPE*>
196                  (malloc(idx_num_ * sizeof(CODEBOOK_TYPE)));
197  freq_codes_ = static_cast<LmaScoreType*>
198      (malloc(kCodeBookSize * sizeof(LmaScoreType)));
199
200  if (NULL == lma_freq_idx_ || NULL == freq_codes_)
201    return false;
202
203  if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
204      kCodeBookSize)
205    return false;
206
207  if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_)
208    return false;
209
210  initialized_ = true;
211
212  total_freq_none_sys_ = 0;
213  return true;
214}
215
216void NGram::set_total_freq_none_sys(size_t freq_none_sys) {
217  total_freq_none_sys_ = freq_none_sys;
218  if (0 == total_freq_none_sys_) {
219    sys_score_compensation_ = 0;
220  } else {
221    double factor = static_cast<double>(kSysDictTotalFreq) / (
222        kSysDictTotalFreq + total_freq_none_sys_);
223    sys_score_compensation_ = static_cast<float>(
224        log(factor) * kLogValueAmplifier);
225  }
226}
227
228// The caller makes sure this oject is initialized.
229float NGram::get_uni_psb(LemmaIdType lma_id) {
230  return  static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) +
231      sys_score_compensation_;
232}
233
234float NGram::convert_psb_to_score(double psb) {
235  float score = static_cast<float>(
236      log(psb) * static_cast<double>(kLogValueAmplifier));
237  if (score > static_cast<float>(kMaxScore)) {
238    score = static_cast<float>(kMaxScore);
239  }
240  return score;
241}
242
243#ifdef ___BUILD_MODEL___
244bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num,
245                          LemmaIdType next_idx_unused) {
246  if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1)
247    return false;
248
249  double total_freq = 0;
250  double *freqs = new double[next_idx_unused];
251  if (NULL == freqs)
252    return false;
253
254  freqs[0] = ADD_COUNT;
255  total_freq += freqs[0];
256  LemmaIdType idx_now = 0;
257  for (size_t pos = 0; pos < lemma_num; pos++) {
258    if (lemma_arr[pos].idx_by_hz == idx_now)
259      continue;
260    idx_now++;
261
262    assert(lemma_arr[pos].idx_by_hz == idx_now);
263
264    freqs[idx_now] = lemma_arr[pos].freq;
265    if (freqs[idx_now] <= 0)
266      freqs[idx_now] = 0.3;
267
268    total_freq += freqs[idx_now];
269  }
270
271  double max_freq = 0;
272  idx_num_ = idx_now + 1;
273  assert(idx_now + 1 == next_idx_unused);
274
275  for (size_t pos = 0; pos < idx_num_; pos++) {
276    freqs[pos] = freqs[pos] / total_freq;
277    assert(freqs[pos] > 0);
278    if (freqs[pos] > max_freq)
279      max_freq = freqs[pos];
280  }
281
282  // calculate the code book
283  if (NULL == freq_codes_df_)
284    freq_codes_df_ = new double[kCodeBookSize];
285  assert(freq_codes_df_);
286  memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize);
287
288  if (NULL == freq_codes_)
289    freq_codes_ = new LmaScoreType[kCodeBookSize];
290  assert(freq_codes_);
291  memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize);
292
293  size_t freq_pos = 0;
294  for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
295    bool found = true;
296
297    while (found) {
298      found = false;
299      double cand = freqs[freq_pos];
300      for (size_t i = 0; i < code_pos; i++)
301        if (freq_codes_df_[i] == cand) {
302          found = true;
303          break;
304        }
305      if (found)
306        freq_pos++;
307    }
308
309    freq_codes_df_[code_pos] = freqs[freq_pos];
310    freq_pos++;
311  }
312
313  myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);
314
315  if (NULL == lma_freq_idx_)
316    lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_];
317  assert(lma_freq_idx_);
318
319  iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_);
320
321  delete [] freqs;
322
323  if (kPrintDebug0) {
324    printf("\n------Language Model Unigram Codebook------\n");
325  }
326
327  for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
328    double log_score = log(freq_codes_df_[code_pos]);
329    float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
330    if (kPrintDebug0) {
331      printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n",
332             code_pos, freq_codes_df_[code_pos], log_score, final_score);
333    }
334    freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
335  }
336
337  initialized_ = true;
338  return true;
339}
340#endif
341
342}  // namespace ime_pinyin
343