1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <assert.h>
18#include <stdlib.h>
19#include <stdio.h>
20#include <string.h>
21#include <math.h>
22#include "../include/spellingtable.h"
23
24namespace ime_pinyin {
25
26#ifdef ___BUILD_MODEL___
27
28const char SpellingTable::
29    kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"};
30
31// "" is the biggest, so that all empty strings will be moved to the end
32// _eb mean empty is biggest
33int compare_raw_spl_eb(const void* p1, const void* p2) {
34  if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0])
35    return 1;
36
37  if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0])
38    return -1;
39
40  return strcmp((static_cast<const RawSpelling*>(p1))->str,
41                (static_cast<const RawSpelling*>(p2))->str);
42}
43
44size_t get_odd_next(size_t value) {
45  size_t v_next = value;
46  while (true) {
47    size_t v_next_sqrt = (size_t)sqrt(v_next);
48
49    bool is_odd = true;
50    for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) {
51      if (v_next % v_dv == 0) {
52        is_odd = false;
53        break;
54      }
55    }
56
57    if (is_odd)
58      return v_next;
59
60    v_next++;
61  }
62
63  // never reach here
64  return 0;
65}
66
67SpellingTable::SpellingTable() {
68  need_score_ = false;
69  raw_spellings_ = NULL;
70  spelling_buf_ = NULL;
71  spelling_num_ = 0;
72  total_freq_ = 0;
73  frozen_ = true;
74}
75
76SpellingTable::~SpellingTable() {
77  free_resource();
78}
79
80size_t SpellingTable::get_hash_pos(const char* spelling_str) {
81  size_t hash_pos = 0;
82  for (size_t pos = 0; pos < spelling_size_; pos++) {
83    if ('\0' == spelling_str[pos])
84      break;
85    hash_pos += (size_t)spelling_str[pos];
86  }
87
88  hash_pos = hash_pos % spelling_max_num_;
89  return hash_pos;
90}
91
92size_t SpellingTable::hash_pos_next(size_t hash_pos) {
93  hash_pos += 123;
94  hash_pos = hash_pos % spelling_max_num_;
95  return hash_pos;
96}
97
98void SpellingTable::free_resource() {
99  if (NULL != raw_spellings_)
100    delete [] raw_spellings_;
101  raw_spellings_ = NULL;
102
103  if (NULL != spelling_buf_)
104    delete [] spelling_buf_;
105  spelling_buf_ = NULL;
106}
107
108bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num,
109                               bool need_score) {
110  if (pure_spl_size == 0 || spl_max_num ==0)
111    return false;
112
113  need_score_ = need_score;
114
115  free_resource();
116
117  spelling_size_ = pure_spl_size + 1;
118  if (need_score)
119    spelling_size_ += 1;
120  spelling_max_num_ = get_odd_next(spl_max_num);
121  spelling_num_ = 0;
122
123  raw_spellings_ = new RawSpelling[spelling_max_num_];
124  spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)];
125  if (NULL == raw_spellings_ || NULL == spelling_buf_) {
126    free_resource();
127    return false;
128  }
129
130  memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling));
131  memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_));
132  frozen_ = false;
133  total_freq_ = 0;
134  return true;
135}
136
137bool SpellingTable::put_spelling(const char* spelling_str, double freq) {
138  if (frozen_ || NULL == spelling_str)
139    return false;
140
141  for (size_t pos = 0; pos < kNotSupportNum; pos++) {
142    if (strcmp(spelling_str, kNotSupportList[pos]) == 0) {
143      return false;
144    }
145  }
146
147  total_freq_ += freq;
148
149  size_t hash_pos = get_hash_pos(spelling_str);
150
151  raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
152
153  if (strncmp(raw_spellings_[hash_pos].str, spelling_str,
154              spelling_size_ - 1) == 0) {
155    raw_spellings_[hash_pos].freq += freq;
156    return true;
157  }
158
159  size_t hash_pos_ori = hash_pos;
160
161  while (true) {
162    if (strncmp(raw_spellings_[hash_pos].str,
163                spelling_str, spelling_size_ - 1) == 0) {
164      raw_spellings_[hash_pos].freq += freq;
165      return true;
166    }
167
168    if ('\0' == raw_spellings_[hash_pos].str[0]) {
169      raw_spellings_[hash_pos].freq += freq;
170      strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1);
171      raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0';
172      spelling_num_++;
173      return true;
174    }
175
176    hash_pos = hash_pos_next(hash_pos);
177    if (hash_pos_ori == hash_pos)
178      return false;
179  }
180
181  // never reach here
182  return false;
183}
184
185bool SpellingTable::contain(const char* spelling_str) {
186  if (NULL == spelling_str || NULL == spelling_buf_ || frozen_)
187    return false;
188
189  size_t hash_pos = get_hash_pos(spelling_str);
190
191  if ('\0' == raw_spellings_[hash_pos].str[0])
192    return false;
193
194  if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
195      == 0)
196    return true;
197
198  size_t hash_pos_ori = hash_pos;
199
200  while (true) {
201    hash_pos = hash_pos_next(hash_pos);
202    if (hash_pos_ori == hash_pos)
203      return false;
204
205    if ('\0' == raw_spellings_[hash_pos].str[0])
206      return false;
207
208    if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1)
209        == 0)
210      return true;
211  }
212
213  // never reach here
214  return false;
215}
216
217const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) {
218  if (NULL == raw_spellings_ || NULL == spelling_buf_ ||
219      NULL == item_size || NULL == spl_num)
220    return NULL;
221
222  qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling),
223        compare_raw_spl_eb);
224
225  // After sorting, only the first spelling_num_ items are valid.
226  // Copy them to the destination buffer.
227  for (size_t pos = 0; pos < spelling_num_; pos++) {
228    strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str,
229            spelling_size_);
230  }
231
232  if (need_score_) {
233    if (kPrintDebug0)
234      printf("------------Spelling Possiblities--------------\n");
235
236    double max_score = 0;
237    double min_score = 0;
238
239    // After sorting, only the first spelling_num_ items are valid.
240    for (size_t pos = 0; pos < spelling_num_; pos++) {
241      raw_spellings_[pos].freq /= total_freq_;
242      if (need_score_) {
243        if (0 == pos) {
244          max_score = raw_spellings_[0].freq;
245          min_score = max_score;
246        } else {
247          if (raw_spellings_[pos].freq > max_score)
248            max_score = raw_spellings_[pos].freq;
249          if (raw_spellings_[pos].freq < min_score)
250            min_score = raw_spellings_[pos].freq;
251        }
252      }
253    }
254
255    if (kPrintDebug0)
256      printf("-----max psb: %f, min psb: %f\n", max_score, min_score);
257
258    max_score = log(max_score);
259    min_score = log(min_score);
260
261    if (kPrintDebug0)
262      printf("-----max log value: %f, min log value: %f\n",
263             max_score, min_score);
264
265    // The absolute value of min_score is bigger than that of max_score because
266    // both of them are negative after log function.
267    score_amplifier_ = 1.0 * 255 / min_score;
268
269    double average_score = 0;
270    for (size_t pos = 0; pos < spelling_num_; pos++) {
271      double score = log(raw_spellings_[pos].freq) * score_amplifier_;
272      assert(score >= 0);
273
274      average_score += score;
275
276      // Because of calculation precision issue, score might be a little bigger
277      // than 255 after being amplified.
278      if (score > 255)
279        score = 255;
280      char *this_spl_buf = spelling_buf_ + pos * spelling_size_;
281      this_spl_buf[spelling_size_ - 1] =
282          static_cast<char>((unsigned char)score);
283
284      if (kPrintDebug0) {
285        printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf,
286               (unsigned char)this_spl_buf[spelling_size_ -1]);
287      }
288    }
289    average_score /= spelling_num_;
290    assert(average_score <= 255);
291    average_score_ = static_cast<uint8>(average_score);
292
293    if (kPrintDebug0)
294      printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_,
295             average_score_);
296  }
297
298  *item_size = spelling_size_;
299  *spl_num = spelling_num_;
300  frozen_ = true;
301  return spelling_buf_;
302}
303
304float SpellingTable::get_score_amplifier() {
305  return static_cast<float>(score_amplifier_);
306}
307
308unsigned char SpellingTable::get_average_score() {
309  return average_score_;
310}
311
312#endif  // ___BUILD_MODEL___
313}  // namespace ime_pinyin
314