1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <assert.h>
18#include <stdio.h>
19#include <stdlib.h>
20#include <string.h>
21
22#include "../include/dictbuilder.h"
23#include "../include/dicttrie.h"
24#include "../include/mystdlib.h"
25#include "../include/ngram.h"
26#include "../include/searchutility.h"
27#include "../include/spellingtable.h"
28#include "../include/spellingtrie.h"
29#include "../include/splparser.h"
30#include "../include/utf16reader.h"
31
32namespace ime_pinyin {
33
34#ifdef ___BUILD_MODEL___
35
36static const size_t kReadBufLen = 512;
37static const size_t kSplTableHashLen = 2000;
38
39// Compare a SingleCharItem, first by Hanzis, then by spelling ids, then by
40// frequencies.
41int cmp_scis_hz_splid_freq(const void* p1, const void* p2) {
42  const SingleCharItem *s1, *s2;
43  s1 = static_cast<const SingleCharItem*>(p1);
44  s2 = static_cast<const SingleCharItem*>(p2);
45
46  if (s1->hz < s2->hz)
47    return -1;
48  if (s1->hz > s2->hz)
49    return 1;
50
51  if (s1->splid.half_splid < s2->splid.half_splid)
52    return -1;
53  if (s1->splid.half_splid > s2->splid.half_splid)
54    return 1;
55
56  if (s1->splid.full_splid < s2->splid.full_splid)
57    return -1;
58  if (s1->splid.full_splid > s2->splid.full_splid)
59    return 1;
60
61  if (s1->freq > s2->freq)
62    return -1;
63  if (s1->freq < s2->freq)
64    return 1;
65  return 0;
66}
67
68int cmp_scis_hz_splid(const void* p1, const void* p2) {
69  const SingleCharItem *s1, *s2;
70  s1 = static_cast<const SingleCharItem*>(p1);
71  s2 = static_cast<const SingleCharItem*>(p2);
72
73  if (s1->hz < s2->hz)
74    return -1;
75  if (s1->hz > s2->hz)
76    return 1;
77
78  if (s1->splid.half_splid < s2->splid.half_splid)
79    return -1;
80  if (s1->splid.half_splid > s2->splid.half_splid)
81    return 1;
82
83  if (s1->splid.full_splid < s2->splid.full_splid)
84    return -1;
85  if (s1->splid.full_splid > s2->splid.full_splid)
86    return 1;
87
88  return 0;
89}
90
91int cmp_lemma_entry_hzs(const void* p1, const void* p2) {
92  size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
93  size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
94  if (size1 < size2)
95    return -1;
96  else if (size1 > size2)
97    return 1;
98
99  return utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
100                      ((const LemmaEntry*)p2)->hanzi_str);
101}
102
103int compare_char16(const void* p1, const void* p2) {
104  if (*((const char16*)p1) < *((const char16*)p2))
105    return -1;
106  if (*((const char16*)p1) > *((const char16*)p2))
107    return 1;
108  return 0;
109}
110
111int compare_py(const void* p1, const void* p2) {
112  int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
113                         ((const LemmaEntry*)p2)->spl_idx_arr);
114
115  if (0 != ret)
116    return ret;
117
118  return static_cast<int>(((const LemmaEntry*)p2)->freq) -
119         static_cast<int>(((const LemmaEntry*)p1)->freq);
120}
121
122// First hanzi, if the same, then Pinyin
123int cmp_lemma_entry_hzspys(const void* p1, const void* p2) {
124  size_t size1 = utf16_strlen(((const LemmaEntry*)p1)->hanzi_str);
125  size_t size2 = utf16_strlen(((const LemmaEntry*)p2)->hanzi_str);
126  if (size1 < size2)
127    return -1;
128  else if (size1 > size2)
129    return 1;
130  int ret = utf16_strcmp(((const LemmaEntry*)p1)->hanzi_str,
131                         ((const LemmaEntry*)p2)->hanzi_str);
132
133  if (0 != ret)
134    return ret;
135
136  ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
137                     ((const LemmaEntry*)p2)->spl_idx_arr);
138  return ret;
139}
140
141int compare_splid2(const void* p1, const void* p2) {
142  int ret = utf16_strcmp(((const LemmaEntry*)p1)->spl_idx_arr,
143                         ((const LemmaEntry*)p2)->spl_idx_arr);
144  return ret;
145}
146
147DictBuilder::DictBuilder() {
148  lemma_arr_ = NULL;
149  lemma_num_ = 0;
150
151  scis_ = NULL;
152  scis_num_ = 0;
153
154  lma_nodes_le0_ = NULL;
155  lma_nodes_ge1_ = NULL;
156
157  lma_nds_used_num_le0_ = 0;
158  lma_nds_used_num_ge1_ = 0;
159
160  homo_idx_buf_ = NULL;
161  homo_idx_num_eq1_ = 0;
162  homo_idx_num_gt1_ = 0;
163
164  top_lmas_ = NULL;
165  top_lmas_num_ = 0;
166
167  spl_table_ = NULL;
168  spl_parser_ = NULL;
169}
170
171DictBuilder::~DictBuilder() {
172  free_resource();
173}
174
175bool DictBuilder::alloc_resource(size_t lma_num) {
176  if (0 == lma_num)
177    return false;
178
179  free_resource();
180
181  lemma_num_ = lma_num;
182  lemma_arr_ = new LemmaEntry[lemma_num_];
183
184  top_lmas_num_ = 0;
185  top_lmas_ = new LemmaEntry[kTopScoreLemmaNum];
186
187  // New the scis_ buffer to the possible maximum size.
188  scis_num_ = lemma_num_ * kMaxLemmaSize;
189  scis_ = new SingleCharItem[scis_num_];
190
191  // The root and first level nodes is less than kMaxSpellingNum + 1
192  lma_nds_used_num_le0_ = 0;
193  lma_nodes_le0_ = new LmaNodeLE0[kMaxSpellingNum + 1];
194
195  // Other nodes is less than lemma_num
196  lma_nds_used_num_ge1_ = 0;
197  lma_nodes_ge1_ = new LmaNodeGE1[lemma_num_];
198
199  homo_idx_buf_ = new LemmaIdType[lemma_num_];
200  spl_table_ = new SpellingTable();
201  spl_parser_ = new SpellingParser();
202
203  if (NULL == lemma_arr_ || NULL == top_lmas_ ||
204      NULL == scis_ || NULL == spl_table_ ||
205      NULL == spl_parser_ || NULL == lma_nodes_le0_ ||
206      NULL == lma_nodes_ge1_ || NULL == homo_idx_buf_) {
207    free_resource();
208    return false;
209  }
210
211  memset(lemma_arr_, 0, sizeof(LemmaEntry) * lemma_num_);
212  memset(scis_, 0, sizeof(SingleCharItem) * scis_num_);
213  memset(lma_nodes_le0_, 0, sizeof(LmaNodeLE0) * (kMaxSpellingNum + 1));
214  memset(lma_nodes_ge1_, 0, sizeof(LmaNodeGE1) * lemma_num_);
215  memset(homo_idx_buf_, 0, sizeof(LemmaIdType) * lemma_num_);
216  spl_table_->init_table(kMaxPinyinSize, kSplTableHashLen, true);
217
218  return true;
219}
220
221char16* DictBuilder::read_valid_hanzis(const char *fn_validhzs, size_t *num) {
222  if (NULL == fn_validhzs || NULL == num)
223    return NULL;
224
225  *num = 0;
226  FILE *fp = fopen(fn_validhzs, "rb");
227  if (NULL == fp)
228    return NULL;
229
230  char16 utf16header;
231  if (fread(&utf16header, sizeof(char16), 1, fp) != 1 ||
232      0xfeff != utf16header) {
233    fclose(fp);
234    return NULL;
235  }
236
237  fseek(fp, 0, SEEK_END);
238  *num = ftell(fp) / sizeof(char16);
239  assert(*num >= 1);
240  *num -= 1;
241
242  char16 *hzs = new char16[*num];
243  if (NULL == hzs) {
244    fclose(fp);
245    return NULL;
246  }
247
248  fseek(fp, 2, SEEK_SET);
249
250  if (fread(hzs, sizeof(char16), *num, fp) != *num) {
251    fclose(fp);
252    delete [] hzs;
253    return NULL;
254  }
255  fclose(fp);
256
257  myqsort(hzs, *num, sizeof(char16), compare_char16);
258  return hzs;
259}
260
261bool DictBuilder::hz_in_hanzis_list(const char16 *hzs, size_t hzs_len,
262                                    char16 hz) {
263  if (NULL == hzs)
264    return false;
265
266  char16 *found;
267  found = static_cast<char16*>(
268      mybsearch(&hz, hzs, hzs_len, sizeof(char16), compare_char16));
269  if (NULL == found)
270    return false;
271
272  assert(*found == hz);
273  return true;
274}
275
276// The caller makes sure that the parameters are valid.
277bool DictBuilder::str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
278                                     const char16 *str, size_t str_len) {
279  if (NULL == hzs || NULL == str)
280    return false;
281
282  for (size_t pos = 0; pos < str_len; pos++) {
283    if (!hz_in_hanzis_list(hzs, hzs_len, str[pos]))
284      return false;
285  }
286  return true;
287}
288
289void DictBuilder::get_top_lemmas() {
290  top_lmas_num_ = 0;
291  if (NULL == lemma_arr_)
292    return;
293
294  for (size_t pos = 0; pos < lemma_num_; pos++) {
295    if (0 == top_lmas_num_) {
296      top_lmas_[0] = lemma_arr_[pos];
297      top_lmas_num_ = 1;
298      continue;
299    }
300
301    if (lemma_arr_[pos].freq > top_lmas_[top_lmas_num_ - 1].freq) {
302      if (kTopScoreLemmaNum > top_lmas_num_)
303        top_lmas_num_ += 1;
304
305      size_t move_pos;
306      for (move_pos = top_lmas_num_ - 1; move_pos > 0; move_pos--) {
307        top_lmas_[move_pos] = top_lmas_[move_pos - 1];
308        if (0 == move_pos - 1 ||
309            (move_pos - 1 > 0 &&
310             top_lmas_[move_pos - 2].freq > lemma_arr_[pos].freq)) {
311          break;
312        }
313      }
314      assert(move_pos > 0);
315      top_lmas_[move_pos - 1] = lemma_arr_[pos];
316    } else if (kTopScoreLemmaNum > top_lmas_num_) {
317      top_lmas_[top_lmas_num_] = lemma_arr_[pos];
318      top_lmas_num_ += 1;
319    }
320  }
321
322  if (kPrintDebug0) {
323    printf("\n------Top Lemmas------------------\n");
324    for (size_t pos = 0; pos < top_lmas_num_; pos++) {
325      printf("--%d, idx:%06d, score:%.5f\n", pos, top_lmas_[pos].idx_by_hz,
326             top_lmas_[pos].freq);
327    }
328  }
329}
330
331void DictBuilder::free_resource() {
332  if (NULL != lemma_arr_)
333    delete [] lemma_arr_;
334
335  if (NULL != scis_)
336    delete [] scis_;
337
338  if (NULL != lma_nodes_le0_)
339    delete [] lma_nodes_le0_;
340
341  if (NULL != lma_nodes_ge1_)
342    delete [] lma_nodes_ge1_;
343
344  if (NULL != homo_idx_buf_)
345    delete [] homo_idx_buf_;
346
347  if (NULL != spl_table_)
348    delete spl_table_;
349
350  if (NULL != spl_parser_)
351    delete spl_parser_;
352
353  lemma_arr_ = NULL;
354  scis_ = NULL;
355  lma_nodes_le0_ = NULL;
356  lma_nodes_ge1_ = NULL;
357  homo_idx_buf_ = NULL;
358  spl_table_ = NULL;
359  spl_parser_ = NULL;
360
361  lemma_num_ = 0;
362  lma_nds_used_num_le0_ = 0;
363  lma_nds_used_num_ge1_ = 0;
364  homo_idx_num_eq1_ = 0;
365  homo_idx_num_gt1_ = 0;
366}
367
368size_t DictBuilder::read_raw_dict(const char* fn_raw,
369                                  const char *fn_validhzs,
370                                  size_t max_item) {
371  if (NULL == fn_raw) return 0;
372
373  Utf16Reader utf16_reader;
374  if (!utf16_reader.open(fn_raw, kReadBufLen * 10))
375    return false;
376
377  char16 read_buf[kReadBufLen];
378
379  // Read the number of lemmas in the file
380  size_t lemma_num = 240000;
381
382  // allocate resource required
383  if (!alloc_resource(lemma_num)) {
384    utf16_reader.close();
385  }
386
387  // Read the valid Hanzi list.
388  char16 *valid_hzs = NULL;
389  size_t valid_hzs_num = 0;
390  valid_hzs = read_valid_hanzis(fn_validhzs, &valid_hzs_num);
391
392  // Begin reading the lemma entries
393  for (size_t i = 0; i < max_item; i++) {
394    // read next entry
395    if (!utf16_reader.readline(read_buf, kReadBufLen)) {
396      lemma_num = i;
397      break;
398    }
399
400    size_t token_size;
401    char16 *token;
402    char16 *to_tokenize = read_buf;
403
404    // Get the Hanzi string
405    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
406    if (NULL == token) {
407      free_resource();
408      utf16_reader.close();
409      return false;
410    }
411
412    size_t lemma_size = utf16_strlen(token);
413
414    if (lemma_size > kMaxLemmaSize) {
415      i--;
416      continue;
417    }
418
419    if (lemma_size > 4) {
420      i--;
421      continue;
422    }
423
424    // Copy to the lemma entry
425    utf16_strcpy(lemma_arr_[i].hanzi_str, token);
426
427    lemma_arr_[i].hz_str_len = token_size;
428
429    // Get the freq string
430    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
431    if (NULL == token) {
432      free_resource();
433      utf16_reader.close();
434      return false;
435    }
436    lemma_arr_[i].freq = utf16_atof(token);
437
438    if (lemma_size > 1 && lemma_arr_[i].freq < 60) {
439      i--;
440      continue;
441    }
442
443    // Get GBK mark, if no valid Hanzi list available, all items which contains
444    // GBK characters will be discarded. Otherwise, all items which contains
445    // characters outside of the valid Hanzi list will be discarded.
446    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
447    assert(NULL != token);
448    int gbk_flag = utf16_atoi(token);
449    if (NULL == valid_hzs || 0 == valid_hzs_num) {
450      if (0 != gbk_flag) {
451        i--;
452        continue;
453      }
454    } else {
455      if (!str_in_hanzis_list(valid_hzs, valid_hzs_num,
456          lemma_arr_[i].hanzi_str, lemma_arr_[i].hz_str_len)) {
457        i--;
458        continue;
459      }
460    }
461
462    // Get spelling String
463    bool spelling_not_support = false;
464    for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
465         hz_pos++) {
466      // Get a Pinyin
467      token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
468      if (NULL == token) {
469        free_resource();
470        utf16_reader.close();
471        return false;
472      }
473
474      assert(utf16_strlen(token) <= kMaxPinyinSize);
475
476      utf16_strcpy_tochar(lemma_arr_[i].pinyin_str[hz_pos], token);
477
478      format_spelling_str(lemma_arr_[i].pinyin_str[hz_pos]);
479
480      // Put the pinyin to the spelling table
481      if (!spl_table_->put_spelling(lemma_arr_[i].pinyin_str[hz_pos],
482                                    lemma_arr_[i].freq)) {
483        spelling_not_support = true;
484        break;
485      }
486    }
487
488    // The whole line must have been parsed fully, otherwise discard this one.
489    token = utf16_strtok(to_tokenize, &token_size, &to_tokenize);
490    if (spelling_not_support || NULL != token) {
491      i--;
492      continue;
493    }
494  }
495
496  delete [] valid_hzs;
497  utf16_reader.close();
498
499  printf("read succesfully, lemma num: %d\n", lemma_num);
500
501  return lemma_num;
502}
503
504bool DictBuilder::build_dict(const char *fn_raw,
505                             const char *fn_validhzs,
506                             DictTrie *dict_trie) {
507  if (NULL == fn_raw || NULL == dict_trie)
508    return false;
509
510  lemma_num_ = read_raw_dict(fn_raw, fn_validhzs, 240000);
511  if (0 == lemma_num_)
512    return false;
513
514  // Arrange the spelling table, and build a spelling tree
515  // The size of an spelling. '\0' is included. If the spelling table is
516  // initialized to calculate the spelling scores, the last char in the
517  // spelling string will be score, and it is also included in spl_item_size.
518  size_t spl_item_size;
519  size_t spl_num;
520  const char* spl_buf;
521  spl_buf = spl_table_->arrange(&spl_item_size, &spl_num);
522  if (NULL == spl_buf) {
523    free_resource();
524    return false;
525  }
526
527  SpellingTrie &spl_trie = SpellingTrie::get_instance();
528
529  if (!spl_trie.construct(spl_buf, spl_item_size, spl_num,
530                          spl_table_->get_score_amplifier(),
531                          spl_table_->get_average_score())) {
532    free_resource();
533    return false;
534  }
535
536  printf("spelling tree construct successfully.\n");
537
538  // Convert the spelling string to idxs
539  for (size_t i = 0; i < lemma_num_; i++) {
540    for (size_t hz_pos = 0; hz_pos < (size_t)lemma_arr_[i].hz_str_len;
541         hz_pos++) {
542      uint16 spl_idxs[2];
543      uint16 spl_start_pos[3];
544      bool is_pre = true;
545      int spl_idx_num =
546        spl_parser_->splstr_to_idxs(lemma_arr_[i].pinyin_str[hz_pos],
547                                    strlen(lemma_arr_[i].pinyin_str[hz_pos]),
548                                    spl_idxs, spl_start_pos, 2, is_pre);
549      assert(1 == spl_idx_num);
550
551      if (spl_trie.is_half_id(spl_idxs[0])) {
552        uint16 num = spl_trie.half_to_full(spl_idxs[0], spl_idxs);
553        assert(0 != num);
554      }
555      lemma_arr_[i].spl_idx_arr[hz_pos] = spl_idxs[0];
556    }
557  }
558
559  // Sort the lemma items according to the hanzi, and give each unique item a
560  // id
561  sort_lemmas_by_hz();
562
563  scis_num_ = build_scis();
564
565  // Construct the dict list
566  dict_trie->dict_list_ = new DictList();
567  bool dl_success = dict_trie->dict_list_->init_list(scis_, scis_num_,
568                                                     lemma_arr_, lemma_num_);
569  assert(dl_success);
570
571  // Construct the NGram information
572  NGram& ngram = NGram::get_instance();
573  ngram.build_unigram(lemma_arr_, lemma_num_,
574                      lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);
575
576  // sort the lemma items according to the spelling idx string
577  myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);
578
579  get_top_lemmas();
580
581#ifdef ___DO_STATISTICS___
582  stat_init();
583#endif
584
585  lma_nds_used_num_le0_ = 1;  // The root node
586  bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_),
587                                     lemma_arr_, 0, lemma_num_, 0);
588  if (!dt_success) {
589    free_resource();
590    return false;
591  }
592
593#ifdef ___DO_STATISTICS___
594  stat_print();
595#endif
596
597  // Move the node data and homo data to the DictTrie
598  dict_trie->root_ = new LmaNodeLE0[lma_nds_used_num_le0_];
599  dict_trie->nodes_ge1_ = new LmaNodeGE1[lma_nds_used_num_ge1_];
600  size_t lma_idx_num = homo_idx_num_eq1_ + homo_idx_num_gt1_ + top_lmas_num_;
601  dict_trie->lma_idx_buf_ = new unsigned char[lma_idx_num * kLemmaIdSize];
602  assert(NULL != dict_trie->root_);
603  assert(NULL != dict_trie->lma_idx_buf_);
604  dict_trie->lma_node_num_le0_ = lma_nds_used_num_le0_;
605  dict_trie->lma_node_num_ge1_ = lma_nds_used_num_ge1_;
606  dict_trie->lma_idx_buf_len_ = lma_idx_num * kLemmaIdSize;
607  dict_trie->top_lmas_num_ = top_lmas_num_;
608
609  memcpy(dict_trie->root_, lma_nodes_le0_,
610         sizeof(LmaNodeLE0) * lma_nds_used_num_le0_);
611  memcpy(dict_trie->nodes_ge1_, lma_nodes_ge1_,
612         sizeof(LmaNodeGE1) * lma_nds_used_num_ge1_);
613
614  for (size_t pos = 0; pos < homo_idx_num_eq1_ + homo_idx_num_gt1_; pos++) {
615    id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize,
616                  homo_idx_buf_[pos]);
617  }
618
619  for (size_t pos = homo_idx_num_eq1_ + homo_idx_num_gt1_;
620       pos < lma_idx_num; pos++) {
621    LemmaIdType idx =
622        top_lmas_[pos - homo_idx_num_eq1_ - homo_idx_num_gt1_].idx_by_hz;
623    id_to_charbuf(dict_trie->lma_idx_buf_ + pos * kLemmaIdSize, idx);
624  }
625
626  if (kPrintDebug0) {
627    printf("homo_idx_num_eq1_: %d\n", homo_idx_num_eq1_);
628    printf("homo_idx_num_gt1_: %d\n", homo_idx_num_gt1_);
629    printf("top_lmas_num_: %d\n", top_lmas_num_);
630  }
631
632  free_resource();
633
634  if (kPrintDebug0) {
635    printf("Building dict succeds\n");
636  }
637  return dt_success;
638}
639
640void DictBuilder::id_to_charbuf(unsigned char *buf, LemmaIdType id) {
641  if (NULL == buf) return;
642  for (size_t pos = 0; pos < kLemmaIdSize; pos++) {
643    (buf)[pos] = (unsigned char)(id >> (pos * 8));
644  }
645}
646
647void DictBuilder::set_son_offset(LmaNodeGE1 *node, size_t offset) {
648  node->son_1st_off_l = static_cast<uint16>(offset);
649  node->son_1st_off_h = static_cast<unsigned char>(offset >> 16);
650}
651
652void DictBuilder:: set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset) {
653  node->homo_idx_buf_off_l = static_cast<uint16>(offset);
654  node->homo_idx_buf_off_h = static_cast<unsigned char>(offset >> 16);
655
656}
657
658// All spelling strings will be converted to upper case, except that
659// spellings started with "ZH"/"CH"/"SH" will be converted to
660// "Zh"/"Ch"/"Sh"
661void DictBuilder::format_spelling_str(char *spl_str) {
662  if (NULL == spl_str)
663    return;
664
665  uint16 pos = 0;
666  while ('\0' != spl_str[pos]) {
667    if (spl_str[pos] >= 'a' && spl_str[pos] <= 'z')
668      spl_str[pos] = spl_str[pos] - 'a' + 'A';
669
670    if (1 == pos && 'H' == spl_str[pos]) {
671      if ('C' == spl_str[0] || 'S' == spl_str[0] || 'Z' == spl_str[0]) {
672        spl_str[pos] = 'h';
673      }
674    }
675    pos++;
676  }
677}
678
679LemmaIdType DictBuilder::sort_lemmas_by_hz() {
680  if (NULL == lemma_arr_ || 0 == lemma_num_)
681    return 0;
682
683  myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), cmp_lemma_entry_hzs);
684
685  lemma_arr_[0].idx_by_hz = 1;
686  LemmaIdType idx_max = 1;
687  for (size_t i = 1; i < lemma_num_; i++) {
688    if (utf16_strcmp(lemma_arr_[i].hanzi_str, lemma_arr_[i-1].hanzi_str)) {
689      idx_max++;
690      lemma_arr_[i].idx_by_hz = idx_max;
691    } else {
692      idx_max++;
693      lemma_arr_[i].idx_by_hz = idx_max;
694    }
695  }
696  return idx_max + 1;
697}
698
699size_t DictBuilder::build_scis() {
700  if (NULL == scis_ || lemma_num_ * kMaxLemmaSize > scis_num_)
701    return 0;
702
703  SpellingTrie &spl_trie = SpellingTrie::get_instance();
704
705  // This first one is blank, because id 0 is invalid.
706  scis_[0].freq = 0;
707  scis_[0].hz = 0;
708  scis_[0].splid.full_splid = 0;
709  scis_[0].splid.half_splid = 0;
710  scis_num_ = 1;
711
712  // Copy the hanzis to the buffer
713  for (size_t pos = 0; pos < lemma_num_; pos++) {
714    size_t hz_num = lemma_arr_[pos].hz_str_len;
715    for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
716      scis_[scis_num_].hz = lemma_arr_[pos].hanzi_str[hzpos];
717      scis_[scis_num_].splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
718      scis_[scis_num_].splid.half_splid =
719          spl_trie.full_to_half(scis_[scis_num_].splid.full_splid);
720      if (1 == hz_num)
721        scis_[scis_num_].freq = lemma_arr_[pos].freq;
722      else
723        scis_[scis_num_].freq = 0.000001;
724      scis_num_++;
725    }
726  }
727
728  myqsort(scis_, scis_num_, sizeof(SingleCharItem), cmp_scis_hz_splid_freq);
729
730  // Remove repeated items
731  size_t unique_scis_num = 1;
732  for (size_t pos = 1; pos < scis_num_; pos++) {
733    if (scis_[pos].hz == scis_[pos - 1].hz &&
734        scis_[pos].splid.full_splid == scis_[pos - 1].splid.full_splid)
735      continue;
736    scis_[unique_scis_num] = scis_[pos];
737    scis_[unique_scis_num].splid.half_splid =
738        spl_trie.full_to_half(scis_[pos].splid.full_splid);
739    unique_scis_num++;
740  }
741
742  scis_num_ = unique_scis_num;
743
744  // Update the lemma list.
745  for (size_t pos = 0; pos < lemma_num_; pos++) {
746    size_t hz_num = lemma_arr_[pos].hz_str_len;
747    for (size_t hzpos = 0; hzpos < hz_num; hzpos++) {
748      SingleCharItem key;
749      key.hz = lemma_arr_[pos].hanzi_str[hzpos];
750      key.splid.full_splid = lemma_arr_[pos].spl_idx_arr[hzpos];
751      key.splid.half_splid = spl_trie.full_to_half(key.splid.full_splid);
752
753      SingleCharItem *found;
754      found = static_cast<SingleCharItem*>(mybsearch(&key, scis_,
755                                                     unique_scis_num,
756                                                     sizeof(SingleCharItem),
757                                                     cmp_scis_hz_splid));
758
759      assert(found);
760
761      lemma_arr_[pos].hanzi_scis_ids[hzpos] =
762          static_cast<uint16>(found - scis_);
763      lemma_arr_[pos].spl_idx_arr[hzpos] = found->splid.full_splid;
764    }
765  }
766
767  return scis_num_;
768}
769
770bool DictBuilder::construct_subset(void* parent, LemmaEntry* lemma_arr,
771                                   size_t item_start, size_t item_end,
772                                   size_t level) {
773  if (level >= kMaxLemmaSize || item_end <= item_start)
774    return false;
775
776  // 1. Scan for how many sons
777  size_t parent_son_num = 0;
778  // LemmaNode *son_1st = NULL;
779  // parent.num_of_son = 0;
780
781  LemmaEntry *lma_last_start = lemma_arr_ + item_start;
782  uint16 spl_idx_node = lma_last_start->spl_idx_arr[level];
783
784  // Scan for how many sons to be allocaed
785  for (size_t i = item_start + 1; i< item_end; i++) {
786    LemmaEntry *lma_current = lemma_arr + i;
787    uint16 spl_idx_current = lma_current->spl_idx_arr[level];
788    if (spl_idx_current != spl_idx_node) {
789      parent_son_num++;
790      spl_idx_node = spl_idx_current;
791    }
792  }
793  parent_son_num++;
794
795#ifdef ___DO_STATISTICS___
796  // Use to indicate whether all nodes of this layer have no son.
797  bool allson_noson = true;
798
799  assert(level < kMaxLemmaSize);
800  if (parent_son_num > max_sonbuf_len_[level])
801    max_sonbuf_len_[level] = parent_son_num;
802
803  total_son_num_[level] += parent_son_num;
804  total_sonbuf_num_[level] += 1;
805
806  if (parent_son_num == 1)
807    sonbufs_num1_++;
808  else
809    sonbufs_numgt1_++;
810  total_lma_node_num_ += parent_son_num;
811#endif
812
813  // 2. Update the parent's information
814  //    Update the parent's son list;
815  LmaNodeLE0 *son_1st_le0 = NULL;  // only one of le0 or ge1 is used
816  LmaNodeGE1 *son_1st_ge1 = NULL;  // only one of le0 or ge1 is used.
817  if (0 == level) {                 // the parent is root
818    (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
819      lma_nds_used_num_le0_;
820    son_1st_le0 = lma_nodes_le0_ + lma_nds_used_num_le0_;
821    lma_nds_used_num_le0_ += parent_son_num;
822
823    assert(parent_son_num <= 65535);
824    (static_cast<LmaNodeLE0*>(parent))->num_of_son =
825      static_cast<uint16>(parent_son_num);
826  } else if (1 == level) {  // the parent is a son of root
827    (static_cast<LmaNodeLE0*>(parent))->son_1st_off =
828      lma_nds_used_num_ge1_;
829    son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
830    lma_nds_used_num_ge1_ += parent_son_num;
831
832    assert(parent_son_num <= 65535);
833    (static_cast<LmaNodeLE0*>(parent))->num_of_son =
834      static_cast<uint16>(parent_son_num);
835  } else {
836    set_son_offset((static_cast<LmaNodeGE1*>(parent)),
837                   lma_nds_used_num_ge1_);
838    son_1st_ge1 = lma_nodes_ge1_ + lma_nds_used_num_ge1_;
839    lma_nds_used_num_ge1_ += parent_son_num;
840
841    assert(parent_son_num <= 255);
842    (static_cast<LmaNodeGE1*>(parent))->num_of_son =
843      (unsigned char)parent_son_num;
844  }
845
846  // 3. Now begin to construct the son one by one
847  size_t son_pos = 0;
848
849  lma_last_start = lemma_arr_ + item_start;
850  spl_idx_node = lma_last_start->spl_idx_arr[level];
851
852  size_t homo_num = 0;
853  if (lma_last_start->spl_idx_arr[level + 1] == 0)
854    homo_num = 1;
855
856  size_t item_start_next = item_start;
857
858  for (size_t i = item_start + 1; i < item_end; i++) {
859    LemmaEntry* lma_current = lemma_arr_ + i;
860    uint16 spl_idx_current = lma_current->spl_idx_arr[level];
861
862    if (spl_idx_current == spl_idx_node) {
863      if (lma_current->spl_idx_arr[level + 1] == 0)
864        homo_num++;
865    } else {
866      // Construct a node
867      LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
868      LmaNodeGE1 *node_cur_ge1 = NULL;
869      if (0 == level) {
870        node_cur_le0 = son_1st_le0 + son_pos;
871        node_cur_le0->spl_idx = spl_idx_node;
872        node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
873        node_cur_le0->son_1st_off = 0;
874        homo_idx_num_eq1_ += homo_num;
875      } else {
876        node_cur_ge1 = son_1st_ge1 + son_pos;
877        node_cur_ge1->spl_idx = spl_idx_node;
878
879        set_homo_id_buf_offset(node_cur_ge1,
880                               (homo_idx_num_eq1_ + homo_idx_num_gt1_));
881        set_son_offset(node_cur_ge1, 0);
882        homo_idx_num_gt1_ += homo_num;
883      }
884
885      if (homo_num > 0) {
886        LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
887              homo_idx_num_gt1_ - homo_num;
888        if (0 == level) {
889          assert(homo_num <= 65535);
890          node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
891        } else {
892          assert(homo_num <= 255);
893          node_cur_ge1->num_of_homo = (unsigned char)homo_num;
894        }
895
896        for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
897          idx_buf[homo_pos] = lemma_arr_[item_start_next + homo_pos].idx_by_hz;
898        }
899
900#ifdef ___DO_STATISTICS___
901        if (homo_num > max_homobuf_len_[level])
902          max_homobuf_len_[level] = homo_num;
903
904        total_homo_num_[level] += homo_num;
905#endif
906      }
907
908      if (i - item_start_next > homo_num) {
909        void *next_parent;
910        if (0 == level)
911          next_parent = static_cast<void*>(node_cur_le0);
912        else
913          next_parent = static_cast<void*>(node_cur_ge1);
914        construct_subset(next_parent, lemma_arr,
915                         item_start_next + homo_num, i, level + 1);
916#ifdef ___DO_STATISTICS___
917
918        total_node_hasson_[level] += 1;
919        allson_noson = false;
920#endif
921      }
922
923      // for the next son
924      lma_last_start = lma_current;
925      spl_idx_node = spl_idx_current;
926      item_start_next = i;
927      homo_num = 0;
928      if (lma_current->spl_idx_arr[level + 1] == 0)
929        homo_num = 1;
930
931      son_pos++;
932    }
933  }
934
935  // 4. The last one to construct
936  LmaNodeLE0 *node_cur_le0 = NULL;  // only one of them is valid
937  LmaNodeGE1 *node_cur_ge1 = NULL;
938  if (0 == level) {
939    node_cur_le0 = son_1st_le0 + son_pos;
940    node_cur_le0->spl_idx = spl_idx_node;
941    node_cur_le0->homo_idx_buf_off = homo_idx_num_eq1_ + homo_idx_num_gt1_;
942    node_cur_le0->son_1st_off = 0;
943    homo_idx_num_eq1_ += homo_num;
944  } else {
945    node_cur_ge1 = son_1st_ge1 + son_pos;
946    node_cur_ge1->spl_idx = spl_idx_node;
947
948    set_homo_id_buf_offset(node_cur_ge1,
949                           (homo_idx_num_eq1_ + homo_idx_num_gt1_));
950    set_son_offset(node_cur_ge1, 0);
951    homo_idx_num_gt1_ += homo_num;
952  }
953
954  if (homo_num > 0) {
955    LemmaIdType* idx_buf = homo_idx_buf_ + homo_idx_num_eq1_ +
956          homo_idx_num_gt1_ - homo_num;
957    if (0 == level) {
958      assert(homo_num <= 65535);
959      node_cur_le0->num_of_homo = static_cast<uint16>(homo_num);
960    } else {
961      assert(homo_num <= 255);
962      node_cur_ge1->num_of_homo = (unsigned char)homo_num;
963    }
964
965    for (size_t homo_pos = 0; homo_pos < homo_num; homo_pos++) {
966      idx_buf[homo_pos] = lemma_arr[item_start_next + homo_pos].idx_by_hz;
967    }
968
969#ifdef ___DO_STATISTICS___
970    if (homo_num > max_homobuf_len_[level])
971      max_homobuf_len_[level] = homo_num;
972
973    total_homo_num_[level] += homo_num;
974#endif
975  }
976
977  if (item_end - item_start_next > homo_num) {
978    void *next_parent;
979    if (0 == level)
980      next_parent = static_cast<void*>(node_cur_le0);
981    else
982      next_parent = static_cast<void*>(node_cur_ge1);
983    construct_subset(next_parent, lemma_arr,
984                     item_start_next + homo_num, item_end, level + 1);
985#ifdef ___DO_STATISTICS___
986
987    total_node_hasson_[level] += 1;
988    allson_noson = false;
989#endif
990  }
991
992#ifdef ___DO_STATISTICS___
993  if (allson_noson) {
994    total_sonbuf_allnoson_[level] += 1;
995    total_node_in_sonbuf_allnoson_[level] += parent_son_num;
996  }
997#endif
998
999  assert(son_pos + 1 == parent_son_num);
1000  return true;
1001}
1002
1003#ifdef ___DO_STATISTICS___
1004void DictBuilder::stat_init() {
1005  memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
1006  memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
1007  memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize);
1008  memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize);
1009  memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize);
1010  memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
1011  memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
1012  memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize);
1013
1014  sonbufs_num1_ = 0;
1015  sonbufs_numgt1_ = 0;
1016  total_lma_node_num_ = 0;
1017}
1018
1019void DictBuilder::stat_print() {
1020  printf("\n------------STAT INFO-------------\n");
1021  printf("[root is layer -1]\n");
1022  printf(".. max_sonbuf_len per layer(from layer 0):\n   ");
1023  for (size_t i = 0; i < kMaxLemmaSize; i++)
1024    printf("%d, ", max_sonbuf_len_[i]);
1025  printf("-, \n");
1026
1027  printf(".. max_homobuf_len per layer:\n   -, ");
1028  for (size_t i = 0; i < kMaxLemmaSize; i++)
1029    printf("%d, ", max_homobuf_len_[i]);
1030  printf("\n");
1031
1032  printf(".. total_son_num per layer:\n   ");
1033  for (size_t i = 0; i < kMaxLemmaSize; i++)
1034    printf("%d, ", total_son_num_[i]);
1035  printf("-, \n");
1036
1037  printf(".. total_node_hasson per layer:\n   1, ");
1038  for (size_t i = 0; i < kMaxLemmaSize; i++)
1039    printf("%d, ", total_node_hasson_[i]);
1040  printf("\n");
1041
1042  printf(".. total_sonbuf_num per layer:\n   ");
1043  for (size_t i = 0; i < kMaxLemmaSize; i++)
1044    printf("%d, ", total_sonbuf_num_[i]);
1045  printf("-, \n");
1046
1047  printf(".. total_sonbuf_allnoson per layer:\n   ");
1048  for (size_t i = 0; i < kMaxLemmaSize; i++)
1049    printf("%d, ", total_sonbuf_allnoson_[i]);
1050  printf("-, \n");
1051
1052  printf(".. total_node_in_sonbuf_allnoson per layer:\n   ");
1053  for (size_t i = 0; i < kMaxLemmaSize; i++)
1054    printf("%d, ", total_node_in_sonbuf_allnoson_[i]);
1055  printf("-, \n");
1056
1057  printf(".. total_homo_num per layer:\n   0, ");
1058  for (size_t i = 0; i < kMaxLemmaSize; i++)
1059    printf("%d, ", total_homo_num_[i]);
1060  printf("\n");
1061
1062  printf(".. son buf allocation number with only 1 son: %d\n", sonbufs_num1_);
1063  printf(".. son buf allocation number with more than 1 son: %d\n",
1064         sonbufs_numgt1_);
1065  printf(".. total lemma node number: %d\n", total_lma_node_num_ + 1);
1066}
1067#endif  // ___DO_STATISTICS___
1068
1069#endif  // ___BUILD_MODEL___
1070}  // namespace ime_pinyin
1071