1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "../include/userdict.h"
18#include "../include/splparser.h"
19#include "../include/ngram.h"
20#include <stdio.h>
21#include <string.h>
22#include <stdlib.h>
23#include <cutils/log.h>
24#include <unistd.h>
25#include <fcntl.h>
26#include <sys/stat.h>
27#include <assert.h>
28#include <ctype.h>
29#include <sys/types.h>
30#include <sys/time.h>
31#include <time.h>
32#include <pthread.h>
33#include <math.h>
34
35namespace ime_pinyin {
36
37#ifdef ___DEBUG_PERF___
38static uint64 _ellapse_ = 0;
39static struct timeval _tv_start_, _tv_end_;
40#define DEBUG_PERF_BEGIN \
41    do { \
42      gettimeofday(&_tv_start_, NULL); \
43    } while(0)
44#define DEBUG_PERF_END \
45    do { \
46      gettimeofday(&_tv_end_, NULL); \
47      _ellapse_ = (_tv_end_.tv_sec - _tv_start_.tv_sec) * 1000000 + \
48                  (_tv_end_.tv_usec - _tv_start_.tv_usec); \
49    } while(0)
50#define LOGD_PERF(message) \
51    ALOGD("PERFORMANCE[%s] %llu usec.", message, _ellapse_);
52#else
53#define DEBUG_PERF_BEGIN
54#define DEBUG_PERF_END
55#define LOGD_PERF(message)
56#endif
57
58// XXX File load and write are thread-safe by g_mutex_
59static pthread_mutex_t g_mutex_ = PTHREAD_MUTEX_INITIALIZER;
60static struct timeval g_last_update_ = {0, 0};
61
62inline uint32 UserDict::get_dict_file_size(UserDictInfo * info) {
63  return (4 + info->lemma_size + (info->lemma_count << 3)
64#ifdef ___PREDICT_ENABLED___
65          + (info->lemma_count << 2)
66#endif
67#ifdef ___SYNC_ENABLED___
68          + (info->sync_count << 2)
69#endif
70          + sizeof(*info));
71}
72
73inline LmaScoreType UserDict::translate_score(int raw_score) {
74  // 1) ori_freq: original user frequency
75  uint32 ori_freq = extract_score_freq(raw_score);
76  // 2) lmt_off: lmt index (week offset for example)
77  uint64 lmt_off = ((raw_score & 0xffff0000) >> 16);
78  if (kUserDictLMTBitWidth < 16) {
79    uint64 mask = ~(1 << kUserDictLMTBitWidth);
80    lmt_off &= mask;
81  }
82  // 3) now_off: current time index (current week offset for example)
83  // assuming load_time_ is around current time
84  uint64 now_off = load_time_.tv_sec;
85  now_off = (now_off - kUserDictLMTSince) / kUserDictLMTGranularity;
86  now_off = (now_off << (64 - kUserDictLMTBitWidth));
87  now_off = (now_off >> (64 - kUserDictLMTBitWidth));
88  // 4) factor: decide expand-factor
89  int delta = now_off - lmt_off;
90  if (delta > 4)
91    delta = 4;
92  int factor = 80 - (delta << 4);
93
94  double tf = (double)(dict_info_.total_nfreq + total_other_nfreq_);
95  return (LmaScoreType)(log((double)factor * (double)ori_freq / tf)
96                        * NGram::kLogValueAmplifier);
97}
98
99inline int UserDict::extract_score_freq(int raw_score) {
100  // Frequence stored in lowest 16 bits
101  int freq = (raw_score & 0x0000ffff);
102  return freq;
103}
104
105inline uint64 UserDict::extract_score_lmt(int raw_score) {
106  uint64 lmt = ((raw_score & 0xffff0000) >> 16);
107  if (kUserDictLMTBitWidth < 16) {
108    uint64 mask = ~(1 << kUserDictLMTBitWidth);
109    lmt &= mask;
110  }
111  lmt = lmt * kUserDictLMTGranularity + kUserDictLMTSince;
112  return lmt;
113}
114
115inline int UserDict::build_score(uint64 lmt, int freq) {
116  lmt = (lmt - kUserDictLMTSince) / kUserDictLMTGranularity;
117  lmt = (lmt << (64 - kUserDictLMTBitWidth));
118  lmt = (lmt >> (64 - kUserDictLMTBitWidth));
119  uint16 lmt16 = (uint16)lmt;
120  int s = freq;
121  s &= 0x0000ffff;
122  s = (lmt16 << 16) | s;
123  return s;
124}
125
126inline int64 UserDict::utf16le_atoll(uint16 *s, int len) {
127  int64 ret = 0;
128  if (len <= 0)
129    return ret;
130
131  int flag = 1;
132  const uint16 * endp = s + len;
133  if (*s == '-') {
134    flag = -1;
135    s++;
136  } else if (*s == '+') {
137    s++;
138  }
139
140  while (*s >= '0' && *s <= '9' && s < endp) {
141    ret += ret * 10 + (*s) - '0';
142    s++;
143  }
144  return ret * flag;
145}
146
147inline int UserDict::utf16le_lltoa(int64 v, uint16 *s, int size) {
148  if (!s || size <= 0)
149    return 0;
150  uint16 *endp = s + size;
151  int ret_len = 0;
152  if (v < 0) {
153    *(s++) = '-';
154    ++ret_len;
155    v *= -1;
156  }
157
158  uint16 *b = s;
159  while (s < endp && v != 0) {
160    *(s++) = '0' + (v % 10);
161    v = v / 10;
162    ++ret_len;
163  }
164
165  if (v != 0)
166    return 0;
167
168  --s;
169
170  while (b < s) {
171    *b = *s;
172    ++b, --s;
173  }
174
175  return ret_len;
176}
177
178inline void UserDict::set_lemma_flag(uint32 offset, uint8 flag) {
179  offset &= kUserDictOffsetMask;
180  lemmas_[offset] |= flag;
181}
182
183inline char UserDict::get_lemma_flag(uint32 offset) {
184  offset &= kUserDictOffsetMask;
185  return (char)(lemmas_[offset]);
186}
187
188inline char UserDict::get_lemma_nchar(uint32 offset) {
189  offset &= kUserDictOffsetMask;
190  return (char)(lemmas_[offset + 1]);
191}
192
193inline uint16 * UserDict::get_lemma_spell_ids(uint32 offset) {
194  offset &= kUserDictOffsetMask;
195  return (uint16 *)(lemmas_ + offset + 2);
196}
197
198inline uint16 * UserDict::get_lemma_word(uint32 offset) {
199  offset &= kUserDictOffsetMask;
200  uint8 nchar = get_lemma_nchar(offset);
201  return (uint16 *)(lemmas_ + offset + 2 + (nchar << 1));
202}
203
204inline LemmaIdType UserDict::get_max_lemma_id() {
205  // When a lemma is deleted, we don't not claim its id back for
206  // simplicity and performance
207  return start_id_ + dict_info_.lemma_count - 1;
208}
209
210inline bool UserDict::is_valid_lemma_id(LemmaIdType id) {
211  if (id >= start_id_ && id <= get_max_lemma_id())
212    return true;
213  return false;
214}
215
216inline bool UserDict::is_valid_state() {
217  if (state_ == USER_DICT_NONE)
218    return false;
219  return true;
220}
221
222UserDict::UserDict()
223    : start_id_(0),
224      version_(0),
225      lemmas_(NULL),
226      offsets_(NULL),
227      scores_(NULL),
228      ids_(NULL),
229#ifdef ___PREDICT_ENABLED___
230      predicts_(NULL),
231#endif
232#ifdef ___SYNC_ENABLED___
233      syncs_(NULL),
234      sync_count_size_(0),
235#endif
236      offsets_by_id_(NULL),
237      lemma_count_left_(0),
238      lemma_size_left_(0),
239      dict_file_(NULL),
240      state_(USER_DICT_NONE) {
241  memset(&dict_info_, 0, sizeof(dict_info_));
242  memset(&load_time_, 0, sizeof(load_time_));
243#ifdef ___CACHE_ENABLED___
244  cache_init();
245#endif
246}
247
248UserDict::~UserDict() {
249  close_dict();
250}
251
252bool UserDict::load_dict(const char *file_name, LemmaIdType start_id,
253                         LemmaIdType end_id) {
254#ifdef ___DEBUG_PERF___
255  DEBUG_PERF_BEGIN;
256#endif
257  dict_file_ = strdup(file_name);
258  if (!dict_file_)
259    return false;
260
261  start_id_ = start_id;
262
263  if (false == validate(file_name) && false == reset(file_name)) {
264    goto error;
265  }
266  if (false == load(file_name, start_id)) {
267    goto error;
268  }
269
270  state_ = USER_DICT_SYNC;
271
272  gettimeofday(&load_time_, NULL);
273
274#ifdef ___DEBUG_PERF___
275  DEBUG_PERF_END;
276  LOGD_PERF("load_dict");
277#endif
278  return true;
279 error:
280  free((void*)dict_file_);
281  start_id_ = 0;
282  return false;
283}
284
285bool UserDict::close_dict() {
286  if (state_ == USER_DICT_NONE)
287    return true;
288  if (state_ == USER_DICT_SYNC)
289    goto out;
290
291  // If dictionary is written back by others,
292  // we can not simply write back here
293  // To do a safe flush, we have to discard all newly added
294  // lemmas and try to reload dict file.
295  pthread_mutex_lock(&g_mutex_);
296  if (load_time_.tv_sec > g_last_update_.tv_sec ||
297    (load_time_.tv_sec == g_last_update_.tv_sec &&
298     load_time_.tv_usec > g_last_update_.tv_usec)) {
299    write_back();
300    gettimeofday(&g_last_update_, NULL);
301  }
302  pthread_mutex_unlock(&g_mutex_);
303
304 out:
305  free((void*)dict_file_);
306  free(lemmas_);
307  free(offsets_);
308  free(offsets_by_id_);
309  free(scores_);
310  free(ids_);
311#ifdef ___PREDICT_ENABLED___
312  free(predicts_);
313#endif
314
315  version_ = 0;
316  dict_file_ = NULL;
317  lemmas_ = NULL;
318#ifdef ___SYNC_ENABLED___
319  syncs_ = NULL;
320  sync_count_size_ = 0;
321#endif
322  offsets_ = NULL;
323  offsets_by_id_ = NULL;
324  scores_ = NULL;
325  ids_ = NULL;
326#ifdef ___PREDICT_ENABLED___
327  predicts_ = NULL;
328#endif
329
330  memset(&dict_info_, 0, sizeof(dict_info_));
331  lemma_count_left_ = 0;
332  lemma_size_left_ = 0;
333  state_ = USER_DICT_NONE;
334
335  return true;
336}
337
338size_t UserDict::number_of_lemmas() {
339  return dict_info_.lemma_count;
340}
341
342void UserDict::reset_milestones(uint16 from_step, MileStoneHandle from_handle) {
343  return;
344}
345
346MileStoneHandle UserDict::extend_dict(MileStoneHandle from_handle,
347                                      const DictExtPara *dep,
348                                      LmaPsbItem *lpi_items,
349                                      size_t lpi_max, size_t *lpi_num) {
350  if (is_valid_state() == false)
351    return 0;
352
353  bool need_extend = false;
354
355#ifdef ___DEBUG_PERF___
356  DEBUG_PERF_BEGIN;
357#endif
358  *lpi_num = _get_lpis(dep->splids, dep->splids_extended + 1,
359                       lpi_items, lpi_max, &need_extend);
360#ifdef ___DEBUG_PERF___
361  DEBUG_PERF_END;
362  LOGD_PERF("extend_dict");
363#endif
364  return ((*lpi_num > 0 || need_extend) ? 1 : 0);
365}
366
367int UserDict::is_fuzzy_prefix_spell_id(
368    const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
369  if (len1 < searchable->splids_len)
370    return 0;
371
372  SpellingTrie &spl_trie = SpellingTrie::get_instance();
373  uint32 i = 0;
374  for (i = 0; i < searchable->splids_len; i++) {
375    const char py1 = *spl_trie.get_spelling_str(id1[i]);
376    uint16 off = 8 * (i % 4);
377    const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
378    if (py1 == py2)
379      continue;
380    return 0;
381  }
382  return 1;
383}
384
385int UserDict::fuzzy_compare_spell_id(
386    const uint16 * id1, uint16 len1, const UserDictSearchable *searchable) {
387  if (len1 < searchable->splids_len)
388    return -1;
389  if (len1 > searchable->splids_len)
390    return 1;
391
392  SpellingTrie &spl_trie = SpellingTrie::get_instance();
393  uint32 i = 0;
394  for (i = 0; i < len1; i++) {
395    const char py1 = *spl_trie.get_spelling_str(id1[i]);
396    uint16 off = 8 * (i % 4);
397    const char py2 = ((searchable->signature[i/4] & (0xff << off)) >> off);
398    if (py1 == py2)
399      continue;
400    if (py1 > py2)
401      return 1;
402    return -1;
403  }
404  return 0;
405}
406
407bool UserDict::is_prefix_spell_id(
408    const uint16 * fullids, uint16 fulllen,
409    const UserDictSearchable *searchable) {
410  if (fulllen < searchable->splids_len)
411    return false;
412
413  uint32 i = 0;
414  for (; i < searchable->splids_len; i++) {
415    uint16 start_id = searchable->splid_start[i];
416    uint16 count = searchable->splid_count[i];
417    if (fullids[i] >= start_id && fullids[i] < start_id + count)
418      continue;
419    else
420      return false;
421  }
422  return true;
423}
424
425bool UserDict::equal_spell_id(
426    const uint16 * fullids, uint16 fulllen,
427    const UserDictSearchable *searchable) {
428  if (fulllen != searchable->splids_len)
429    return false;
430
431  uint32 i = 0;
432  for (; i < fulllen; i++) {
433    uint16 start_id = searchable->splid_start[i];
434    uint16 count = searchable->splid_count[i];
435    if (fullids[i] >= start_id && fullids[i] < start_id + count)
436      continue;
437    else
438      return false;
439  }
440  return true;
441}
442
443int32 UserDict::locate_first_in_offsets(const UserDictSearchable * searchable) {
444  int32 begin = 0;
445  int32 end = dict_info_.lemma_count - 1;
446  int32 middle = -1;
447
448  int32 first_prefix = middle;
449  int32 last_matched = middle;
450
451  while (begin <= end) {
452    middle = (begin + end) >> 1;
453    uint32 offset = offsets_[middle];
454    uint8 nchar = get_lemma_nchar(offset);
455    const uint16 * splids = get_lemma_spell_ids(offset);
456    int cmp = fuzzy_compare_spell_id(splids, nchar, searchable);
457    int pre = is_fuzzy_prefix_spell_id(splids, nchar, searchable);
458
459    if (pre)
460      first_prefix = middle;
461
462    if (cmp < 0) {
463      begin = middle + 1;
464    } else if (cmp > 0) {
465      end = middle - 1;
466    } else {
467      end = middle - 1;
468      last_matched = middle;
469    }
470  }
471
472  return first_prefix;
473}
474
475void UserDict::prepare_locate(UserDictSearchable *searchable,
476                             const uint16 *splid_str,
477                             uint16 splid_str_len) {
478  searchable->splids_len = splid_str_len;
479  memset(searchable->signature, 0, sizeof(searchable->signature));
480
481  SpellingTrie &spl_trie = SpellingTrie::get_instance();
482  uint32 i = 0;
483  for (; i < splid_str_len; i++) {
484    if (spl_trie.is_half_id(splid_str[i])) {
485      searchable->splid_count[i] =
486          spl_trie.half_to_full(splid_str[i],
487                                &(searchable->splid_start[i]));
488    } else {
489      searchable->splid_count[i] = 1;
490      searchable->splid_start[i] = splid_str[i];
491    }
492    const unsigned char py = *spl_trie.get_spelling_str(splid_str[i]);
493    searchable->signature[i>>2] |= (py << (8 * (i % 4)));
494  }
495}
496
497size_t UserDict::get_lpis(const uint16 *splid_str, uint16 splid_str_len,
498                          LmaPsbItem *lpi_items, size_t lpi_max) {
499  return _get_lpis(splid_str, splid_str_len, lpi_items, lpi_max, NULL);
500}
501
502size_t UserDict::_get_lpis(const uint16 *splid_str,
503                           uint16 splid_str_len, LmaPsbItem *lpi_items,
504                           size_t lpi_max, bool * need_extend) {
505  bool tmp_extend;
506  if (!need_extend)
507    need_extend = &tmp_extend;
508
509  *need_extend = false;
510
511  if (is_valid_state() == false)
512    return 0;
513  if (lpi_max <= 0)
514    return 0;
515
516  if (0 == pthread_mutex_trylock(&g_mutex_)) {
517    if (load_time_.tv_sec < g_last_update_.tv_sec ||
518      (load_time_.tv_sec == g_last_update_.tv_sec &&
519       load_time_.tv_usec < g_last_update_.tv_usec)) {
520      // Others updated disk file, have to reload
521      pthread_mutex_unlock(&g_mutex_);
522      flush_cache();
523    } else {
524      pthread_mutex_unlock(&g_mutex_);
525    }
526  } else {
527  }
528
529  UserDictSearchable searchable;
530  prepare_locate(&searchable, splid_str, splid_str_len);
531
532  uint32 max_off = dict_info_.lemma_count;
533#ifdef ___CACHE_ENABLED___
534  int32 middle;
535  uint32 start, count;
536  bool cached = cache_hit(&searchable, &start, &count);
537  if (cached) {
538    middle = start;
539    max_off = start + count;
540  } else {
541    middle = locate_first_in_offsets(&searchable);
542    start = middle;
543  }
544#else
545  int32 middle = locate_first_in_offsets(&searchable);
546#endif
547
548  if (middle == -1) {
549#ifdef ___CACHE_ENABLED___
550    if (!cached)
551      cache_push(USER_DICT_MISS_CACHE, &searchable, 0, 0);
552#endif
553    return 0;
554  }
555
556  size_t lpi_current = 0;
557
558  bool fuzzy_break = false;
559  bool prefix_break = false;
560  while ((size_t)middle < max_off && !fuzzy_break && !prefix_break) {
561    if (lpi_current >= lpi_max)
562      break;
563    uint32 offset = offsets_[middle];
564    // Ignore deleted lemmas
565    if (offset & kUserDictOffsetFlagRemove) {
566      middle++;
567      continue;
568    }
569    uint8 nchar = get_lemma_nchar(offset);
570    uint16 * splids = get_lemma_spell_ids(offset);
571#ifdef ___CACHE_ENABLED___
572    if (!cached && 0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
573#else
574    if (0 != fuzzy_compare_spell_id(splids, nchar, &searchable)) {
575#endif
576      fuzzy_break = true;
577    }
578
579    if (prefix_break == false) {
580      if (is_fuzzy_prefix_spell_id(splids, nchar, &searchable)) {
581        if (*need_extend == false &&
582            is_prefix_spell_id(splids, nchar, &searchable)) {
583          *need_extend = true;
584        }
585      } else {
586        prefix_break = true;
587      }
588    }
589
590    if (equal_spell_id(splids, nchar, &searchable) == true) {
591      lpi_items[lpi_current].psb = translate_score(scores_[middle]);
592      lpi_items[lpi_current].id = ids_[middle];
593      lpi_items[lpi_current].lma_len = nchar;
594      lpi_current++;
595    }
596    middle++;
597  }
598
599#ifdef ___CACHE_ENABLED___
600  if (!cached) {
601    count = middle - start;
602    cache_push(USER_DICT_CACHE, &searchable, start, count);
603  }
604#endif
605
606  return lpi_current;
607}
608
609uint16 UserDict::get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
610                               uint16 str_max) {
611  if (is_valid_state() == false)
612    return 0;
613  if (is_valid_lemma_id(id_lemma) == false)
614    return 0;
615  uint32 offset = offsets_by_id_[id_lemma - start_id_];
616  uint8 nchar = get_lemma_nchar(offset);
617  char16 * str = get_lemma_word(offset);
618  uint16 m = nchar < str_max -1 ? nchar : str_max - 1;
619  int i = 0;
620  for (; i < m; i++) {
621    str_buf[i] = str[i];
622  }
623  str_buf[i] = 0;
624  return m;
625}
626
627uint16 UserDict::get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
628                                  uint16 splids_max, bool arg_valid) {
629  if (is_valid_lemma_id(id_lemma) == false)
630    return 0;
631  uint32 offset = offsets_by_id_[id_lemma - start_id_];
632  uint8 nchar = get_lemma_nchar(offset);
633  const uint16 * ids = get_lemma_spell_ids(offset);
634  int i = 0;
635  for (; i < nchar && i < splids_max; i++)
636    splids[i] = ids[i];
637  return i;
638}
639
640size_t UserDict::predict(const char16 last_hzs[], uint16 hzs_len,
641                         NPredictItem *npre_items, size_t npre_max,
642                         size_t b4_used) {
643  uint32 new_added = 0;
644#ifdef ___PREDICT_ENABLED___
645  int32 end = dict_info_.lemma_count - 1;
646  int j = locate_first_in_predicts((const uint16*)last_hzs, hzs_len);
647  if (j == -1)
648    return 0;
649
650  while (j <= end) {
651    uint32 offset = predicts_[j];
652    // Ignore deleted lemmas
653    if (offset & kUserDictOffsetFlagRemove) {
654      j++;
655      continue;
656    }
657    uint32 nchar = get_lemma_nchar(offset);
658    uint16 * words = get_lemma_word(offset);
659    uint16 * splids = get_lemma_spell_ids(offset);
660
661    if (nchar <= hzs_len) {
662      j++;
663      continue;
664    }
665
666    if (memcmp(words, last_hzs, hzs_len << 1) == 0) {
667      if (new_added >= npre_max) {
668        return new_added;
669      }
670      uint32 cpy_len =
671          (nchar < kMaxPredictSize ? (nchar << 1) : (kMaxPredictSize << 1))
672          - (hzs_len << 1);
673      npre_items[new_added].his_len = hzs_len;
674      npre_items[new_added].psb = get_lemma_score(words, splids, nchar);
675      memcpy(npre_items[new_added].pre_hzs, words + hzs_len, cpy_len);
676      if ((cpy_len >> 1) < kMaxPredictSize) {
677        npre_items[new_added].pre_hzs[cpy_len >> 1] = 0;
678      }
679      new_added++;
680    } else {
681      break;
682    }
683
684    j++;
685  }
686#endif
687  return new_added;
688}
689
690int32 UserDict::locate_in_offsets(char16 lemma_str[], uint16 splid_str[],
691                                  uint16 lemma_len) {
692  int32 max_off = dict_info_.lemma_count;
693
694  UserDictSearchable searchable;
695  prepare_locate(&searchable, splid_str, lemma_len);
696#ifdef ___CACHE_ENABLED___
697  int32 off;
698  uint32 start, count;
699  bool cached = load_cache(&searchable, &start, &count);
700  if (cached) {
701    off = start;
702    max_off = start + count;
703  } else {
704    off = locate_first_in_offsets(&searchable);
705    start = off;
706  }
707#else
708  int32 off = locate_first_in_offsets(&searchable);
709#endif
710
711  if (off == -1) {
712    return off;
713  }
714
715  while (off < max_off) {
716    uint32 offset = offsets_[off];
717    if (offset & kUserDictOffsetFlagRemove) {
718      off++;
719      continue;
720    }
721    uint16 * splids = get_lemma_spell_ids(offset);
722#ifdef ___CACHE_ENABLED___
723    if (!cached && 0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
724      break;
725#else
726    if (0 != fuzzy_compare_spell_id(splids, lemma_len, &searchable))
727      break;
728#endif
729    if (equal_spell_id(splids, lemma_len, &searchable) == true) {
730      uint16 * str = get_lemma_word(offset);
731      uint32 i = 0;
732      for (i = 0; i < lemma_len; i++) {
733        if (str[i] == lemma_str[i])
734          continue;
735        break;
736      }
737      if (i < lemma_len) {
738        off++;
739        continue;
740      }
741#ifdef ___CACHE_ENABLED___
742      // No need to save_cache here, since current function is invoked by
743      // put_lemma. It's rarely possible for a user input same lemma twice.
744      // That means first time user type a new lemma, it is newly added into
745      // user dictionary, then it's possible that user type the same lemma
746      // again.
747      // Another reason save_cache can not be invoked here is this function
748      // aborts when lemma is found, and it never knows the count.
749#endif
750      return off;
751    }
752    off++;
753  }
754
755  return -1;
756}
757
758#ifdef ___PREDICT_ENABLED___
759uint32 UserDict::locate_where_to_insert_in_predicts(
760    const uint16 * words, int lemma_len) {
761  int32 begin = 0;
762  int32 end = dict_info_.lemma_count - 1;
763  int32 middle = end;
764
765  uint32 last_matched = middle;
766
767  while (begin <= end) {
768    middle = (begin + end) >> 1;
769    uint32 offset = offsets_[middle];
770    uint8 nchar = get_lemma_nchar(offset);
771    const uint16 * ws = get_lemma_word(offset);
772
773    uint32 minl = nchar < lemma_len ? nchar : lemma_len;
774    uint32 k = 0;
775    int cmp = 0;
776
777    for (; k < minl; k++) {
778      if (ws[k] < words[k]) {
779        cmp = -1;
780        break;
781      } else if (ws[k] > words[k]) {
782        cmp = 1;
783        break;
784      }
785    }
786    if (cmp == 0) {
787      if (nchar < lemma_len)
788        cmp = -1;
789      else if (nchar > lemma_len)
790        cmp = 1;
791    }
792
793    if (cmp < 0) {
794      begin = middle + 1;
795      last_matched = middle;
796    } else if (cmp > 0) {
797      end = middle - 1;
798    } else {
799      end = middle - 1;
800      last_matched = middle;
801    }
802  }
803
804  return last_matched;
805}
806
807int32 UserDict::locate_first_in_predicts(const uint16 * words, int lemma_len) {
808  int32 begin = 0;
809  int32 end = dict_info_.lemma_count - 1;
810  int32 middle = -1;
811
812  int32 last_matched = middle;
813
814  while (begin <= end) {
815    middle = (begin + end) >> 1;
816    uint32 offset = offsets_[middle];
817    uint8 nchar = get_lemma_nchar(offset);
818    const uint16 * ws = get_lemma_word(offset);
819
820    uint32 minl = nchar < lemma_len ? nchar : lemma_len;
821    uint32 k = 0;
822    int cmp = 0;
823
824    for (; k < minl; k++) {
825      if (ws[k] < words[k]) {
826        cmp = -1;
827        break;
828      } else if (ws[k] > words[k]) {
829        cmp = 1;
830        break;
831      }
832    }
833    if (cmp == 0) {
834      if (nchar >= lemma_len)
835        last_matched = middle;
836      if (nchar < lemma_len)
837        cmp = -1;
838      else if (nchar > lemma_len)
839        cmp = 1;
840    }
841
842    if (cmp < 0) {
843      begin = middle + 1;
844    } else if (cmp > 0) {
845      end = middle - 1;
846    } else {
847      end = middle - 1;
848    }
849  }
850
851  return last_matched;
852}
853
854#endif
855
856LemmaIdType UserDict::get_lemma_id(char16 lemma_str[], uint16 splids[],
857                                   uint16 lemma_len) {
858  int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
859  if (off == -1) {
860    return 0;
861  }
862
863  return ids_[off];
864}
865
866LmaScoreType UserDict::get_lemma_score(LemmaIdType lemma_id) {
867  if (is_valid_state() == false)
868    return 0;
869  if (is_valid_lemma_id(lemma_id) == false)
870    return 0;
871
872  return translate_score(_get_lemma_score(lemma_id));
873}
874
875LmaScoreType UserDict::get_lemma_score(char16 lemma_str[], uint16 splids[],
876                                uint16 lemma_len) {
877  if (is_valid_state() == false)
878    return 0;
879  return translate_score(_get_lemma_score(lemma_str, splids, lemma_len));
880}
881
882int UserDict::_get_lemma_score(LemmaIdType lemma_id) {
883  if (is_valid_state() == false)
884    return 0;
885  if (is_valid_lemma_id(lemma_id) == false)
886    return 0;
887
888  uint32 offset = offsets_by_id_[lemma_id - start_id_];
889
890  uint32 nchar = get_lemma_nchar(offset);
891  uint16 * spl = get_lemma_spell_ids(offset);
892  uint16 * wrd = get_lemma_word(offset);
893
894  int32 off = locate_in_offsets(wrd, spl, nchar);
895  if (off == -1) {
896    return 0;
897  }
898
899  return scores_[off];
900}
901
902int UserDict::_get_lemma_score(char16 lemma_str[], uint16 splids[],
903                                uint16 lemma_len) {
904  if (is_valid_state() == false)
905    return 0;
906
907  int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
908  if (off == -1) {
909    return 0;
910  }
911
912  return scores_[off];
913}
914
915#ifdef ___SYNC_ENABLED___
916void UserDict::remove_lemma_from_sync_list(uint32 offset) {
917  offset &= kUserDictOffsetMask;
918  uint32 i = 0;
919  for (; i < dict_info_.sync_count; i++) {
920    unsigned int off = (syncs_[i] & kUserDictOffsetMask);
921    if (off == offset)
922      break;
923  }
924  if (i < dict_info_.sync_count) {
925    syncs_[i] = syncs_[dict_info_.sync_count - 1];
926    dict_info_.sync_count--;
927  }
928}
929#endif
930
931#ifdef ___PREDICT_ENABLED___
932void UserDict::remove_lemma_from_predict_list(uint32 offset) {
933  offset &= kUserDictOffsetMask;
934  uint32 i = 0;
935  for (; i < dict_info_.lemma_count; i++) {
936    unsigned int off = (predicts_[i] & kUserDictOffsetMask);
937    if (off == offset) {
938      predicts_[i] |= kUserDictOffsetFlagRemove;
939      break;
940    }
941  }
942}
943#endif
944
945bool UserDict::remove_lemma_by_offset_index(int offset_index) {
946  if (is_valid_state() == false)
947    return 0;
948
949  int32 off = offset_index;
950  if (off == -1) {
951    return false;
952  }
953
954  uint32 offset = offsets_[off];
955  uint32 nchar = get_lemma_nchar(offset);
956
957  offsets_[off] |= kUserDictOffsetFlagRemove;
958
959#ifdef ___SYNC_ENABLED___
960  // Remove corresponding sync item
961  remove_lemma_from_sync_list(offset);
962#endif
963
964#ifdef ___PREDICT_ENABLED___
965  remove_lemma_from_predict_list(offset);
966#endif
967  dict_info_.free_count++;
968  dict_info_.free_size += (2 + (nchar << 2));
969
970  if (state_ < USER_DICT_OFFSET_DIRTY)
971    state_ = USER_DICT_OFFSET_DIRTY;
972  return true;
973}
974
975bool UserDict::remove_lemma(LemmaIdType lemma_id) {
976  if (is_valid_state() == false)
977    return 0;
978  if (is_valid_lemma_id(lemma_id) == false)
979    return false;
980  uint32 offset = offsets_by_id_[lemma_id - start_id_];
981
982  uint32 nchar = get_lemma_nchar(offset);
983  uint16 * spl = get_lemma_spell_ids(offset);
984  uint16 * wrd = get_lemma_word(offset);
985
986  int32 off = locate_in_offsets(wrd, spl, nchar);
987
988  return remove_lemma_by_offset_index(off);
989}
990
991void UserDict::flush_cache() {
992  LemmaIdType start_id = start_id_;
993  const char * file = strdup(dict_file_);
994  if (!file)
995    return;
996  close_dict();
997  load_dict(file, start_id, kUserDictIdEnd);
998  free((void*)file);
999#ifdef ___CACHE_ENABLED___
1000  cache_init();
1001#endif
1002  return;
1003}
1004
1005bool UserDict::reset(const char *file) {
1006  FILE *fp = fopen(file, "w+");
1007  if (!fp) {
1008    return false;
1009  }
1010  uint32 version = kUserDictVersion;
1011  size_t wred = fwrite(&version, 1, 4, fp);
1012  UserDictInfo info;
1013  memset(&info, 0, sizeof(info));
1014  // By default, no limitation for lemma count and size
1015  // thereby, reclaim_ratio is never used
1016  wred += fwrite(&info, 1, sizeof(info), fp);
1017  if (wred != sizeof(info) + sizeof(version)) {
1018    fclose(fp);
1019    unlink(file);
1020    return false;
1021  }
1022  fclose(fp);
1023  return true;
1024}
1025
1026bool UserDict::validate(const char *file) {
1027  // b is ignored in POSIX compatible os including Linux
1028  // while b is important flag for Windows to specify binary mode
1029  FILE *fp = fopen(file, "rb");
1030  if (!fp) {
1031    return false;
1032  }
1033
1034  size_t size;
1035  size_t readed;
1036  uint32 version;
1037  UserDictInfo dict_info;
1038
1039  // validate
1040  int err = fseek(fp, 0, SEEK_END);
1041  if (err) {
1042    goto error;
1043  }
1044
1045  size = ftell(fp);
1046  if (size < 4 + sizeof(dict_info)) {
1047    goto error;
1048  }
1049
1050  err = fseek(fp, 0, SEEK_SET);
1051  if (err) {
1052    goto error;
1053  }
1054
1055  readed = fread(&version, 1, sizeof(version), fp);
1056  if (readed < sizeof(version)) {
1057    goto error;
1058  }
1059  if (version != kUserDictVersion) {
1060    goto error;
1061  }
1062
1063  err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1064  if (err) {
1065    goto error;
1066  }
1067
1068  readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1069  if (readed != sizeof(dict_info)) {
1070    goto error;
1071  }
1072
1073  if (size != get_dict_file_size(&dict_info)) {
1074    goto error;
1075  }
1076
1077  fclose(fp);
1078  return true;
1079
1080 error:
1081  fclose(fp);
1082  return false;
1083}
1084
1085bool UserDict::load(const char *file, LemmaIdType start_id) {
1086  if (0 != pthread_mutex_trylock(&g_mutex_)) {
1087    return false;
1088  }
1089  // b is ignored in POSIX compatible os including Linux
1090  // while b is important flag for Windows to specify binary mode
1091  FILE *fp = fopen(file, "rb");
1092  if (!fp) {
1093    pthread_mutex_unlock(&g_mutex_);
1094    return false;
1095  }
1096
1097  size_t readed, toread;
1098  UserDictInfo dict_info;
1099  uint8 *lemmas = NULL;
1100  uint32 *offsets = NULL;
1101#ifdef ___SYNC_ENABLED___
1102  uint32 *syncs = NULL;
1103#endif
1104  uint32 *scores = NULL;
1105  uint32 *ids = NULL;
1106  uint32 *offsets_by_id = NULL;
1107#ifdef ___PREDICT_ENABLED___
1108  uint32 *predicts = NULL;
1109#endif
1110  size_t i;
1111  int err;
1112
1113  err = fseek(fp, -1 * sizeof(dict_info), SEEK_END);
1114  if (err) goto error;
1115
1116  readed = fread(&dict_info, 1, sizeof(dict_info), fp);
1117  if (readed != sizeof(dict_info)) goto error;
1118
1119  lemmas = (uint8 *)malloc(
1120      dict_info.lemma_size +
1121      (kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2))));
1122
1123  if (!lemmas) goto error;
1124
1125  offsets = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1126  if (!offsets) goto error;
1127
1128#ifdef ___PREDICT_ENABLED___
1129  predicts = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1130  if (!predicts) goto error;
1131#endif
1132
1133#ifdef ___SYNC_ENABLED___
1134  syncs = (uint32 *)malloc((dict_info.sync_count + kUserDictPreAlloc) << 2);
1135  if (!syncs) goto error;
1136#endif
1137
1138  scores = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1139  if (!scores) goto error;
1140
1141  ids = (uint32 *)malloc((dict_info.lemma_count + kUserDictPreAlloc) << 2);
1142  if (!ids) goto error;
1143
1144  offsets_by_id = (uint32 *)malloc(
1145      (dict_info.lemma_count + kUserDictPreAlloc) << 2);
1146  if (!offsets_by_id) goto error;
1147
1148  err = fseek(fp, 4, SEEK_SET);
1149  if (err) goto error;
1150
1151  readed = 0;
1152  while (readed < dict_info.lemma_size && !ferror(fp) && !feof(fp)) {
1153    readed += fread(lemmas + readed, 1, dict_info.lemma_size - readed, fp);
1154  }
1155  if (readed < dict_info.lemma_size)
1156    goto error;
1157
1158  toread = (dict_info.lemma_count << 2);
1159  readed = 0;
1160  while (readed < toread && !ferror(fp) && !feof(fp)) {
1161    readed += fread((((uint8*)offsets) + readed), 1, toread - readed, fp);
1162  }
1163  if (readed < toread)
1164    goto error;
1165
1166#ifdef ___PREDICT_ENABLED___
1167  toread = (dict_info.lemma_count << 2);
1168  readed = 0;
1169  while (readed < toread && !ferror(fp) && !feof(fp)) {
1170    readed += fread((((uint8*)predicts) + readed), 1, toread - readed, fp);
1171  }
1172  if (readed < toread)
1173    goto error;
1174#endif
1175
1176  readed = 0;
1177  while (readed < toread && !ferror(fp) && !feof(fp)) {
1178    readed += fread((((uint8*)scores) + readed), 1, toread - readed, fp);
1179  }
1180  if (readed < toread)
1181    goto error;
1182
1183#ifdef ___SYNC_ENABLED___
1184  toread = (dict_info.sync_count << 2);
1185  readed = 0;
1186  while (readed < toread && !ferror(fp) && !feof(fp)) {
1187    readed += fread((((uint8*)syncs) + readed), 1, toread - readed, fp);
1188  }
1189  if (readed < toread)
1190    goto error;
1191#endif
1192
1193  for (i = 0; i < dict_info.lemma_count; i++) {
1194    ids[i] = start_id + i;
1195    offsets_by_id[i] = offsets[i];
1196  }
1197
1198  lemmas_ = lemmas;
1199  offsets_ = offsets;
1200#ifdef ___SYNC_ENABLED___
1201  syncs_ = syncs;
1202  sync_count_size_ = dict_info.sync_count + kUserDictPreAlloc;
1203#endif
1204  offsets_by_id_ = offsets_by_id;
1205  scores_ = scores;
1206  ids_ = ids;
1207#ifdef ___PREDICT_ENABLED___
1208  predicts_ = predicts;
1209#endif
1210  lemma_count_left_ = kUserDictPreAlloc;
1211  lemma_size_left_ = kUserDictPreAlloc * (2 + (kUserDictAverageNchar << 2));
1212  memcpy(&dict_info_, &dict_info, sizeof(dict_info));
1213  state_ = USER_DICT_SYNC;
1214
1215  fclose(fp);
1216
1217  pthread_mutex_unlock(&g_mutex_);
1218  return true;
1219
1220 error:
1221  if (lemmas) free(lemmas);
1222  if (offsets) free(offsets);
1223#ifdef ___SYNC_ENABLED___
1224  if (syncs) free(syncs);
1225#endif
1226  if (scores) free(scores);
1227  if (ids) free(ids);
1228  if (offsets_by_id) free(offsets_by_id);
1229#ifdef ___PREDICT_ENABLED___
1230  if (predicts) free(predicts);
1231#endif
1232  fclose(fp);
1233  pthread_mutex_unlock(&g_mutex_);
1234  return false;
1235}
1236
1237void UserDict::write_back() {
1238  // XXX write back is only allowed from close_dict due to thread-safe sake
1239  if (state_ == USER_DICT_NONE || state_ == USER_DICT_SYNC)
1240    return;
1241  int fd = open(dict_file_, O_WRONLY);
1242  if (fd == -1)
1243    return;
1244  switch (state_) {
1245    case USER_DICT_DEFRAGMENTED:
1246      write_back_all(fd);
1247      break;
1248    case USER_DICT_LEMMA_DIRTY:
1249      write_back_lemma(fd);
1250      break;
1251    case USER_DICT_OFFSET_DIRTY:
1252      write_back_offset(fd);
1253      break;
1254    case USER_DICT_SCORE_DIRTY:
1255      write_back_score(fd);
1256      break;
1257#ifdef ___SYNC_ENABLED___
1258    case USER_DICT_SYNC_DIRTY:
1259      write_back_sync(fd);
1260      break;
1261#endif
1262    default:
1263      break;
1264  }
1265  // It seems truncate is not need on Linux, Windows except Mac
1266  // I am doing it here anyway for safety.
1267  off_t cur = lseek(fd, 0, SEEK_CUR);
1268  ftruncate(fd, cur);
1269  close(fd);
1270  state_ = USER_DICT_SYNC;
1271}
1272
1273#ifdef ___SYNC_ENABLED___
1274void UserDict::write_back_sync(int fd) {
1275  int err = lseek(fd, 4 + dict_info_.lemma_size
1276                  + (dict_info_.lemma_count << 3)
1277#ifdef ___PREDICT_ENABLED___
1278                  + (dict_info_.lemma_count << 2)
1279#endif
1280                  , SEEK_SET);
1281  if (err == -1)
1282    return;
1283  write(fd, syncs_, dict_info_.sync_count << 2);
1284  write(fd, &dict_info_, sizeof(dict_info_));
1285}
1286#endif
1287
1288void UserDict::write_back_offset(int fd) {
1289  int err = lseek(fd, 4 + dict_info_.lemma_size, SEEK_SET);
1290  if (err == -1)
1291    return;
1292  write(fd, offsets_, dict_info_.lemma_count << 2);
1293#ifdef ___PREDICT_ENABLED___
1294  write(fd, predicts_, dict_info_.lemma_count << 2);
1295#endif
1296  write(fd, scores_, dict_info_.lemma_count << 2);
1297#ifdef ___SYNC_ENABLED___
1298  write(fd, syncs_, dict_info_.sync_count << 2);
1299#endif
1300  write(fd, &dict_info_, sizeof(dict_info_));
1301}
1302
1303void UserDict::write_back_score(int fd) {
1304  int err = lseek(fd, 4 + dict_info_.lemma_size
1305                  + (dict_info_.lemma_count << 2)
1306#ifdef ___PREDICT_ENABLED___
1307                  + (dict_info_.lemma_count << 2)
1308#endif
1309                  , SEEK_SET);
1310  if (err == -1)
1311    return;
1312  write(fd, scores_, dict_info_.lemma_count << 2);
1313#ifdef ___SYNC_ENABLED___
1314  write(fd, syncs_, dict_info_.sync_count << 2);
1315#endif
1316  write(fd, &dict_info_, sizeof(dict_info_));
1317}
1318
1319void UserDict::write_back_lemma(int fd) {
1320  int err = lseek(fd, 4, SEEK_SET);
1321  if (err == -1)
1322    return;
1323  // New lemmas are always appended, no need to write whole lemma block
1324  size_t need_write = kUserDictPreAlloc *
1325      (2 + (kUserDictAverageNchar << 2)) - lemma_size_left_;
1326  err = lseek(fd, dict_info_.lemma_size - need_write, SEEK_CUR);
1327  if (err == -1)
1328    return;
1329  write(fd, lemmas_ + dict_info_.lemma_size - need_write, need_write);
1330
1331  write(fd, offsets_,  dict_info_.lemma_count << 2);
1332#ifdef ___PREDICT_ENABLED___
1333  write(fd, predicts_,  dict_info_.lemma_count << 2);
1334#endif
1335  write(fd, scores_, dict_info_.lemma_count << 2);
1336#ifdef ___SYNC_ENABLED___
1337  write(fd, syncs_, dict_info_.sync_count << 2);
1338#endif
1339  write(fd, &dict_info_, sizeof(dict_info_));
1340}
1341
1342void UserDict::write_back_all(int fd) {
1343  // XXX lemma_size is handled differently in writeall
1344  // and writelemma. I update lemma_size and lemma_count in different
1345  // places for these two cases. Should fix it to make it consistent.
1346  int err = lseek(fd, 4, SEEK_SET);
1347  if (err == -1)
1348    return;
1349  write(fd, lemmas_, dict_info_.lemma_size);
1350  write(fd, offsets_, dict_info_.lemma_count << 2);
1351#ifdef ___PREDICT_ENABLED___
1352  write(fd, predicts_, dict_info_.lemma_count << 2);
1353#endif
1354  write(fd, scores_, dict_info_.lemma_count << 2);
1355#ifdef ___SYNC_ENABLED___
1356  write(fd, syncs_, dict_info_.sync_count << 2);
1357#endif
1358  write(fd, &dict_info_, sizeof(dict_info_));
1359}
1360
1361#ifdef ___CACHE_ENABLED___
1362bool UserDict::load_cache(UserDictSearchable *searchable,
1363                          uint32 *offset, uint32 *length) {
1364  UserDictCache *cache = &caches_[searchable->splids_len - 1];
1365  if (cache->head == cache->tail)
1366    return false;
1367
1368  uint16 j, sig_len = kMaxLemmaSize / 4;
1369  uint16 i = cache->head;
1370  while (1) {
1371    j = 0;
1372    for (; j < sig_len; j++) {
1373      if (cache->signatures[i][j] != searchable->signature[j])
1374        break;
1375    }
1376    if (j < sig_len) {
1377      i++;
1378      if (i >= kUserDictCacheSize)
1379        i -= kUserDictCacheSize;
1380      if (i == cache->tail)
1381        break;
1382      continue;
1383    }
1384    *offset = cache->offsets[i];
1385    *length = cache->lengths[i];
1386    return true;
1387  }
1388  return false;
1389}
1390
1391void UserDict::save_cache(UserDictSearchable *searchable,
1392                          uint32 offset, uint32 length) {
1393  UserDictCache *cache = &caches_[searchable->splids_len - 1];
1394  uint16 next = cache->tail;
1395
1396  cache->offsets[next] = offset;
1397  cache->lengths[next] = length;
1398  uint16 sig_len = kMaxLemmaSize / 4;
1399  uint16 j = 0;
1400  for (; j < sig_len; j++) {
1401    cache->signatures[next][j] = searchable->signature[j];
1402  }
1403
1404  if (++next >= kUserDictCacheSize) {
1405    next -= kUserDictCacheSize;
1406  }
1407  if (next == cache->head) {
1408    cache->head++;
1409    if (cache->head >= kUserDictCacheSize) {
1410      cache->head -= kUserDictCacheSize;
1411    }
1412  }
1413  cache->tail = next;
1414}
1415
1416void UserDict::reset_cache() {
1417  memset(caches_, 0, sizeof(caches_));
1418}
1419
1420bool UserDict::load_miss_cache(UserDictSearchable *searchable) {
1421  UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1422  if (cache->head == cache->tail)
1423    return false;
1424
1425  uint16 j, sig_len = kMaxLemmaSize / 4;
1426  uint16 i = cache->head;
1427  while (1) {
1428    j = 0;
1429    for (; j < sig_len; j++) {
1430      if (cache->signatures[i][j] != searchable->signature[j])
1431        break;
1432    }
1433    if (j < sig_len) {
1434      i++;
1435      if (i >= kUserDictMissCacheSize)
1436        i -= kUserDictMissCacheSize;
1437      if (i == cache->tail)
1438        break;
1439      continue;
1440    }
1441    return true;
1442  }
1443  return false;
1444}
1445
1446void UserDict::save_miss_cache(UserDictSearchable *searchable) {
1447  UserDictMissCache *cache = &miss_caches_[searchable->splids_len - 1];
1448  uint16 next = cache->tail;
1449
1450  uint16 sig_len = kMaxLemmaSize / 4;
1451  uint16 j = 0;
1452  for (; j < sig_len; j++) {
1453    cache->signatures[next][j] = searchable->signature[j];
1454  }
1455
1456  if (++next >= kUserDictMissCacheSize) {
1457    next -= kUserDictMissCacheSize;
1458  }
1459  if (next == cache->head) {
1460    cache->head++;
1461    if (cache->head >= kUserDictMissCacheSize) {
1462      cache->head -= kUserDictMissCacheSize;
1463    }
1464  }
1465  cache->tail = next;
1466}
1467
1468void UserDict::reset_miss_cache() {
1469  memset(miss_caches_, 0, sizeof(miss_caches_));
1470}
1471
1472void UserDict::cache_init() {
1473  reset_cache();
1474  reset_miss_cache();
1475}
1476
1477bool UserDict::cache_hit(UserDictSearchable *searchable,
1478                         uint32 *offset, uint32 *length) {
1479  bool hit = load_miss_cache(searchable);
1480  if (hit) {
1481    *offset = 0;
1482    *length = 0;
1483    return true;
1484  }
1485  hit = load_cache(searchable, offset, length);
1486  if (hit) {
1487    return true;
1488  }
1489  return false;
1490}
1491
1492void UserDict::cache_push(UserDictCacheType type,
1493                         UserDictSearchable *searchable,
1494                         uint32 offset, uint32 length) {
1495  switch (type) {
1496    case USER_DICT_MISS_CACHE:
1497      save_miss_cache(searchable);
1498      break;
1499    case USER_DICT_CACHE:
1500      save_cache(searchable, offset, length);
1501      break;
1502    default:
1503      break;
1504  }
1505}
1506
1507#endif
1508
1509void UserDict::defragment(void) {
1510#ifdef ___DEBUG_PERF___
1511  DEBUG_PERF_BEGIN;
1512#endif
1513  if (is_valid_state() == false)
1514    return;
1515  // Fixup offsets_, set REMOVE flag to lemma's flag if needed
1516  size_t first_freed = 0;
1517  size_t first_inuse = 0;
1518  while (first_freed < dict_info_.lemma_count) {
1519    // Find first freed offset
1520    while ((offsets_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1521            first_freed < dict_info_.lemma_count) {
1522      first_freed++;
1523    }
1524    if (first_freed < dict_info_.lemma_count) {
1525      // Save REMOVE flag to lemma flag
1526      int off = offsets_[first_freed];
1527      set_lemma_flag(off, kUserDictLemmaFlagRemove);
1528    } else {
1529      break;
1530    }
1531    // Find first inuse offse after first_freed
1532    first_inuse = first_freed + 1;
1533    while ((offsets_[first_inuse] & kUserDictOffsetFlagRemove) &&
1534           (first_inuse < dict_info_.lemma_count)) {
1535      // Save REMOVE flag to lemma flag
1536      int off = offsets_[first_inuse];
1537      set_lemma_flag(off, kUserDictLemmaFlagRemove);
1538      first_inuse++;
1539    }
1540    if (first_inuse >= dict_info_.lemma_count) {
1541      break;
1542    }
1543    // Swap offsets_
1544    int tmp = offsets_[first_inuse];
1545    offsets_[first_inuse] = offsets_[first_freed];
1546    offsets_[first_freed] = tmp;
1547    // Move scores_, no need to swap
1548    tmp = scores_[first_inuse];
1549    scores_[first_inuse] = scores_[first_freed];
1550    scores_[first_freed] = tmp;
1551    // Swap ids_
1552    LemmaIdType tmpid = ids_[first_inuse];
1553    ids_[first_inuse] = ids_[first_freed];
1554    ids_[first_freed] = tmpid;
1555    // Go on
1556    first_freed++;
1557  }
1558#ifdef ___PREDICT_ENABLED___
1559  // Fixup predicts_
1560  first_freed = 0;
1561  first_inuse = 0;
1562  while (first_freed < dict_info_.lemma_count) {
1563    // Find first freed offset
1564    while ((predicts_[first_freed] & kUserDictOffsetFlagRemove) == 0 &&
1565            first_freed < dict_info_.lemma_count) {
1566      first_freed++;
1567    }
1568    if (first_freed >= dict_info_.lemma_count)
1569      break;
1570    // Find first inuse offse after first_freed
1571    first_inuse = first_freed + 1;
1572    while ((predicts_[first_inuse] & kUserDictOffsetFlagRemove)
1573           && (first_inuse < dict_info_.lemma_count)) {
1574      first_inuse++;
1575    }
1576    if (first_inuse >= dict_info_.lemma_count) {
1577      break;
1578    }
1579    // Swap offsets_
1580    int tmp = predicts_[first_inuse];
1581    predicts_[first_inuse] = predicts_[first_freed];
1582    predicts_[first_freed] = tmp;
1583    // Go on
1584    first_freed++;
1585  }
1586#endif
1587  dict_info_.lemma_count = first_freed;
1588  // Fixup lemmas_
1589  size_t begin = 0;
1590  size_t end = 0;
1591  size_t dst = 0;
1592  int total_size = dict_info_.lemma_size + lemma_size_left_;
1593  int total_count = dict_info_.lemma_count + lemma_count_left_;
1594  size_t real_size = total_size - lemma_size_left_;
1595  while (dst < real_size) {
1596    unsigned char flag = get_lemma_flag(dst);
1597    unsigned char nchr = get_lemma_nchar(dst);
1598    if ((flag & kUserDictLemmaFlagRemove) == 0) {
1599      dst += nchr * 4 + 2;
1600      continue;
1601    }
1602    break;
1603  }
1604  if (dst >= real_size)
1605    return;
1606
1607  end = dst;
1608  while (end < real_size) {
1609    begin = end + get_lemma_nchar(end) * 4 + 2;
1610 repeat:
1611    // not used any more
1612    if (begin >= real_size)
1613      break;
1614    unsigned char flag = get_lemma_flag(begin);
1615    unsigned char nchr = get_lemma_nchar(begin);
1616    if (flag & kUserDictLemmaFlagRemove) {
1617      begin += nchr * 4 + 2;
1618      goto repeat;
1619    }
1620    end = begin + nchr * 4 + 2;
1621    while (end < real_size) {
1622      unsigned char eflag = get_lemma_flag(end);
1623      unsigned char enchr = get_lemma_nchar(end);
1624      if ((eflag & kUserDictLemmaFlagRemove) == 0) {
1625        end += enchr * 4 + 2;
1626        continue;
1627      }
1628      break;
1629    }
1630    memmove(lemmas_ + dst, lemmas_ + begin, end - begin);
1631    for (size_t j = 0; j < dict_info_.lemma_count; j++) {
1632      if (offsets_[j] >= begin && offsets_[j] < end) {
1633        offsets_[j] -= (begin - dst);
1634        offsets_by_id_[ids_[j] - start_id_] = offsets_[j];
1635      }
1636#ifdef ___PREDICT_ENABLED___
1637      if (predicts_[j] >= begin && predicts_[j] < end) {
1638        predicts_[j] -= (begin - dst);
1639      }
1640#endif
1641    }
1642#ifdef ___SYNC_ENABLED___
1643    for (size_t j = 0; j < dict_info_.sync_count; j++) {
1644      if (syncs_[j] >= begin && syncs_[j] < end) {
1645        syncs_[j] -= (begin - dst);
1646      }
1647    }
1648#endif
1649    dst += (end - begin);
1650  }
1651
1652  dict_info_.free_count = 0;
1653  dict_info_.free_size = 0;
1654  dict_info_.lemma_size = dst;
1655  lemma_size_left_ = total_size - dict_info_.lemma_size;
1656  lemma_count_left_ = total_count - dict_info_.lemma_count;
1657
1658  // XXX Without following code,
1659  // offsets_by_id_ is not reordered.
1660  // That's to say, all removed lemmas' ids are not collected back.
1661  // There may not be room for addition of new lemmas due to
1662  // offsests_by_id_ reason, although lemma_size_left_ is fixed.
1663  // By default, we do want defrag as fast as possible, because
1664  // during defrag procedure, other peers can not write new lemmas
1665  // to user dictionary file.
1666  // XXX If write-back is invoked immediately after
1667  // this defragment, no need to fix up following in-mem data.
1668  for (uint32 i = 0; i < dict_info_.lemma_count; i++) {
1669    ids_[i] = start_id_ + i;
1670    offsets_by_id_[i] = offsets_[i];
1671  }
1672
1673  state_ = USER_DICT_DEFRAGMENTED;
1674
1675#ifdef ___DEBUG_PERF___
1676  DEBUG_PERF_END;
1677  LOGD_PERF("defragment");
1678#endif
1679}
1680
1681#ifdef ___SYNC_ENABLED___
1682void UserDict::clear_sync_lemmas(unsigned int start, unsigned int end) {
1683  if (is_valid_state() == false)
1684    return;
1685  if (end > dict_info_.sync_count)
1686    end = dict_info_.sync_count;
1687  memmove(syncs_ + start, syncs_ + end, (dict_info_.sync_count - end) << 2);
1688  dict_info_.sync_count -= (end - start);
1689  if (state_ < USER_DICT_SYNC_DIRTY)
1690    state_ = USER_DICT_SYNC_DIRTY;
1691}
1692
1693int UserDict::get_sync_count() {
1694  if (is_valid_state() == false)
1695    return 0;
1696  return dict_info_.sync_count;
1697}
1698
1699LemmaIdType UserDict::put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
1700                        uint16 lemma_len, uint16 count, uint64 lmt) {
1701  int again = 0;
1702 begin:
1703  LemmaIdType id;
1704  uint32 * syncs_bak = syncs_;
1705  syncs_ = NULL;
1706  id = _put_lemma(lemma_str, splids, lemma_len, count, lmt);
1707  syncs_ = syncs_bak;
1708  if (id == 0 && again == 0) {
1709    if ((dict_info_.limit_lemma_count > 0 &&
1710        dict_info_.lemma_count >= dict_info_.limit_lemma_count)
1711        || (dict_info_.limit_lemma_size > 0 &&
1712            dict_info_.lemma_size + (2 + (lemma_len << 2))
1713            > dict_info_.limit_lemma_size)) {
1714      // XXX Always reclaim and defrag in sync code path
1715      //     sync thread is background thread and ok with heavy work
1716      reclaim();
1717      defragment();
1718      flush_cache();
1719      again = 1;
1720      goto begin;
1721    }
1722  }
1723  return id;
1724}
1725
1726int UserDict::put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len) {
1727  int newly_added = 0;
1728
1729  SpellingParser * spl_parser = new SpellingParser();
1730  if (!spl_parser) {
1731    return 0;
1732  }
1733#ifdef ___DEBUG_PERF___
1734  DEBUG_PERF_BEGIN;
1735#endif
1736  char16 *ptr = lemmas;
1737
1738  // Extract pinyin,words,frequence,last_mod_time
1739  char16 * p = ptr, * py16 = ptr;
1740  char16 * hz16 = NULL;
1741  int py16_len = 0;
1742  uint16 splid[kMaxLemmaSize];
1743  int splid_len = 0;
1744  int hz16_len = 0;
1745  char16 * fr16 = NULL;
1746  int fr16_len = 0;
1747
1748  while (p - ptr < len) {
1749    // Pinyin
1750    py16 = p;
1751    splid_len = 0;
1752    while (*p != 0x2c && (p - ptr) < len) {
1753      if (*p == 0x20)
1754        splid_len++;
1755      p++;
1756    }
1757    splid_len++;
1758    if (p - ptr == len)
1759      break;
1760    py16_len = p - py16;
1761    if (kMaxLemmaSize < splid_len) {
1762      break;
1763    }
1764    bool is_pre;
1765    int splidl = spl_parser->splstr16_to_idxs_f(
1766        py16, py16_len, splid, NULL, kMaxLemmaSize, is_pre);
1767    if (splidl != splid_len)
1768      break;
1769    // Phrase
1770    hz16 = ++p;
1771    while (*p != 0x2c && (p - ptr) < len) {
1772      p++;
1773    }
1774    hz16_len = p - hz16;
1775    if (hz16_len != splid_len)
1776      break;
1777    // Frequency
1778    fr16 = ++p;
1779    fr16_len = 0;
1780    while (*p != 0x2c && (p - ptr) < len) {
1781      p++;
1782    }
1783    fr16_len = p - fr16;
1784    uint32 intf = (uint32)utf16le_atoll(fr16, fr16_len);
1785    // Last modified time
1786    fr16 = ++p;
1787    fr16_len = 0;
1788    while (*p != 0x3b && (p - ptr) < len) {
1789      p++;
1790    }
1791    fr16_len = p - fr16;
1792    uint64 last_mod = utf16le_atoll(fr16, fr16_len);
1793
1794    put_lemma_no_sync(hz16, splid, splid_len, intf, last_mod);
1795    newly_added++;
1796
1797    p++;
1798  }
1799
1800#ifdef ___DEBUG_PERF___
1801  DEBUG_PERF_END;
1802  LOGD_PERF("put_lemmas_no_sync_from_utf16le_string");
1803#endif
1804  return newly_added;
1805}
1806
1807int UserDict::get_sync_lemmas_in_utf16le_string_from_beginning(
1808    char16 * str, int size, int * count) {
1809  int len = 0;
1810  *count = 0;
1811
1812  int left_len = size;
1813
1814  if (is_valid_state() == false)
1815    return len;
1816
1817  SpellingTrie * spl_trie = &SpellingTrie::get_instance();
1818  if (!spl_trie) {
1819    return 0;
1820  }
1821
1822  uint32 i;
1823  for (i = 0; i < dict_info_.sync_count; i++) {
1824    int offset = syncs_[i];
1825    uint32 nchar = get_lemma_nchar(offset);
1826    uint16 *spl = get_lemma_spell_ids(offset);
1827    uint16 *wrd = get_lemma_word(offset);
1828    int score = _get_lemma_score(wrd, spl, nchar);
1829
1830    static char score_temp[32], *pscore_temp = score_temp;
1831    static char16 temp[256], *ptemp = temp;
1832
1833    pscore_temp = score_temp;
1834    ptemp = temp;
1835
1836    uint32 j;
1837    // Add pinyin
1838    for (j = 0; j < nchar; j++) {
1839      int ret_len = spl_trie->get_spelling_str16(
1840          spl[j], ptemp, temp + sizeof(temp) - ptemp);
1841      if (ret_len <= 0)
1842        break;
1843      ptemp += ret_len;
1844      if (ptemp < temp + sizeof(temp) - 1) {
1845        *(ptemp++) = ' ';
1846      } else {
1847        j = 0;
1848        break;
1849      }
1850    }
1851    if (j < nchar) {
1852      continue;
1853    }
1854    ptemp--;
1855    if (ptemp < temp + sizeof(temp) - 1) {
1856      *(ptemp++) = ',';
1857    } else {
1858      continue;
1859    }
1860    // Add phrase
1861    for (j = 0; j < nchar; j++) {
1862      if (ptemp < temp + sizeof(temp) - 1) {
1863        *(ptemp++) = wrd[j];
1864      } else {
1865        break;
1866      }
1867    }
1868    if (j < nchar) {
1869      continue;
1870    }
1871    if (ptemp < temp + sizeof(temp) - 1) {
1872      *(ptemp++) = ',';
1873    } else {
1874      continue;
1875    }
1876    // Add frequency
1877    uint32 intf = extract_score_freq(score);
1878    int ret_len = utf16le_lltoa(intf, ptemp, temp + sizeof(temp) - ptemp);
1879    if (ret_len <= 0)
1880      continue;
1881    ptemp += ret_len;
1882    if (ptemp < temp + sizeof(temp) - 1) {
1883      *(ptemp++) = ',';
1884    } else {
1885      continue;
1886    }
1887    // Add last modified time
1888    uint64 last_mod = extract_score_lmt(score);
1889    ret_len = utf16le_lltoa(last_mod, ptemp, temp + sizeof(temp) - ptemp);
1890    if (ret_len <= 0)
1891      continue;
1892    ptemp += ret_len;
1893    if (ptemp < temp + sizeof(temp) - 1) {
1894      *(ptemp++) = ';';
1895    } else {
1896      continue;
1897    }
1898
1899    // Write to string
1900    int need_len = ptemp - temp;
1901    if (need_len > left_len)
1902      break;
1903    memcpy(str + len, temp, need_len * 2);
1904    left_len -= need_len;
1905
1906    len += need_len;
1907    (*count)++;
1908  }
1909
1910  if (len > 0) {
1911    if (state_ < USER_DICT_SYNC_DIRTY)
1912      state_ = USER_DICT_SYNC_DIRTY;
1913  }
1914  return len;
1915}
1916
1917#endif
1918
1919bool UserDict::state(UserDictStat * stat) {
1920  if (is_valid_state() == false)
1921    return false;
1922  if (!stat)
1923    return false;
1924  stat->version = version_;
1925  stat->file_name = dict_file_;
1926  stat->load_time.tv_sec = load_time_.tv_sec;
1927  stat->load_time.tv_usec = load_time_.tv_usec;
1928  pthread_mutex_lock(&g_mutex_);
1929  stat->last_update.tv_sec = g_last_update_.tv_sec;
1930  stat->last_update.tv_usec = g_last_update_.tv_usec;
1931  pthread_mutex_unlock(&g_mutex_);
1932  stat->disk_size = get_dict_file_size(&dict_info_);
1933  stat->lemma_count = dict_info_.lemma_count;
1934  stat->lemma_size = dict_info_.lemma_size;
1935  stat->delete_count = dict_info_.free_count;
1936  stat->delete_size = dict_info_.free_size;
1937#ifdef ___SYNC_ENABLED___
1938  stat->sync_count = dict_info_.sync_count;
1939#endif
1940  stat->limit_lemma_count = dict_info_.limit_lemma_count;
1941  stat->limit_lemma_size = dict_info_.limit_lemma_size;
1942  stat->reclaim_ratio = dict_info_.reclaim_ratio;
1943  return true;
1944}
1945
1946void UserDict::set_limit(uint32 max_lemma_count,
1947                         uint32 max_lemma_size, uint32 reclaim_ratio) {
1948  dict_info_.limit_lemma_count = max_lemma_count;
1949  dict_info_.limit_lemma_size = max_lemma_size;
1950  if (reclaim_ratio > 100)
1951    reclaim_ratio = 100;
1952  dict_info_.reclaim_ratio = reclaim_ratio;
1953}
1954
1955void UserDict::reclaim() {
1956  if (is_valid_state() == false)
1957    return;
1958
1959  switch (dict_info_.reclaim_ratio) {
1960    case 0:
1961      return;
1962    case 100:
1963      // TODO: CLEAR to be implemented
1964      assert(false);
1965      return;
1966    default:
1967      break;
1968  }
1969
1970  // XXX Reclaim is only based on count, not size
1971  uint32 count = dict_info_.lemma_count;
1972  int rc = count * dict_info_.reclaim_ratio / 100;
1973
1974  UserDictScoreOffsetPair * score_offset_pairs = NULL;
1975  score_offset_pairs = (UserDictScoreOffsetPair *)malloc(
1976      sizeof(UserDictScoreOffsetPair) * rc);
1977  if (score_offset_pairs == NULL) {
1978    return;
1979  }
1980
1981  for (int i = 0; i < rc; i++) {
1982    int s = scores_[i];
1983    score_offset_pairs[i].score = s;
1984    score_offset_pairs[i].offset_index = i;
1985  }
1986
1987  for (int i = (rc + 1) / 2; i >= 0; i--)
1988    shift_down(score_offset_pairs, i, rc);
1989
1990  for (uint32 i = rc; i < dict_info_.lemma_count; i++) {
1991    int s = scores_[i];
1992    if (s < score_offset_pairs[0].score) {
1993      score_offset_pairs[0].score = s;
1994      score_offset_pairs[0].offset_index = i;
1995      shift_down(score_offset_pairs, 0, rc);
1996    }
1997  }
1998
1999  for (int i = 0; i < rc; i++) {
2000    int off = score_offset_pairs[i].offset_index;
2001    remove_lemma_by_offset_index(off);
2002  }
2003  if (rc > 0) {
2004    if (state_ < USER_DICT_OFFSET_DIRTY)
2005      state_ = USER_DICT_OFFSET_DIRTY;
2006  }
2007
2008  free(score_offset_pairs);
2009}
2010
2011inline void UserDict::swap(UserDictScoreOffsetPair * sop, int i, int j) {
2012  int s = sop[i].score;
2013  int p = sop[i].offset_index;
2014  sop[i].score = sop[j].score;
2015  sop[i].offset_index = sop[j].offset_index;
2016  sop[j].score = s;
2017  sop[j].offset_index = p;
2018}
2019
2020void UserDict::shift_down(UserDictScoreOffsetPair * sop, int i, int n) {
2021  int par = i;
2022  while (par < n) {
2023    int left = par * 2 + 1;
2024    int right = left + 1;
2025    if (left >= n && right >= n)
2026      break;
2027    if (right >= n) {
2028      if (sop[left].score > sop[par].score) {
2029        swap(sop, left, par);
2030        par = left;
2031        continue;
2032      }
2033    } else if (sop[left].score > sop[right].score &&
2034               sop[left].score > sop[par].score) {
2035      swap(sop, left, par);
2036      par = left;
2037      continue;
2038    } else if (sop[right].score > sop[left].score &&
2039               sop[right].score > sop[par].score) {
2040      swap(sop, right, par);
2041      par = right;
2042      continue;
2043    }
2044    break;
2045  }
2046}
2047
2048LemmaIdType UserDict::put_lemma(char16 lemma_str[], uint16 splids[],
2049                                uint16 lemma_len, uint16 count) {
2050  return _put_lemma(lemma_str, splids, lemma_len, count, time(NULL));
2051}
2052
2053LemmaIdType UserDict::_put_lemma(char16 lemma_str[], uint16 splids[],
2054                                 uint16 lemma_len, uint16 count, uint64 lmt) {
2055#ifdef ___DEBUG_PERF___
2056  DEBUG_PERF_BEGIN;
2057#endif
2058  if (is_valid_state() == false)
2059    return 0;
2060  int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2061  if (off != -1) {
2062    int delta_score = count - scores_[off];
2063    dict_info_.total_nfreq += delta_score;
2064    scores_[off] = build_score(lmt, count);
2065    if (state_ < USER_DICT_SCORE_DIRTY)
2066      state_ = USER_DICT_SCORE_DIRTY;
2067#ifdef ___DEBUG_PERF___
2068    DEBUG_PERF_END;
2069    LOGD_PERF("_put_lemma(update)");
2070#endif
2071    return ids_[off];
2072  } else {
2073    if ((dict_info_.limit_lemma_count > 0 &&
2074        dict_info_.lemma_count >= dict_info_.limit_lemma_count)
2075        || (dict_info_.limit_lemma_size > 0 &&
2076            dict_info_.lemma_size + (2 + (lemma_len << 2))
2077            > dict_info_.limit_lemma_size)) {
2078      // XXX Don't defragment here, it's too time-consuming.
2079      return 0;
2080    }
2081    int flushed = 0;
2082    if (lemma_count_left_ == 0 ||
2083        lemma_size_left_ < (size_t)(2 + (lemma_len << 2))) {
2084
2085      // XXX When there is no space for new lemma, we flush to disk
2086      // flush_cache() may be called by upper user
2087      // and better place shoule be found instead of here
2088      flush_cache();
2089      flushed = 1;
2090      // Or simply return and do nothing
2091      // return 0;
2092    }
2093#ifdef ___DEBUG_PERF___
2094    DEBUG_PERF_END;
2095    LOGD_PERF(flushed ? "_put_lemma(flush+add)" : "_put_lemma(add)");
2096#endif
2097    LemmaIdType id = append_a_lemma(lemma_str, splids, lemma_len, count, lmt);
2098#ifdef ___SYNC_ENABLED___
2099    if (syncs_ && id != 0) {
2100      queue_lemma_for_sync(id);
2101    }
2102#endif
2103    return id;
2104  }
2105  return 0;
2106}
2107
2108#ifdef ___SYNC_ENABLED___
2109void UserDict::queue_lemma_for_sync(LemmaIdType id) {
2110  if (dict_info_.sync_count < sync_count_size_) {
2111    syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2112  } else {
2113    uint32 * syncs = (uint32*)realloc(
2114        syncs_, (sync_count_size_ + kUserDictPreAlloc) << 2);
2115    if (syncs) {
2116      sync_count_size_ += kUserDictPreAlloc;
2117      syncs_ = syncs;
2118      syncs_[dict_info_.sync_count++] = offsets_by_id_[id - start_id_];
2119    }
2120  }
2121}
2122#endif
2123
2124LemmaIdType UserDict::update_lemma(LemmaIdType lemma_id, int16 delta_count,
2125                                   bool selected) {
2126#ifdef ___DEBUG_PERF___
2127  DEBUG_PERF_BEGIN;
2128#endif
2129  if (is_valid_state() == false)
2130    return 0;
2131  if (is_valid_lemma_id(lemma_id) == false)
2132    return 0;
2133  uint32 offset = offsets_by_id_[lemma_id - start_id_];
2134  uint8 lemma_len = get_lemma_nchar(offset);
2135  char16 * lemma_str = get_lemma_word(offset);
2136  uint16 * splids = get_lemma_spell_ids(offset);
2137
2138  int32 off = locate_in_offsets(lemma_str, splids, lemma_len);
2139  if (off != -1) {
2140    int score = scores_[off];
2141    int count = extract_score_freq(score);
2142    uint64 lmt = extract_score_lmt(score);
2143    if (count + delta_count > kUserDictMaxFrequency ||
2144        count + delta_count < count) {
2145      delta_count = kUserDictMaxFrequency - count;
2146    }
2147    count += delta_count;
2148    dict_info_.total_nfreq += delta_count;
2149    if (selected) {
2150      lmt = time(NULL);
2151    }
2152    scores_[off] = build_score(lmt, count);
2153    if (state_ < USER_DICT_SCORE_DIRTY)
2154      state_ = USER_DICT_SCORE_DIRTY;
2155#ifdef ___DEBUG_PERF___
2156    DEBUG_PERF_END;
2157    LOGD_PERF("update_lemma");
2158#endif
2159#ifdef ___SYNC_ENABLED___
2160    queue_lemma_for_sync(ids_[off]);
2161#endif
2162    return ids_[off];
2163  }
2164  return 0;
2165}
2166
2167size_t UserDict::get_total_lemma_count() {
2168  return dict_info_.total_nfreq;
2169}
2170
2171void UserDict::set_total_lemma_count_of_others(size_t count) {
2172  total_other_nfreq_ = count;
2173}
2174
2175LemmaIdType UserDict::append_a_lemma(char16 lemma_str[], uint16 splids[],
2176                                   uint16 lemma_len, uint16 count, uint64 lmt) {
2177  LemmaIdType id = get_max_lemma_id() + 1;
2178  size_t offset = dict_info_.lemma_size;
2179  if (offset > kUserDictOffsetMask)
2180    return 0;
2181
2182  lemmas_[offset] = 0;
2183  lemmas_[offset + 1] = (uint8)lemma_len;
2184  for (size_t i = 0; i < lemma_len; i++) {
2185    *((uint16*)&lemmas_[offset + 2 + (i << 1)]) = splids[i];
2186    *((char16*)&lemmas_[offset + 2 + (lemma_len << 1) + (i << 1)])
2187        = lemma_str[i];
2188  }
2189  uint32 off = dict_info_.lemma_count;
2190  offsets_[off] = offset;
2191  scores_[off] = build_score(lmt, count);
2192  ids_[off] = id;
2193#ifdef ___PREDICT_ENABLED___
2194  predicts_[off] = offset;
2195#endif
2196
2197  offsets_by_id_[id - start_id_] = offset;
2198
2199  dict_info_.lemma_count++;
2200  dict_info_.lemma_size += (2 + (lemma_len << 2));
2201  lemma_count_left_--;
2202  lemma_size_left_ -= (2 + (lemma_len << 2));
2203
2204  // Sort
2205
2206  UserDictSearchable searchable;
2207  prepare_locate(&searchable, splids, lemma_len);
2208
2209  size_t i = 0;
2210  while (i < off) {
2211    offset = offsets_[i];
2212    uint32 nchar = get_lemma_nchar(offset);
2213    uint16 * spl = get_lemma_spell_ids(offset);
2214
2215    if (0 <= fuzzy_compare_spell_id(spl, nchar, &searchable))
2216      break;
2217    i++;
2218  }
2219  if (i != off) {
2220    uint32 temp = offsets_[off];
2221    memmove(offsets_ + i + 1, offsets_ + i, (off - i) << 2);
2222    offsets_[i] = temp;
2223
2224    temp = scores_[off];
2225    memmove(scores_ + i + 1, scores_ + i, (off - i) << 2);
2226    scores_[i] = temp;
2227
2228    temp = ids_[off];
2229    memmove(ids_ + i + 1, ids_ + i, (off - i) << 2);
2230    ids_[i] = temp;
2231  }
2232
2233#ifdef ___PREDICT_ENABLED___
2234  uint32 j = 0;
2235  uint16 * words_new = get_lemma_word(predicts_[off]);
2236  j = locate_where_to_insert_in_predicts(words_new, lemma_len);
2237  if (j != off) {
2238    uint32 temp = predicts_[off];
2239    memmove(predicts_ + j + 1, predicts_ + j, (off - j) << 2);
2240    predicts_[j] = temp;
2241  }
2242#endif
2243
2244  if (state_ < USER_DICT_LEMMA_DIRTY)
2245    state_ = USER_DICT_LEMMA_DIRTY;
2246
2247#ifdef ___CACHE_ENABLED___
2248  cache_init();
2249#endif
2250
2251  dict_info_.total_nfreq += count;
2252  return id;
2253}
2254}
2255