1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef PINYINIME_INCLUDE_USERDICT_H__
18#define PINYINIME_INCLUDE_USERDICT_H__
19
20#define ___CACHE_ENABLED___
21#define ___SYNC_ENABLED___
22#define ___PREDICT_ENABLED___
23
24// Debug performance for operations
25// #define ___DEBUG_PERF___
26
27#include <pthread.h>
28#include "atomdictbase.h"
29
30namespace ime_pinyin {
31
32class UserDict : public AtomDictBase {
33 public:
34  UserDict();
35  ~UserDict();
36
37  bool load_dict(const char *file_name, LemmaIdType start_id,
38                 LemmaIdType end_id);
39
40  bool close_dict();
41
42  size_t number_of_lemmas();
43
44  void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
45
46  MileStoneHandle extend_dict(MileStoneHandle from_handle,
47                              const DictExtPara *dep, LmaPsbItem *lpi_items,
48                              size_t lpi_max, size_t *lpi_num);
49
50  size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
51                  LmaPsbItem *lpi_items, size_t lpi_max);
52
53  uint16 get_lemma_str(LemmaIdType id_lemma, char16* str_buf,
54                       uint16 str_max);
55
56  uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
57                          uint16 splids_max, bool arg_valid);
58
59  size_t predict(const char16 last_hzs[], uint16 hzs_len,
60                 NPredictItem *npre_items, size_t npre_max,
61                 size_t b4_used);
62
63  // Full spelling ids are required
64  LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
65                        uint16 lemma_len, uint16 count);
66
67  LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
68                           bool selected);
69
70  LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
71                           uint16 lemma_len);
72
73  LmaScoreType get_lemma_score(LemmaIdType lemma_id);
74
75  LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
76                        uint16 lemma_len);
77
78  bool remove_lemma(LemmaIdType lemma_id);
79
80  size_t get_total_lemma_count();
81  void set_total_lemma_count_of_others(size_t count);
82
83  void flush_cache();
84
85  void set_limit(uint32 max_lemma_count, uint32 max_lemma_size,
86                 uint32 reclaim_ratio);
87
88  void reclaim();
89
90  void defragment();
91
92#ifdef ___SYNC_ENABLED___
93  void clear_sync_lemmas(unsigned int start, unsigned int end);
94
95  int get_sync_count();
96
97  LemmaIdType put_lemma_no_sync(char16 lemma_str[], uint16 splids[],
98                        uint16 lemma_len, uint16 count, uint64 lmt);
99   /**
100    * Add lemmas encoded in UTF-16LE into dictionary without adding sync flag.
101    *
102    * @param lemmas in format of 'wo men,WM,0.32;da jia,DJ,0.12'
103    * @param len length of lemmas string in UTF-16LE
104    * @return newly added lemma count
105    */
106  int put_lemmas_no_sync_from_utf16le_string(char16 * lemmas, int len);
107
108  /**
109   * Get lemmas need sync to a UTF-16LE string of above format.
110   * Note: input buffer (str) must not be too small. If str is too small to
111   *       contain single one lemma, there might be a dead loop.
112   *
113   * @param str buffer to write lemmas
114   * @param size buffer size in UTF-16LE
115   * @param count output value of lemma returned
116   * @return UTF-16LE string length
117   */
118  int get_sync_lemmas_in_utf16le_string_from_beginning(
119      char16 * str, int size, int * count);
120
121#endif
122
123  struct UserDictStat {
124    uint32 version;
125    const char * file_name;
126    struct timeval load_time;
127    struct timeval last_update;
128    uint32 disk_size;
129    uint32 lemma_count;
130    uint32 lemma_size;
131    uint32 delete_count;
132    uint32 delete_size;
133#ifdef ___SYNC_ENABLED___
134    uint32 sync_count;
135#endif
136    uint32 reclaim_ratio;
137    uint32 limit_lemma_count;
138    uint32 limit_lemma_size;
139  };
140
141  bool state(UserDictStat * stat);
142
143 private:
144  uint32 total_other_nfreq_;
145  struct timeval load_time_;
146  LemmaIdType start_id_;
147  uint32 version_;
148  uint8 * lemmas_;
149
150  // In-Memory-Only flag for each lemma
151  static const uint8 kUserDictLemmaFlagRemove = 1;
152  // Inuse lemmas' offset
153  uint32 * offsets_;
154  // Highest bit in offset tells whether corresponding lemma is removed
155  static const uint32 kUserDictOffsetFlagRemove = (1 << 31);
156  // Maximum possible for the offset
157  static const uint32 kUserDictOffsetMask = ~(kUserDictOffsetFlagRemove);
158  // Bit width for last modified time, from 1 to 16
159  static const uint32 kUserDictLMTBitWidth = 16;
160  // Granularity for last modified time in second
161  static const uint32 kUserDictLMTGranularity = 60 * 60 * 24 * 7;
162  // Maximum frequency count
163  static const uint16 kUserDictMaxFrequency = 0xFFFF;
164
165#define COARSE_UTC(year, month, day, hour, minute, second) \
166  ( \
167    (year - 1970) * 365 * 24 * 60 * 60 + \
168    (month - 1) * 30 * 24 * 60 * 60 + \
169    (day - 1) * 24 * 60 * 60 + \
170    (hour - 0) * 60 * 60 + \
171    (minute - 0) * 60 + \
172    (second - 0) \
173  )
174  static const uint64 kUserDictLMTSince = COARSE_UTC(2009, 1, 1, 0, 0, 0);
175
176  // Correspond to offsets_
177  uint32 * scores_;
178  // Following two fields are only valid in memory
179  uint32 * ids_;
180#ifdef ___PREDICT_ENABLED___
181  uint32 * predicts_;
182#endif
183#ifdef ___SYNC_ENABLED___
184  uint32 * syncs_;
185  size_t sync_count_size_;
186#endif
187  uint32 * offsets_by_id_;
188
189  size_t lemma_count_left_;
190  size_t lemma_size_left_;
191
192  const char * dict_file_;
193
194  // Be sure size is 4xN
195  struct UserDictInfo {
196    // When limitation reached, how much percentage will be reclaimed (1 ~ 100)
197    uint32 reclaim_ratio;
198    // maximum lemma count, 0 means no limitation
199    uint32 limit_lemma_count;
200    // Maximum lemma size, it's different from
201    // whole disk file size or in-mem dict size
202    // 0 means no limitation
203    uint32 limit_lemma_size;
204    // Total lemma count including deleted and inuse
205    // Also indicate offsets_ size
206    uint32 lemma_count;
207    // Total size of lemmas including used and freed
208    uint32 lemma_size;
209    // Freed lemma count
210    uint32 free_count;
211    // Freed lemma size in byte
212    uint32 free_size;
213#ifdef ___SYNC_ENABLED___
214    uint32 sync_count;
215#endif
216    int32 total_nfreq;
217  } dict_info_;
218
219  static const uint32 kUserDictVersion = 0x0ABCDEF0;
220
221  static const uint32 kUserDictPreAlloc = 32;
222  static const uint32 kUserDictAverageNchar = 8;
223
224  enum UserDictState {
225    // Keep in order
226    USER_DICT_NONE = 0,
227    USER_DICT_SYNC,
228#ifdef ___SYNC_ENABLED___
229    USER_DICT_SYNC_DIRTY,
230#endif
231    USER_DICT_SCORE_DIRTY,
232    USER_DICT_OFFSET_DIRTY,
233    USER_DICT_LEMMA_DIRTY,
234
235    USER_DICT_DEFRAGMENTED,
236  } state_;
237
238  struct UserDictSearchable {
239    uint16 splids_len;
240    uint16 splid_start[kMaxLemmaSize];
241    uint16 splid_count[kMaxLemmaSize];
242    // Compact inital letters for both FuzzyCompareSpellId and cache system
243    uint32 signature[kMaxLemmaSize / 4];
244  };
245
246#ifdef ___CACHE_ENABLED___
247  enum UserDictCacheType {
248    USER_DICT_CACHE,
249    USER_DICT_MISS_CACHE,
250  };
251
252  static const int kUserDictCacheSize = 4;
253  static const int kUserDictMissCacheSize = kMaxLemmaSize - 1;
254
255  struct UserDictMissCache {
256    uint32 signatures[kUserDictMissCacheSize][kMaxLemmaSize / 4];
257    uint16 head, tail;
258  } miss_caches_[kMaxLemmaSize];
259
260  struct UserDictCache {
261    uint32 signatures[kUserDictCacheSize][kMaxLemmaSize / 4];
262    uint32 offsets[kUserDictCacheSize];
263    uint32 lengths[kUserDictCacheSize];
264    // Ring buffer
265    uint16 head, tail;
266  } caches_[kMaxLemmaSize];
267
268  void cache_init();
269
270  void cache_push(UserDictCacheType type,
271                 UserDictSearchable *searchable,
272                 uint32 offset, uint32 length);
273
274  bool cache_hit(UserDictSearchable *searchable,
275                 uint32 *offset, uint32 *length);
276
277  bool load_cache(UserDictSearchable *searchable,
278                  uint32 *offset, uint32 *length);
279
280  void save_cache(UserDictSearchable *searchable,
281                  uint32 offset, uint32 length);
282
283  void reset_cache();
284
285  bool load_miss_cache(UserDictSearchable *searchable);
286
287  void save_miss_cache(UserDictSearchable *searchable);
288
289  void reset_miss_cache();
290#endif
291
292  LmaScoreType translate_score(int f);
293
294  int extract_score_freq(int raw_score);
295
296  uint64 extract_score_lmt(int raw_score);
297
298  inline int build_score(uint64 lmt, int freq);
299
300  inline int64 utf16le_atoll(uint16 *s, int len);
301
302  inline int utf16le_lltoa(int64 v, uint16 *s, int size);
303
304  LemmaIdType _put_lemma(char16 lemma_str[], uint16 splids[],
305                        uint16 lemma_len, uint16 count, uint64 lmt);
306
307  size_t _get_lpis(const uint16 *splid_str, uint16 splid_str_len,
308                   LmaPsbItem *lpi_items, size_t lpi_max, bool * need_extend);
309
310  int _get_lemma_score(char16 lemma_str[], uint16 splids[], uint16 lemma_len);
311
312  int _get_lemma_score(LemmaIdType lemma_id);
313
314  int is_fuzzy_prefix_spell_id(const uint16 * id1, uint16 len1,
315                               const UserDictSearchable *searchable);
316
317  bool is_prefix_spell_id(const uint16 * fullids,
318                          uint16 fulllen, const UserDictSearchable *searchable);
319
320  uint32 get_dict_file_size(UserDictInfo * info);
321
322  bool reset(const char *file);
323
324  bool validate(const char *file);
325
326  bool load(const char *file, LemmaIdType start_id);
327
328  bool is_valid_state();
329
330  bool is_valid_lemma_id(LemmaIdType id);
331
332  LemmaIdType get_max_lemma_id();
333
334  void set_lemma_flag(uint32 offset, uint8 flag);
335
336  char get_lemma_flag(uint32 offset);
337
338  char get_lemma_nchar(uint32 offset);
339
340  uint16 * get_lemma_spell_ids(uint32 offset);
341
342  uint16 * get_lemma_word(uint32 offset);
343
344  // Prepare searchable to fasten locate process
345  void prepare_locate(UserDictSearchable *searchable,
346                      const uint16 * splids, uint16 len);
347
348  // Compare initial letters only
349  int32 fuzzy_compare_spell_id(const uint16 * id1, uint16 len1,
350                               const UserDictSearchable *searchable);
351
352  // Compare exactly two spell ids
353  // First argument must be a full id spell id
354  bool equal_spell_id(const uint16 * fullids,
355                      uint16 fulllen, const UserDictSearchable *searchable);
356
357  // Find first item by initial letters
358  int32 locate_first_in_offsets(const UserDictSearchable *searchable);
359
360  LemmaIdType append_a_lemma(char16 lemma_str[], uint16 splids[],
361                           uint16 lemma_len, uint16 count, uint64 lmt);
362
363  // Check if a lemma is in dictionary
364  int32 locate_in_offsets(char16 lemma_str[],
365                          uint16 splid_str[], uint16 lemma_len);
366
367  bool remove_lemma_by_offset_index(int offset_index);
368#ifdef ___PREDICT_ENABLED___
369  uint32 locate_where_to_insert_in_predicts(const uint16 * words,
370                                            int lemma_len);
371
372  int32 locate_first_in_predicts(const uint16 * words, int lemma_len);
373
374  void remove_lemma_from_predict_list(uint32 offset);
375#endif
376#ifdef ___SYNC_ENABLED___
377  void queue_lemma_for_sync(LemmaIdType id);
378
379  void remove_lemma_from_sync_list(uint32 offset);
380
381  void write_back_sync(int fd);
382#endif
383  void write_back_score(int fd);
384  void write_back_offset(int fd);
385  void write_back_lemma(int fd);
386  void write_back_all(int fd);
387  void write_back();
388
389  struct UserDictScoreOffsetPair {
390    int score;
391    uint32 offset_index;
392  };
393
394  inline void swap(UserDictScoreOffsetPair * sop, int i, int j);
395
396  void shift_down(UserDictScoreOffsetPair * sop, int i, int n);
397
398  // On-disk format for each lemma
399  // +-------------+
400  // | Version (4) |
401  // +-------------+
402  // +-----------+-----------+--------------------+-------------------+
403  // | Spare (1) | Nchar (1) | Splids (2 x Nchar) | Lemma (2 x Nchar) |
404  // +-----------+-----------+--------------------+-------------------+
405  // ...
406  // +-----------------------+     +-------------+      <---Offset of offset
407  // | Offset1 by_splids (4) | ... | OffsetN (4) |
408  // +-----------------------+     +-------------+
409#ifdef ___PREDICT_ENABLED___
410  // +----------------------+     +-------------+
411  // | Offset1 by_lemma (4) | ... | OffsetN (4) |
412  // +----------------------+     +-------------+
413#endif
414  // +------------+     +------------+
415  // | Score1 (4) | ... | ScoreN (4) |
416  // +------------+     +------------+
417#ifdef ___SYNC_ENABLED___
418  // +-------------+     +-------------+
419  // | NewAdd1 (4) | ... | NewAddN (4) |
420  // +-------------+     +-------------+
421#endif
422  // +----------------+
423  // | Dict Info (4x) |
424  // +----------------+
425};
426}
427
428#endif
429