1/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef PINYINIME_INCLUDE_DICTDEF_H__
18#define PINYINIME_INCLUDE_DICTDEF_H__
19
20#include <stdlib.h>
21#include "./utf16char.h"
22
23namespace ime_pinyin {
24
25// Enable the following line when building the binary dictionary model.
26// #define ___BUILD_MODEL___
27
// Integer aliases used by the dictionary structures, grouped by the width
// their names advertise. The names assume char/short/int/long long are
// 1/2/4/8 bytes wide on the target platform.

// 8-bit.
typedef signed char        int8;
typedef unsigned char      uint8;

// 16-bit.
typedef short              int16;
typedef unsigned short     uint16;

// 32-bit.
typedef int                int32;
typedef unsigned int       uint32;

// 64-bit.
typedef long long          int64;
typedef unsigned long long uint64;
37
// Compile-time debug switches; all three are disabled here.
// NOTE(review): presumably each flag gates a different level of debug
// printing in the code that includes this header -- confirm at the use sites.
const bool kPrintDebug0 = false;
const bool kPrintDebug1 = false;
const bool kPrintDebug2 = false;
41
// The maximum length of a lemma.
const size_t kMaxLemmaSize = 8;

// The maximum length of a Pinyin (spelling).
const size_t kMaxPinyinSize = 6;

// The number of half spelling ids. The original note says that for Chinese
// Pinyin there are 30 half ids; see SpellingTrie.h for details.
// NOTE(review): the value below is 29, not 30 -- possibly because one id
// (such as 0) is not counted; confirm against SpellingTrie.h.
const size_t kHalfSpellingIdNum = 29;

// The maximum number of full spellings. For Chinese Pinyin, there are only
// about 410 spellings.
// If this value is made bigger (so that it needs more bits), please also
// update other structures such as SpellingNode to make sure that a spelling
// id can still be stored.
// The extra -1 is there because id 0 is never used.
const size_t kMaxSpellingNum = 512 - 1 - kHalfSpellingIdNum;

// NOTE(review): by its name, the upper bound on the number of search steps;
// confirm at the use sites.
const size_t kMaxSearchSteps = 40;

// One character predicts its following characters, so a prediction is one
// shorter than the longest lemma.
const size_t kMaxPredictSize = kMaxLemmaSize - 1;
63
// LemmaIdType must always be size_t.
typedef size_t LemmaIdType;
const size_t kLemmaIdSize = 3;  // Actually, an id occupies 3 bytes in storage.
// 0xffffff is the largest value that fits in kLemmaIdSize (3) bytes.
// NOTE(review): presumably reserved as the id of the phrase currently being
// composed rather than a real dictionary entry -- confirm at the use sites.
const size_t kLemmaIdComposing = 0xffffff;

// Score types; both are aliases of uint16.
typedef uint16 LmaScoreType;
typedef uint16 KeyScoreType;
71
// The number of highest-scoring items that are kept for prediction purposes.
const size_t kTopScoreLemmaNum = 10;

// NOTE(review): judging by the names, these cap the number of predictions
// produced from a history of more than 3, exactly 3, and exactly 2
// characters respectively -- confirm at the use sites.
const size_t kMaxPredictNumByGt3 = 1;
const size_t kMaxPredictNumBy3 = 2;
const size_t kMaxPredictNumBy2 = 2;
78
79// The last lemma id (included) for the system dictionary. The system
80// dictionary's ids always start from 1.
81const LemmaIdType kSysDictIdEnd = 500000;
82
83// The first lemma id for the user dictionary.
84const LemmaIdType kUserDictIdStart = 500001;
85
86// The last lemma id (included) for the user dictionary.
87const LemmaIdType kUserDictIdEnd = 600000;
88
// A spelling id split into two bit-fields declared on uint16: 5 bits for the
// half spelling id (values 0..31) and 11 bits for the full spelling id
// (values 0..2047), 16 bits in total. The exact bit placement inside the
// storage unit is compiler-defined. PSpellingId is the matching pointer type.
typedef struct {
  uint16 half_splid:5;    // half spelling id, 5 bits
  uint16 full_splid:11;   // full spelling id, 11 bits
} SpellingId, *PSpellingId;
93
94
/**
 * We use different node types for different layers.
 * Statistical data of the building result for a testing dictionary:
 *                              root,   level 0,   level 1,   level 2,   level 3
 * max son num of one node:     406        280         41          2          -
 * max homo num of one node:      0         90         23          2          2
 * total node num of a layer:     1        406      31766      13516        993
 * total homo num of a layer:     9       5674      44609      12667        995
 *
 * The node number for root and level 0 won't be larger than 500.
 * According to the information above, two kinds of nodes can be used: one for
 * root and level 0, the other for the layers deeper than 0.
 *
 * LE = less than or equal (this node type is for levels <= 0).
 * A node occupies 16 bytes when size_t is 4 bytes wide, so in total less than
 * 16 * 500 = 8K. Where size_t is 8 bytes wide the two offset members alone
 * take 16 bytes, so the node is larger (typically 24 bytes with padding).
 * NOTE(review): if these nodes are read from or written to a binary
 * dictionary as raw bytes, the size_t members make the layout
 * platform-dependent -- confirm against the load/save code.
 */
struct LmaNodeLE0 {
  size_t son_1st_off;        // offset of the first son node
  size_t homo_idx_buf_off;   // offset into the homo index buffer
  uint16 spl_idx;            // spelling index of this node
  uint16 num_of_son;         // number of son nodes
  uint16 num_of_homo;        // number of homo words
};
118
/**
 * GE = greater than or equal (this node type is for levels >= 1).
 * The two offsets are each split into a 16-bit low part and an 8-bit high
 * part to keep the node compact.
 * NOTE(review): the original comment said a node occupies 8 bytes, but the
 * members below add up to 10 bytes (three uint16 plus four unsigned char).
 */
struct LmaNodeGE1 {
  uint16 son_1st_off_l;        // Low bits of the son_1st_off
  uint16 homo_idx_buf_off_l;   // Low bits of the homo_idx_buf_off
  uint16 spl_idx;              // spelling index of this node
  unsigned char num_of_son;            // number of son nodes
  unsigned char num_of_homo;           // number of homo words
  unsigned char son_1st_off_h;         // high bits of the son_1st_off
  unsigned char homo_idx_buf_off_h;    // high bits of the homo_idx_buf_off
};
132
133#ifdef ___BUILD_MODEL___
// A single-character item, only compiled when ___BUILD_MODEL___ is defined
// (i.e. when building the binary dictionary model).
struct SingleCharItem {
  float freq;         // frequency of this item
  char16 hz;          // the character itself (presumably a Hanzi -- confirm)
  SpellingId splid;   // spelling id (half and full parts) for the character
};
139
// One lemma (word or phrase) as used while building the dictionary model;
// only compiled when ___BUILD_MODEL___ is defined.
struct LemmaEntry {
  // NOTE(review): judging by the names, the lemma's index when ordered by
  // Pinyin and by Hanzi respectively -- confirm against the builder code.
  LemmaIdType idx_by_py;
  LemmaIdType idx_by_hz;
  // The Hanzi string; one element more than kMaxLemmaSize (presumably for a
  // terminator -- confirm).
  char16 hanzi_str[kMaxLemmaSize + 1];

  // The SingleCharItem id for each Hanzi.
  uint16 hanzi_scis_ids[kMaxLemmaSize];

  // Spelling indices; one element more than kMaxLemmaSize.
  uint16 spl_idx_arr[kMaxLemmaSize + 1];
  // One Pinyin string per character, each with room for kMaxPinyinSize
  // chars plus one extra byte.
  char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];
  unsigned char hz_str_len;   // length of the Hanzi string
  float freq;                 // frequency of this lemma
};
153#endif  // ___BUILD_MODEL___
154
155}  //  namespace ime_pinyin
156
157#endif  // PINYINIME_INCLUDE_DICTDEF_H__
158