dictdef.h revision 7898d76cc005bbe1c5893a9f57439561e0771cc
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#ifndef PINYINIME_INCLUDE_DICTDEF_H__
187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#define PINYINIME_INCLUDE_DICTDEF_H__
197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <stdlib.h>
217898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include "./utf16char.h"
227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectnamespace ime_pinyin {
247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
// Enable the following line when building the binary dictionary model.
// #define ___BUILD_MODEL___
277898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
// Integer aliases named after their intended bit widths.
// NOTE(review): the names assume char/short/int/long long are 8/16/32/64
// bits wide; nothing in this header enforces that -- confirm for each
// target platform.
typedef unsigned char      uint8;
typedef unsigned short     uint16;
typedef unsigned int       uint32;

typedef signed char        int8;
typedef short              int16;
typedef int                int32;
typedef long long          int64;
typedef unsigned long long uint64;
377898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
// Debug-print switches. All three are turned off here.
// NOTE(review): the meaning of the individual levels 0/1/2 is not visible
// in this header -- check the code that reads them.
const bool kPrintDebug0 = false;
const bool kPrintDebug1 = false;
const bool kPrintDebug2 = false;
417898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
// The max length of a lemma.
const size_t kMaxLemmaSize = 8;

// The max length of a Pinyin (spelling).
const size_t kMaxPinyinSize = 6;

// The number of half spelling ids.
// See SpellingTrie.h for details.
// NOTE(review): this comment used to state that Chinese Pinyin has 30 half
// ids, while the value below is 29 -- confirm which one is intended (the
// difference may be the unused id 0).
const size_t kHalfSpellingIdNum = 29;
517898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
527898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// The maximum number of full spellings. For Chinese Pinyin, there are only
537898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// about 410 spellings.
547898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// If change this value is bigger(needs more bits), please also update
557898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// other structures like SpellingNode, to make sure than a spelling id can be
567898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// stored.
577898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// -1 is because that 0 is never used.
587898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectconst size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1;
597898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectconst size_t kMaxSearchSteps = 40;
607898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
617898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// One character predicts its following characters.
627898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectconst size_t kMaxPredictSize = (kMaxLemmaSize - 1);
637898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
// LemmaIdType must always be size_t.
typedef size_t LemmaIdType;

// An id actually occupies 3 bytes in storage.
const size_t kLemmaIdSize = 3;

// 0xffffff is the largest value that fits in kLemmaIdSize (3) bytes.
// NOTE(review): by its name this id marks a composing (not yet stored)
// lemma -- confirm against the code that uses it.
const size_t kLemmaIdComposing = 0xffffff;
687898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
697898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projecttypedef uint16 LmaScoreType;
707898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projecttypedef uint16 KeyScoreType;
717898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
// The number of highest-scoring items that are kept for prediction purposes.
const size_t kTopScoreLemmaNum = 10;

// NOTE(review): undocumented in the original; by their names these cap the
// number of predictions made from a history of more than 3, exactly 3 and
// exactly 2 characters respectively -- confirm against the prediction code.
const size_t kMaxPredictNumByGt3 = 1;
const size_t kMaxPredictNumBy3 = 2;
const size_t kMaxPredictNumBy2 = 2;
787898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
797898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// The last lemma id (included) for the system dictionary. The system
807898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// dictionary's ids always start from 1.
817898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectconst LemmaIdType kSysDictIdEnd = 500000;
827898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
837898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// The first lemma id for the user dictionary.
847898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectconst LemmaIdType kUserDictIdStart = 500001;
857898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
867898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project// The last lemma id (included) for the user dictionary.
877898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectconst LemmaIdType kUserDictIdEnd = 600000;
887898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
897898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projecttypedef struct {
907898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  uint16 half_splid:5;
917898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  uint16 full_splid:11;
927898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project} SpellingId, *PSpellingId;
937898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
947898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
957898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project/**
967898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * We use different node types for different layers
977898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * Statistical data of the building result for a testing dictionary:
987898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project *                              root,   level 0,   level 1,   level 2,   level 3
997898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * max son num of one node:     406        280         41          2          -
1007898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * max homo num of one node:      0         90         23          2          2
1017898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * total node num of a layer:     1        406      31766      13516        993
1027898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * total homo num of a layer:     9       5674      44609      12667        995
1037898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project *
1047898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * The node number for root and level 0 won't be larger than 500
1057898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * According to the information above, two kinds of nodes can be used; one for
1067898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * root and level 0, the other for these layers deeper than 0.
1077898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project *
1087898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * LE = less and equal,
1097898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * A node occupies 16 bytes. so, totallly less than 16 * 500 = 8K
1107898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project */
1117898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectstruct LmaNodeLE0 {
1127898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  size_t son_1st_off;
1137898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  size_t homo_idx_buf_off;
1147898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  uint16 spl_idx;
1157898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  uint16 num_of_son;
1167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  uint16 num_of_homo;
1177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project};
1187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project/**
1207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * GE = great and equal
1217898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project * A node occupies 8 bytes.
1227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project */
1237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectstruct LmaNodeGE1 {
1247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  uint16 son_1st_off_l;        // Low bits of the son_1st_off
1257898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  uint16 homo_idx_buf_off_l;   // Low bits of the homo_idx_buf_off_1
1267898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  uint16 spl_idx;
1277898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  unsigned char num_of_son;            // number of son nodes
1287898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  unsigned char num_of_homo;           // number of homo words
1297898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  unsigned char son_1st_off_h;         // high bits of the son_1st_off
1307898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  unsigned char homo_idx_buf_off_h;    // high bits of the homo_idx_buf_off
1317898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project};
1327898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
// The types below are compiled only when ___BUILD_MODEL___ is defined, i.e.
// when building the binary dictionary model.
#ifdef ___BUILD_MODEL___
// A single character (hz) together with its frequency and spelling id.
struct SingleCharItem {
  float freq;
  char16 hz;
  SpellingId splid;
};

// One lemma: its characters, the spelling of each character, its length and
// its frequency, plus two indices (by Pinyin and by Hanzi).
struct LemmaEntry {
  LemmaIdType idx_by_py;
  LemmaIdType idx_by_hz;
  // Holds up to kMaxLemmaSize characters plus one extra element
  // (presumably a terminator -- confirm against the builder code).
  char16 hanzi_str[kMaxLemmaSize + 1];

  // The SingleCharItem id for each Hanzi.
  uint16 hanzi_scis_ids[kMaxLemmaSize];

  uint16 spl_idx_arr[kMaxLemmaSize + 1];
  // One spelling string per character; each has room for kMaxPinyinSize
  // chars plus one extra char.
  char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];
  unsigned char hz_str_len;
  float freq;
};
#endif  // ___BUILD_MODEL___
1547898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1557898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}  //  namespace ime_pinyin
1567898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
1577898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#endif  // PINYINIME_INCLUDE_DICTDEF_H__
158