FormatSpec.java revision 1a347723c5ad4a71076df67f3af3b702db205719
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.latin.Constants;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;

22/**
23 * Dictionary File Format Specification.
24 */
25public final class FormatSpec {
26
27    /*
28     * Array of Node(FusionDictionary.Node) layout is as follows:
29     *
30     * g |
31     * r | the number of groups, 1 or 2 bytes.
32     * o | 1 byte = bbbbbbbb match
33     * u |   case 1xxxxxxx => xxxxxxx << 8 + next byte
34     * p |   otherwise => bbbbbbbb
35     * c |
36     * ount
37     *
38     * g |
39     * r | sequence of groups,
40     * o | the layout of each group is described below.
41     * u |
42     * ps
43     *
44     */
45
46    /* Node(CharGroup) layout is as follows:
47     *   | addressType                         xx     : mask with MASK_GROUP_ADDRESS_TYPE
48     *                                 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
49     * f |                                     01 = 1 byte      : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE
50     * l |                                     10 = 2 bytes     : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES
51     * a |                                     11 = 3 bytes     : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
52     * g | has several chars ?         1 bit, 1 = yes, 0 = no   : FLAG_HAS_MULTIPLE_CHARS
53     * s | has a terminal ?            1 bit, 1 = yes, 0 = no   : FLAG_IS_TERMINAL
54     *   | has shortcut targets ?      1 bit, 1 = yes, 0 = no   : FLAG_HAS_SHORTCUT_TARGETS
55     *   | has bigrams ?               1 bit, 1 = yes, 0 = no   : FLAG_HAS_BIGRAMS
56     *   | is not a word ?             1 bit, 1 = yes, 0 = no   : FLAG_IS_NOT_A_WORD
57     *   | is blacklisted ?            1 bit, 1 = yes, 0 = no   : FLAG_IS_BLACKLISTED
58     *
59     * p |
60     * a | IF HAS_PARENT_ADDRESS (defined in the file header)
61     * r |     parent address, 3byte
62     * e | the address must be negative, so the absolute value of the address is stored.
63     * n |
64     * taddress
65     *
66     * c | IF FLAG_HAS_MULTIPLE_CHARS
67     * h |   char, char, char, char    n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
68     * a |   end                       1 byte, = 0
69     * r | ELSE
70     * s |   char                      1 or 3 bytes
71     *   | END
72     *
73     * f |
74     * r | IF FLAG_IS_TERMINAL
75     * e |   frequency                 1 byte
76     * q |
77     *
78     * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType
79     * h |   // nothing
80     * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType
81     * l |   children address, 1 byte
82     * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType
83     * r |   children address, 2 bytes
84     * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType
85     * n |   children address, 3 bytes
86     * A | END
87     * d
88     * dress
89     *
90     *   | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
91     *   | shortcut string list
92     *   | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS
93     *   | bigrams address list
94     *
95     * Char format is:
96     * 1 byte = bbbbbbbb match
97     * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
98     * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
99     *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
100     *       00011111 would be outside unicode.
101     * else: iso-latin-1 code
102     * This allows for the whole unicode range to be encoded, including chars outside of
103     * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
104     * characters which should never happen anyway (and still work, but take 3 bytes).
105     *
106     * bigram address list is:
107     * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no     : FLAG_ATTRIBUTE_HAS_NEXT
108     *           | addressSign = 1 bit,                 : FLAG_ATTRIBUTE_OFFSET_NEGATIVE
109     *           |                      1 = must take -address, 0 = must take +address
110     *           |                         xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE
111     *           | addressFormat = 2 bits, 00 = unused  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
112     *           |                         01 = 1 byte  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
113     *           |                         10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES
114     *           |                         11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES
115     *           | 4 bits : frequency         : mask with FLAG_ATTRIBUTE_FREQUENCY
116     * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat)
117     *           |   read 1 byte, add top 4 bits
118     *           | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat)
119     *           |   read 2 bytes, add top 4 bits
120     *           | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat
121     *           |   read 3 bytes, add top 4 bits
122     *           | END
123     *           | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address
124     * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is
125     *
126     * shortcut string list is:
127     * <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes.
128     * <flags>     = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT
129     *               | reserved = 3 bits, must be 0
130     *               | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY
131     * <shortcut>  = | string of characters at the char format described above, with the terminator
132     *               | used to signal the end of the string.
133     * if (FLAG_ATTRIBUTE_HAS_NEXT goto flags
134     */
135
136    static final int VERSION_1_MAGIC_NUMBER = 0x78B1;
137    public static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
138    static final int MINIMUM_SUPPORTED_VERSION = 1;
139    static final int MAXIMUM_SUPPORTED_VERSION = 3;
140    static final int NOT_A_VERSION_NUMBER = -1;
141    static final int FIRST_VERSION_WITH_HEADER_SIZE = 2;
142    static final int FIRST_VERSION_WITH_PARENT_ADDRESS = 3;
143
144    // These options need to be the same numeric values as the one in the native reading code.
145    static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
146    static final int HAS_PARENT_ADDRESS = 0x2;
147    static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
148    static final int CONTAINS_BIGRAMS_FLAG = 0x8;
149
150    // TODO: Make this value adaptative to content data, store it in the header, and
151    // use it in the reading code.
152    static final int MAX_WORD_LENGTH = Constants.Dictionary.MAX_WORD_LENGTH;
153
154    static final int PARENT_ADDRESS_SIZE = 3;
155
156    static final int MASK_GROUP_ADDRESS_TYPE = 0xC0;
157    static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
158    static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
159    static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
160    static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
161
162    static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;
163
164    static final int FLAG_IS_TERMINAL = 0x10;
165    static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
166    static final int FLAG_HAS_BIGRAMS = 0x04;
167    static final int FLAG_IS_NOT_A_WORD = 0x02;
168    static final int FLAG_IS_BLACKLISTED = 0x01;
169
170    static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
171    static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
172    static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
173    static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
174    static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
175    static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
176    static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F;
177
178    static final int GROUP_CHARACTERS_TERMINATOR = 0x1F;
179
180    static final int GROUP_TERMINATOR_SIZE = 1;
181    static final int GROUP_FLAGS_SIZE = 1;
182    static final int GROUP_FREQUENCY_SIZE = 1;
183    static final int GROUP_MAX_ADDRESS_SIZE = 3;
184    static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1;
185    static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
186    static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2;
187
188    static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
189    static final int NO_PARENT_ADDRESS = 0;
190    static final int INVALID_CHARACTER = -1;
191
192    static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127
193    static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767
194
195    static final int MAX_TERMINAL_FREQUENCY = 255;
196    static final int MAX_BIGRAM_FREQUENCY = 15;
197
198    /**
199     * Options about file format.
200     */
201    public static class FormatOptions {
202        public final int mVersion;
203        public final boolean mHasParentAddress;
204        public FormatOptions(final int version) {
205            this(version, false);
206        }
207        public FormatOptions(final int version, final boolean hasParentAddress) {
208            mVersion = version;
209            if (version < FormatSpec.FIRST_VERSION_WITH_PARENT_ADDRESS && hasParentAddress) {
210                throw new RuntimeException("Parent addresses are only supported with versions "
211                        + FormatSpec.FIRST_VERSION_WITH_PARENT_ADDRESS + " and ulterior.");
212            }
213            mHasParentAddress = hasParentAddress;
214        }
215    }
216
217    /**
218     * Class representing file header.
219     */
220    static final class FileHeader {
221        public final int mHeaderSize;
222        public final DictionaryOptions mDictionaryOptions;
223        public final FormatOptions mFormatOptions;
224        public FileHeader(final int headerSize, final DictionaryOptions dictionaryOptions,
225                final FormatOptions formatOptions) {
226            mHeaderSize = headerSize;
227            mDictionaryOptions = dictionaryOptions;
228            mFormatOptions = formatOptions;
229        }
230    }
231
232    private FormatSpec() {
233        // This utility class is not publicly instantiable.
234    }
235}
236