1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin.makedict;
18
19import com.android.inputmethod.latin.Constants;
20import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
21
22/**
23 * Dictionary File Format Specification.
24 */
25public final class FormatSpec {
26
27    /*
28     * Array of Node(FusionDictionary.Node) layout is as follows:
29     *
30     * g |
31     * r | the number of groups, 1 or 2 bytes.
32     * o | 1 byte = bbbbbbbb match
33     * u |   case 1xxxxxxx => xxxxxxx << 8 + next byte
34     * p |   otherwise => bbbbbbbb
35     * c |
36     * ount
37     *
38     * g |
39     * r | sequence of groups,
40     * o | the layout of each group is described below.
41     * u |
42     * ps
43     *
44     * f |
45     * o | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
46     * r |     forward link address, 3byte
47     * w | 1 byte = bbbbbbbb match
48     * a |   case 1xxxxxxx => -((xxxxxxx << 16) + (next byte << 8) + next byte)
49     * r |   otherwise => (xxxxxxx << 16) + (next byte << 8) + next byte
50     * d |
51     * linkaddress
52     */
53
54    /* Node(CharGroup) layout is as follows:
55     *   | IF !SUPPORTS_DYNAMIC_UPDATE
56     *   |   addressType                         xx     : mask with MASK_GROUP_ADDRESS_TYPE
57     *   |                           2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
58     * f |                                   01 = 1 byte      : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE
59     * l |                                   10 = 2 bytes     : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES
60     * a |                                   11 = 3 bytes     : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
61     * g | ELSE
62     * s |   is moved ?              2 bits, 11 = no
63     *   |                                   01 = yes
64     *   |                        the new address is stored in the same place as the parent address
65     *   | has several chars ?         1 bit, 1 = yes, 0 = no   : FLAG_HAS_MULTIPLE_CHARS
66     *   | has a terminal ?            1 bit, 1 = yes, 0 = no   : FLAG_IS_TERMINAL
67     *   | has shortcut targets ?      1 bit, 1 = yes, 0 = no   : FLAG_HAS_SHORTCUT_TARGETS
68     *   | has bigrams ?               1 bit, 1 = yes, 0 = no   : FLAG_HAS_BIGRAMS
69     *   | is not a word ?             1 bit, 1 = yes, 0 = no   : FLAG_IS_NOT_A_WORD
70     *   | is blacklisted ?            1 bit, 1 = yes, 0 = no   : FLAG_IS_BLACKLISTED
71     *
72     * p |
73     * a | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
74     * r |     parent address, 3byte
75     * e | 1 byte = bbbbbbbb match
76     * n |   case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
77     * t |   otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte
78     * a |
79     * ddress
80     *
81     * c | IF FLAG_HAS_MULTIPLE_CHARS
82     * h |   char, char, char, char    n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
83     * a |   end                       1 byte, = 0
84     * r | ELSE
85     * s |   char                      1 or 3 bytes
86     *   | END
87     *
88     * f |
89     * r | IF FLAG_IS_TERMINAL
90     * e |   frequency                 1 byte
91     * q |
92     *
93     * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType
94     * h |   // nothing
95     * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType
96     * l |   children address, 1 byte
97     * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType
98     * r |   children address, 2 bytes
99     * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType
100     * n |   children address, 3 bytes
101     * A | END
102     * d
103     * dress
104     *
105     *   | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
106     *   | shortcut string list
107     *   | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS
108     *   | bigrams address list
109     *
110     * Char format is:
111     * 1 byte = bbbbbbbb match
112     * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
113     * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
114     *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
115     *       00011111 would be outside unicode.
116     * else: iso-latin-1 code
117     * This allows for the whole unicode range to be encoded, including chars outside of
118     * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
119     * characters which should never happen anyway (and still work, but take 3 bytes).
120     *
121     * bigram address list is:
122     * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no     : FLAG_ATTRIBUTE_HAS_NEXT
123     *           | addressSign = 1 bit,                 : FLAG_ATTRIBUTE_OFFSET_NEGATIVE
124     *           |                      1 = must take -address, 0 = must take +address
125     *           |                         xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE
126     *           | addressFormat = 2 bits, 00 = unused  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
127     *           |                         01 = 1 byte  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
128     *           |                         10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES
129     *           |                         11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES
130     *           | 4 bits : frequency         : mask with FLAG_ATTRIBUTE_FREQUENCY
131     * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat)
132     *           |   read 1 byte, add top 4 bits
133     *           | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat)
134     *           |   read 2 bytes, add top 4 bits
135     *           | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat
136     *           |   read 3 bytes, add top 4 bits
137     *           | END
138     *           | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address
139     * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is
140     *
141     * shortcut string list is:
142     * <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes.
143     * <flags>     = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT
144     *               | reserved = 3 bits, must be 0
145     *               | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY
146     * <shortcut>  = | string of characters at the char format described above, with the terminator
147     *               | used to signal the end of the string.
148     * if (FLAG_ATTRIBUTE_HAS_NEXT goto flags
149     */
150
151    static final int VERSION_1_MAGIC_NUMBER = 0x78B1;
152    public static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
153    static final int MINIMUM_SUPPORTED_VERSION = 1;
154    static final int MAXIMUM_SUPPORTED_VERSION = 3;
155    static final int NOT_A_VERSION_NUMBER = -1;
156    static final int FIRST_VERSION_WITH_HEADER_SIZE = 2;
157    static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3;
158
159    // These options need to be the same numeric values as the one in the native reading code.
160    static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
161    // TODO: Make the native reading code read this variable.
162    static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
163    static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
164    static final int CONTAINS_BIGRAMS_FLAG = 0x8;
165
166    // TODO: Make this value adaptative to content data, store it in the header, and
167    // use it in the reading code.
168    static final int MAX_WORD_LENGTH = Constants.Dictionary.MAX_WORD_LENGTH;
169
170    static final int PARENT_ADDRESS_SIZE = 3;
171    static final int FORWARD_LINK_ADDRESS_SIZE = 3;
172
173    static final int MASK_GROUP_ADDRESS_TYPE = 0xC0;
174    static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
175    static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
176    static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
177    static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
178
179    static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;
180
181    static final int FLAG_IS_TERMINAL = 0x10;
182    static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
183    static final int FLAG_HAS_BIGRAMS = 0x04;
184    static final int FLAG_IS_NOT_A_WORD = 0x02;
185    static final int FLAG_IS_BLACKLISTED = 0x01;
186    static final int FLAG_IS_MOVED = 0x40;
187
188    static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
189    static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
190    static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
191    static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
192    static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
193    static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
194    static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F;
195
196    static final int GROUP_CHARACTERS_TERMINATOR = 0x1F;
197
198    static final int GROUP_TERMINATOR_SIZE = 1;
199    static final int GROUP_FLAGS_SIZE = 1;
200    static final int GROUP_FREQUENCY_SIZE = 1;
201    static final int GROUP_MAX_ADDRESS_SIZE = 3;
202    static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1;
203    static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
204    static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2;
205
206    static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
207    static final int NO_PARENT_ADDRESS = 0;
208    static final int NO_FORWARD_LINK_ADDRESS = 0;
209    static final int INVALID_CHARACTER = -1;
210
211    static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127
212    static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767
213
214    static final int MAX_TERMINAL_FREQUENCY = 255;
215    static final int MAX_BIGRAM_FREQUENCY = 15;
216
217    // This option needs to be the same numeric value as the one in binary_format.h.
218    static final int NOT_VALID_WORD = -99;
219    static final int SIGNED_CHILDREN_ADDRESS_SIZE = 3;
220
221    /**
222     * Options about file format.
223     */
224    public static final class FormatOptions {
225        public final int mVersion;
226        public final boolean mSupportsDynamicUpdate;
227        public FormatOptions(final int version) {
228            this(version, false);
229        }
230        public FormatOptions(final int version, final boolean supportsDynamicUpdate) {
231            mVersion = version;
232            if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) {
233                throw new RuntimeException("Dynamic updates are only supported with versions "
234                        + FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior.");
235            }
236            mSupportsDynamicUpdate = supportsDynamicUpdate;
237        }
238    }
239
240    /**
241     * Class representing file header.
242     */
243    static final class FileHeader {
244        public final int mHeaderSize;
245        public final DictionaryOptions mDictionaryOptions;
246        public final FormatOptions mFormatOptions;
247        public FileHeader(final int headerSize, final DictionaryOptions dictionaryOptions,
248                final FormatOptions formatOptions) {
249            mHeaderSize = headerSize;
250            mDictionaryOptions = dictionaryOptions;
251            mFormatOptions = formatOptions;
252        }
253    }
254
255    private FormatSpec() {
256        // This utility class is not publicly instantiable.
257    }
258}
259