FormatSpec.java revision 7ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122
181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada/*
281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada * Copyright (C) 2012 The Android Open Source Project
381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada *
48aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * Licensed under the Apache License, Version 2.0 (the "License");
58aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * you may not use this file except in compliance with the License.
68aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * You may obtain a copy of the License at
781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada *
88aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka *      http://www.apache.org/licenses/LICENSE-2.0
981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada *
1081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada * Unless required by applicable law or agreed to in writing, software
118aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * distributed under the License is distributed on an "AS IS" BASIS,
128aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
138aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * See the License for the specific language governing permissions and
148aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * limitations under the License.
1581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada */
1681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
1781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanadapackage com.android.inputmethod.latin.makedict;
1881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
197d1ae52ded0deca6b2674df0273ec852ad36319fYuichiro Hanadaimport com.android.inputmethod.annotations.UsedForTesting;
2081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanadaimport com.android.inputmethod.latin.Constants;
211a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanadaimport com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
2281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
2381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada/**
2481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada * Dictionary File Format Specification.
2581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada */
2681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanadapublic final class FormatSpec {
2781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
2881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    /*
297ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * File header layout is as follows:
307ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     *
317ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * v |
327ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * e | MAGIC_NUMBER + version of the file format, 2 bytes.
337ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * r |
347ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * sion
357ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     *
367ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * o |
377ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * p | not used                                4 bits
387ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * t | has bigrams ?                           1 bit, 1 = yes, 0 = no : CONTAINS_BIGRAMS_FLAG
397ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * i | FRENCH_LIGATURE_PROCESSING_FLAG
407ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * o | supports dynamic updates ?              1 bit, 1 = yes, 0 = no : SUPPORTS_DYNAMIC_UPDATE
417ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * n | GERMAN_UMLAUT_PROCESSING_FLAG
427ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * f |
437ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * lags
447ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     *
457ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * h |
467ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * e | size of the file header, 4bytes
477ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * a |   including the size of the magic number, the option flags and the header size
487ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * d |
497ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * ersize
507ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     *
517ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     *   | attributes list
527ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     *
537ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * attributes list is:
547ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * <key>   = | string of characters at the char format described below, with the terminator used
557ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     *           | to signal the end of the string.
567ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * <value> = | string of characters at the char format described below, with the terminator used
577ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     *           | to signal the end of the string.
587ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     * if the size of already read < headersize, goto key.
597ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     *
607ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada     */
617ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada
627ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada    /*
6381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * Array of Node(FusionDictionary.Node) layout is as follows:
6481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
6581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * g |
6681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * r | the number of groups, 1 or 2 bytes.
6781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * o | 1 byte = bbbbbbbb match
     * u |   case 1xxxxxxx => (xxxxxxx << 8) + next byte
6981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * p |   otherwise => bbbbbbbb
7081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * c |
7181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * ount
7281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
7381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * g |
7481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * r | sequence of groups,
7581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * o | the layout of each group is described below.
7681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * u |
7781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * ps
7881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
79061d225fb1d110695b396a470d9ae6a9a3331003Yuichiro Hanada     * f |
808ec0064c49e80945dbe1bb31129eb890478b7e06Yuichiro Hanada     * o | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
     * r |     forward link address, 3 bytes
     * w | 1 byte = bbbbbbbb match
     * a |   case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
     * r |   otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte
858ec0064c49e80945dbe1bb31129eb890478b7e06Yuichiro Hanada     * d |
868ec0064c49e80945dbe1bb31129eb890478b7e06Yuichiro Hanada     * linkaddress
8781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     */
8881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
8981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    /* Node(CharGroup) layout is as follows:
902ee70804e92b17016a2f042c4f6b0e94b5d23e88Yuichiro Hanada     *   | IF !SUPPORTS_DYNAMIC_UPDATE
912ee70804e92b17016a2f042c4f6b0e94b5d23e88Yuichiro Hanada     *   |   addressType                         xx     : mask with MASK_GROUP_ADDRESS_TYPE
922ee70804e92b17016a2f042c4f6b0e94b5d23e88Yuichiro Hanada     *   |                           2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
932ee70804e92b17016a2f042c4f6b0e94b5d23e88Yuichiro Hanada     * f |                                   01 = 1 byte      : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE
942ee70804e92b17016a2f042c4f6b0e94b5d23e88Yuichiro Hanada     * l |                                   10 = 2 bytes     : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES
952ee70804e92b17016a2f042c4f6b0e94b5d23e88Yuichiro Hanada     * a |                                   11 = 3 bytes     : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
962ee70804e92b17016a2f042c4f6b0e94b5d23e88Yuichiro Hanada     * g | ELSE
974ad4ff618f5102148d73e3c04d809942bcf16f86Yuichiro Hanada     * s |   is moved ?              2 bits, 11 = no          : FLAG_IS_NOT_MOVED
984ad4ff618f5102148d73e3c04d809942bcf16f86Yuichiro Hanada     *   |                              This must be the same as FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
994ad4ff618f5102148d73e3c04d809942bcf16f86Yuichiro Hanada     *   |                                   01 = yes         : FLAG_IS_MOVED
1002ee70804e92b17016a2f042c4f6b0e94b5d23e88Yuichiro Hanada     *   |                        the new address is stored in the same place as the parent address
101a853356b82e2dc74962243e3143c0ff7a33f3c20Yuichiro Hanada     *   |   is deleted?                     10 = yes         : FLAG_IS_DELETED
1022ee70804e92b17016a2f042c4f6b0e94b5d23e88Yuichiro Hanada     *   | has several chars ?         1 bit, 1 = yes, 0 = no   : FLAG_HAS_MULTIPLE_CHARS
1032ee70804e92b17016a2f042c4f6b0e94b5d23e88Yuichiro Hanada     *   | has a terminal ?            1 bit, 1 = yes, 0 = no   : FLAG_IS_TERMINAL
10481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *   | has shortcut targets ?      1 bit, 1 = yes, 0 = no   : FLAG_HAS_SHORTCUT_TARGETS
10581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *   | has bigrams ?               1 bit, 1 = yes, 0 = no   : FLAG_HAS_BIGRAMS
10681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *   | is not a word ?             1 bit, 1 = yes, 0 = no   : FLAG_IS_NOT_A_WORD
10781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *   | is blacklisted ?            1 bit, 1 = yes, 0 = no   : FLAG_IS_BLACKLISTED
10881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
10981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * p |
1108ec0064c49e80945dbe1bb31129eb890478b7e06Yuichiro Hanada     * a | IF SUPPORTS_DYNAMIC_UPDATE (defined in the file header)
11181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * r |     parent address, 3byte
1128ec0064c49e80945dbe1bb31129eb890478b7e06Yuichiro Hanada     * e | 1 byte = bbbbbbbb match
1138ec0064c49e80945dbe1bb31129eb890478b7e06Yuichiro Hanada     * n |   case 1xxxxxxx => -((0xxxxxxx << 16) + (next byte << 8) + next byte)
1148ec0064c49e80945dbe1bb31129eb890478b7e06Yuichiro Hanada     * t |   otherwise => (bbbbbbbb << 16) + (next byte << 8) + next byte
1158ec0064c49e80945dbe1bb31129eb890478b7e06Yuichiro Hanada     * a |
1168ec0064c49e80945dbe1bb31129eb890478b7e06Yuichiro Hanada     * ddress
11781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
11881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * c | IF FLAG_HAS_MULTIPLE_CHARS
11981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * h |   char, char, char, char    n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
12081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * a |   end                       1 byte, = 0
12181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * r | ELSE
12281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * s |   char                      1 or 3 bytes
12381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *   | END
12481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
12581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * f |
12681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * r | IF FLAG_IS_TERMINAL
12781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * e |   frequency                 1 byte
12881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * q |
12981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
     * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == addressType
     * h |   // nothing
     * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType
     * l |   children address, 1 byte
     * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType
     * r |   children address, 2 bytes
     * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES == addressType
13781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * n |   children address, 3 bytes
13881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * A | END
13981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * d
14081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * dress
14181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
14281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *   | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
14381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *   | shortcut string list
14481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *   | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS
14581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *   | bigrams address list
14681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
14781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * Char format is:
14881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * 1 byte = bbbbbbbb match
     * case 000xxxxx: (xxxxx << 16) + (next byte << 8) + next byte
15081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
15181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
15281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *       00011111 would be outside unicode.
15381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * else: iso-latin-1 code
15481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * This allows for the whole unicode range to be encoded, including chars outside of
15581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
15681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * characters which should never happen anyway (and still work, but take 3 bytes).
15781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
15881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * bigram address list is:
15981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no     : FLAG_ATTRIBUTE_HAS_NEXT
16081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           | addressSign = 1 bit,                 : FLAG_ATTRIBUTE_OFFSET_NEGATIVE
16181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           |                      1 = must take -address, 0 = must take +address
16281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           |                         xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE
     *           | addressFormat = 2 bits, 00 = unused  : (no constant defined)
16481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           |                         01 = 1 byte  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
16581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           |                         10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES
16681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           |                         11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES
16781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           | 4 bits : frequency         : mask with FLAG_ATTRIBUTE_FREQUENCY
16881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat)
16981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           |   read 1 byte, add top 4 bits
17081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat)
17181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           |   read 2 bytes, add top 4 bits
17281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat
17381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           |   read 3 bytes, add top 4 bits
17481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           | END
17581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *           | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address
     * if (FLAG_ATTRIBUTE_HAS_NEXT) goto flags
17781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *
17881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * shortcut string list is:
17981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes.
18081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * <flags>     = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT
18181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *               | reserved = 3 bits, must be 0
18281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *               | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY
18381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     * <shortcut>  = | string of characters at the char format described above, with the terminator
18481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     *               | used to signal the end of the string.
     * if (FLAG_ATTRIBUTE_HAS_NEXT) goto flags
18681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada     */
18781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
1887ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada    public static final int MAGIC_NUMBER = 0x9BC13AFE;
1897ec9db2c34ee6bec2cbff6cf05cee9bf3c2f7122Yuichiro Hanada    static final int MINIMUM_SUPPORTED_VERSION = 2;
19081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int MAXIMUM_SUPPORTED_VERSION = 3;
19181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int NOT_A_VERSION_NUMBER = -1;
19282d9deaaf252cd20f8918adbc7a4b9b8f2647c38Yuichiro Hanada    static final int FIRST_VERSION_WITH_DYNAMIC_UPDATE = 3;
19381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
19481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    // These options need to be the same numeric values as the one in the native reading code.
19581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
196b686df15fcc95611c524318359fe9ecb4fd6f74cYuichiro Hanada    // TODO: Make the native reading code read this variable.
19782d9deaaf252cd20f8918adbc7a4b9b8f2647c38Yuichiro Hanada    static final int SUPPORTS_DYNAMIC_UPDATE = 0x2;
19881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
19981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int CONTAINS_BIGRAMS_FLAG = 0x8;
20081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
20181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    // TODO: Make this value adaptative to content data, store it in the header, and
20281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    // use it in the reading code.
203ffcbbaf12788a9fc9398607a548e552d7d2bf05eSatoshi Kataoka    static final int MAX_WORD_LENGTH = Constants.DICTIONARY_MAX_WORD_LENGTH;
20481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
20581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int PARENT_ADDRESS_SIZE = 3;
206061d225fb1d110695b396a470d9ae6a9a3331003Yuichiro Hanada    static final int FORWARD_LINK_ADDRESS_SIZE = 3;
20781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
2084ad4ff618f5102148d73e3c04d809942bcf16f86Yuichiro Hanada    // These flags are used only in the static dictionary.
20981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int MASK_GROUP_ADDRESS_TYPE = 0xC0;
21081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
21181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
21281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
21381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
21481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
21581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;
21681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
21781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_IS_TERMINAL = 0x10;
21881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
21981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_HAS_BIGRAMS = 0x04;
22081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_IS_NOT_A_WORD = 0x02;
22181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_IS_BLACKLISTED = 0x01;
2224ad4ff618f5102148d73e3c04d809942bcf16f86Yuichiro Hanada
2234ad4ff618f5102148d73e3c04d809942bcf16f86Yuichiro Hanada    // These flags are used only in the dynamic dictionary.
224c3a98ca306d5d6a3dfce3585b73f7431dbf90bfcYuichiro Hanada    static final int MASK_MOVE_AND_DELETE_FLAG = 0xC0;
2254ad4ff618f5102148d73e3c04d809942bcf16f86Yuichiro Hanada    static final int FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE = 0x40;
2264ad4ff618f5102148d73e3c04d809942bcf16f86Yuichiro Hanada    static final int FLAG_IS_MOVED = 0x00 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE;
2274ad4ff618f5102148d73e3c04d809942bcf16f86Yuichiro Hanada    static final int FLAG_IS_NOT_MOVED = 0x80 | FIXED_BIT_OF_DYNAMIC_UPDATE_MOVE;
228a853356b82e2dc74962243e3143c0ff7a33f3c20Yuichiro Hanada    static final int FLAG_IS_DELETED = 0x80;
22981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
23081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
23181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
23281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
23381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
23481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
23581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
23681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F;
23781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
23881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int GROUP_CHARACTERS_TERMINATOR = 0x1F;
23981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
24081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int GROUP_TERMINATOR_SIZE = 1;
24181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int GROUP_FLAGS_SIZE = 1;
24281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int GROUP_FREQUENCY_SIZE = 1;
24381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int GROUP_MAX_ADDRESS_SIZE = 3;
24481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1;
24581d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
24681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2;
24781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
24881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
24981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int NO_PARENT_ADDRESS = 0;
250061d225fb1d110695b396a470d9ae6a9a3331003Yuichiro Hanada    static final int NO_FORWARD_LINK_ADDRESS = 0;
25181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int INVALID_CHARACTER = -1;
25281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
25381d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127
25481d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767
2557223cc2ef1d7fd4ad4ab62166114b36ce7313c55Yuichiro Hanada    static final int MAX_BIGRAMS_IN_A_GROUP = 10000;
25681d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
25781d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int MAX_TERMINAL_FREQUENCY = 255;
25881d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    static final int MAX_BIGRAM_FREQUENCY = 15;
25981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada
26047cac57e4593f47e753410e4199e84e458d6de6fJean Chalard    public static final int SHORTCUT_WHITELIST_FREQUENCY = 15;
26147cac57e4593f47e753410e4199e84e458d6de6fJean Chalard
262d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada    // This option needs to be the same numeric value as the one in binary_format.h.
263d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada    static final int NOT_VALID_WORD = -99;
2648ec0064c49e80945dbe1bb31129eb890478b7e06Yuichiro Hanada    static final int SIGNED_CHILDREN_ADDRESS_SIZE = 3;
265d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada
2661a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada    /**
2671a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada     * Options about file format.
2681a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada     */
269a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka    public static final class FormatOptions {
2701a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada        public final int mVersion;
27182d9deaaf252cd20f8918adbc7a4b9b8f2647c38Yuichiro Hanada        public final boolean mSupportsDynamicUpdate;
2727d1ae52ded0deca6b2674df0273ec852ad36319fYuichiro Hanada        @UsedForTesting
2731a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada        public FormatOptions(final int version) {
2741a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada            this(version, false);
2751a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada        }
2767d1ae52ded0deca6b2674df0273ec852ad36319fYuichiro Hanada
2777d1ae52ded0deca6b2674df0273ec852ad36319fYuichiro Hanada        @UsedForTesting
27882d9deaaf252cd20f8918adbc7a4b9b8f2647c38Yuichiro Hanada        public FormatOptions(final int version, final boolean supportsDynamicUpdate) {
2791a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada            mVersion = version;
28082d9deaaf252cd20f8918adbc7a4b9b8f2647c38Yuichiro Hanada            if (version < FIRST_VERSION_WITH_DYNAMIC_UPDATE && supportsDynamicUpdate) {
28182d9deaaf252cd20f8918adbc7a4b9b8f2647c38Yuichiro Hanada                throw new RuntimeException("Dynamic updates are only supported with versions "
28282d9deaaf252cd20f8918adbc7a4b9b8f2647c38Yuichiro Hanada                        + FIRST_VERSION_WITH_DYNAMIC_UPDATE + " and ulterior.");
283061d225fb1d110695b396a470d9ae6a9a3331003Yuichiro Hanada            }
28482d9deaaf252cd20f8918adbc7a4b9b8f2647c38Yuichiro Hanada            mSupportsDynamicUpdate = supportsDynamicUpdate;
2851a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada        }
2861a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada    }
2871a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada
2881a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada    /**
2891a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada     * Class representing file header.
2901a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada     */
291fbc5e9b33469f989a745003c8e242c3f002ed165Jean Chalard    public static final class FileHeader {
2921a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada        public final int mHeaderSize;
2931a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada        public final DictionaryOptions mDictionaryOptions;
2941a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada        public final FormatOptions mFormatOptions;
295af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard        private static final String DICTIONARY_VERSION_ATTRIBUTE = "version";
296af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard        private static final String DICTIONARY_LOCALE_ATTRIBUTE = "locale";
2972521edec09373b2810093462c89221a2aca9e369Jean Chalard        private static final String DICTIONARY_ID_ATTRIBUTE = "dictionary";
298c6799ffeab17d3e0dc54a1718dad9890e5493ae0Jean Chalard        private static final String DICTIONARY_DESCRIPTION_ATTRIBUTE = "description";
2991a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada        public FileHeader(final int headerSize, final DictionaryOptions dictionaryOptions,
3001a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada                final FormatOptions formatOptions) {
3011a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada            mHeaderSize = headerSize;
3021a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada            mDictionaryOptions = dictionaryOptions;
3031a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada            mFormatOptions = formatOptions;
3041a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada        }
305af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard
306af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard        // Helper method to get the locale as a String
307af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard        public String getLocaleString() {
308af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard            return mDictionaryOptions.mAttributes.get(FileHeader.DICTIONARY_LOCALE_ATTRIBUTE);
309af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard        }
310af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard
311af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard        // Helper method to get the version String
312af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard        public String getVersion() {
313af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard            return mDictionaryOptions.mAttributes.get(FileHeader.DICTIONARY_VERSION_ATTRIBUTE);
314af4a7e8c4b2a41e9be48965133ab489cc9484764Jean Chalard        }
3152521edec09373b2810093462c89221a2aca9e369Jean Chalard
3162521edec09373b2810093462c89221a2aca9e369Jean Chalard        // Helper method to get the dictionary ID as a String
3172521edec09373b2810093462c89221a2aca9e369Jean Chalard        public String getId() {
3182521edec09373b2810093462c89221a2aca9e369Jean Chalard            return mDictionaryOptions.mAttributes.get(FileHeader.DICTIONARY_ID_ATTRIBUTE);
3192521edec09373b2810093462c89221a2aca9e369Jean Chalard        }
320c6799ffeab17d3e0dc54a1718dad9890e5493ae0Jean Chalard
321c6799ffeab17d3e0dc54a1718dad9890e5493ae0Jean Chalard        // Helper method to get the description
322c6799ffeab17d3e0dc54a1718dad9890e5493ae0Jean Chalard        public String getDescription() {
323c6799ffeab17d3e0dc54a1718dad9890e5493ae0Jean Chalard            // TODO: Right now each dictionary file comes with a description in its own language.
324c6799ffeab17d3e0dc54a1718dad9890e5493ae0Jean Chalard            // It will display as is no matter the device's locale. It should be internationalized.
325c6799ffeab17d3e0dc54a1718dad9890e5493ae0Jean Chalard            return mDictionaryOptions.mAttributes.get(FileHeader.DICTIONARY_DESCRIPTION_ATTRIBUTE);
326c6799ffeab17d3e0dc54a1718dad9890e5493ae0Jean Chalard        }
3271a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada    }
3281a347723c5ad4a71076df67f3af3b702db205719Yuichiro Hanada
32981d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    private FormatSpec() {
33081d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada        // This utility class is not publicly instantiable.
33181d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada    }
33281d97eec0e77e72cce606f9c9f96091c0b348190Yuichiro Hanada}
333