/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.latin.makedict;

import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.define.DecoderSpecificConstants;

import java.util.Date;
import java.util.HashMap;

/**
 * Dictionary File Format Specification.
 *
 * <p>Holds the constants (magic number, version numbers, flag bits, field sizes and limits)
 * that describe the binary dictionary layout documented in the comments below, together with
 * two small immutable option holders, {@link FormatOptions} and {@link DictionaryOptions}.
 */
public final class FormatSpec {

    /*
     * File header layout is as follows:
     *
     * v |
     * e | MAGIC_NUMBER + version of the file format, 2 bytes.
     * r |
     * sion
     *
     * o |
     * p | not used, 2 bytes.
     * o |
     * nflags
     *
     * h |
     * e | size of the file header, 4 bytes
     * a |   including the size of the magic number, the option flags and the header size
     * d |
     * ersize
     *
     * attributes list
     *
     * attributes list is:
     * <key>   = | string of characters at the char format described below, with the terminator
     *           | used to signal the end of the string.
     * <value> = | string of characters at the char format described below, with the terminator
     *           | used to signal the end of the string.
     * if the size of already read < headersize, goto key.
     *
     */

    /*
     * Node array (FusionDictionary.PtNodeArray) layout is as follows:
     *
     * n |
     * o | the number of PtNodes, 1 or 2 bytes.
     * d | 1 byte = bbbbbbbb match
     * e |   case 1xxxxxxx => xxxxxxx << 8 + next byte
     * c |   otherwise => bbbbbbbb
     * o |
     * unt
     *
     * n |
     * o | sequence of PtNodes,
     * d | the layout of each PtNode is described below.
     * e |
     * s
     *
     * f |
     * o | forward link address, 3 bytes
     * r | 1 byte = bbbbbbbb match
     * w |   case 1xxxxxxx => -((xxxxxxx << 16) + (next byte << 8) + next byte)
     * a |   otherwise => (xxxxxxx << 16) + (next byte << 8) + next byte
     * r |
     * dlinkaddress
     */

    /* Node (FusionDictionary.PtNode) layout is as follows:
     *   | CHILDREN_ADDRESS_TYPE  2 bits, 11 : FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES
     *   |                                10 : FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES
     * f |                                01 : FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE
     * l |                                00 : FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS
     * a | has several chars ?         1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS
     * g | has a terminal ?            1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL
     * s | has shortcut targets ?      1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS
     *   | has bigrams ?               1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS
     *   | is not a word ?             1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD
     *   | is possibly offensive ?     1 bit, 1 = yes, 0 = no : FLAG_IS_POSSIBLY_OFFENSIVE
     *
     * c | IF FLAG_HAS_MULTIPLE_CHARS
     * h |   char, char, char, char    n * (1 or 3 bytes) : use PtNodeInfo for i/o helpers
     * a |   end                       1 byte, = 0
     * r | ELSE
     * s |   char                      1 or 3 bytes
     *   | END
     *
     * f |
     * r | IF FLAG_IS_TERMINAL
     * e |   frequency                 1 byte
     * q |
     *
     * c |
     * h | children address, CHILDREN_ADDRESS_TYPE bytes
     * i | This address is relative to the position of this field.
     * l |
     * drenaddress
     *
     *   | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
     *   | shortcut string list
     *   | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS
     *   | bigrams address list
     *
     * Char format is:
     * 1 byte = bbbbbbbb match
     * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
     * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
     *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
     *       00011111 would be outside unicode.
     * else: iso-latin-1 code
     * This allows for the whole unicode range to be encoded, including chars outside of
     * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
     * characters which should never happen anyway (and still work, but take 3 bytes).
     *
     * bigram address list is:
     * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no     : FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT
     *           | addressSign = 1 bit,                 : FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE
     *           |                      1 = must take -address, 0 = must take +address
     *           |                         xx : mask with MASK_BIGRAM_ATTR_ADDRESS_TYPE
     *           | addressFormat = 2 bits, 00 = unused
     *           |                         01 = 1 byte  : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE
     *           |                         10 = 2 bytes : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES
     *           |                         11 = 3 bytes : FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES
     *           | 4 bits : frequency         : mask with FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY
     * <address> | IF (01 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE == addressFormat)
     *           |   read 1 byte, add top 4 bits
     *           | ELSIF (10 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES == addressFormat)
     *           |   read 2 bytes, add top 4 bits
     *           | ELSE // 11 == FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES == addressFormat
     *           |   read 3 bytes, add top 4 bits
     *           | END
     *           | if (FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE) then address = -address
     * if (FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) goto flags
     *
     * shortcut string list is:
     * <byte size> = PTNODE_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes.
     * <flags>     = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT
     *               | reserved = 3 bits, must be 0
     *               | 4 bits : frequency : mask with FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY
     * <shortcut>  = | string of characters at the char format described above, with the
     *               | terminator used to signal the end of the string.
     * if (FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT) goto flags
     */

    public static final int MAGIC_NUMBER = 0x9BC13AFE;
    static final int NOT_A_VERSION_NUMBER = -1;

    // These MUST have the same values as the relevant constants in format_utils.h.
    // From version 2.01 on, we use version * 100 + revision as a version number. That allows
    // us to change the format during development while having testing devices remove
    // older files with each upgrade, while still having a readable versioning scheme.
    // When we bump up the dictionary format version, we should update
    // ExpandableDictionary.needsToMigrateDictionary() and
    // ExpandableDictionary.matchesExpectedBinaryDictFormatVersionForThisType().
    public static final int VERSION2 = 2;
    public static final int VERSION201 = 201;
    public static final int VERSION202 = 202;
    // format version for Fava Dictionaries.
    public static final int VERSION_DELIGHT3 = 86736212;
    public static final int VERSION402 = 402;
    public static final int VERSION403 = 403;
    public static final int VERSION4 = VERSION403;
    public static final int MINIMUM_SUPPORTED_STATIC_VERSION = VERSION202;
    public static final int MAXIMUM_SUPPORTED_STATIC_VERSION = VERSION_DELIGHT3;
    static final int MINIMUM_SUPPORTED_DYNAMIC_VERSION = VERSION4;
    static final int MAXIMUM_SUPPORTED_DYNAMIC_VERSION = VERSION403;

    // TODO: Make this value adaptive to content data, store it in the header, and
    // use it in the reading code.
    static final int MAX_WORD_LENGTH = DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH;

    // These flags are used only in the static dictionary.
    static final int MASK_CHILDREN_ADDRESS_TYPE = 0xC0;
    static final int FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS = 0x00;
    static final int FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE = 0x40;
    static final int FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES = 0x80;
    static final int FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES = 0xC0;

    static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;

    static final int FLAG_IS_TERMINAL = 0x10;
    static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
    static final int FLAG_HAS_BIGRAMS = 0x04;
    static final int FLAG_IS_NOT_A_WORD = 0x02;
    static final int FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;

    static final int FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT = 0x80;
    static final int FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE = 0x40;
    static final int MASK_BIGRAM_ATTR_ADDRESS_TYPE = 0x30;
    static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE = 0x10;
    static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES = 0x20;
    static final int FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES = 0x30;
    static final int FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY = 0x0F;

    static final int PTNODE_CHARACTERS_TERMINATOR = 0x1F;

    static final int PTNODE_TERMINATOR_SIZE = 1;
    static final int PTNODE_FLAGS_SIZE = 1;
    static final int PTNODE_FREQUENCY_SIZE = 1;
    static final int PTNODE_MAX_ADDRESS_SIZE = 3;
    static final int PTNODE_ATTRIBUTE_FLAGS_SIZE = 1;
    static final int PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
    static final int PTNODE_SHORTCUT_LIST_SIZE_SIZE = 2;

    static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
    static final int INVALID_CHARACTER = -1;

    static final int MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT = 0x7F; // 127
    // Large PtNode array size field size is 2 bytes.
    static final int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG = 0x8000;
    static final int MAX_PTNODES_IN_A_PT_NODE_ARRAY = 0x7FFF; // 32767
    static final int MAX_BIGRAMS_IN_A_PTNODE = 10000;
    static final int MAX_SHORTCUT_LIST_SIZE_IN_A_PTNODE = 0xFFFF;

    static final int MAX_TERMINAL_FREQUENCY = 255;
    static final int MAX_BIGRAM_FREQUENCY = 15;

    public static final int SHORTCUT_WHITELIST_FREQUENCY = 15;

    // This option needs to be the same numeric value as the one in binary_format.h.
    static final int NOT_VALID_WORD = -99;

    static final int UINT8_MAX = 0xFF;
    static final int UINT16_MAX = 0xFFFF;
    static final int UINT24_MAX = 0xFFFFFF;
    static final int MSB8 = 0x80;
    static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
    static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;

    /**
     * Options about file format.
     */
    public static final class FormatOptions {
        public final int mVersion;
        public final boolean mHasTimestamp;

        /**
         * Creates options for the given format version, without timestamps.
         *
         * @param version the format version number
         */
        @UsedForTesting
        public FormatOptions(final int version) {
            this(version, false /* hasTimestamp */);
        }

        /**
         * Creates options for the given format version.
         *
         * @param version the format version number
         * @param hasTimestamp the value stored in {@link #mHasTimestamp}
         */
        public FormatOptions(final int version, final boolean hasTimestamp) {
            mVersion = version;
            mHasTimestamp = hasTimestamp;
        }
    }

    /**
     * Options global to the dictionary.
     */
    public static final class DictionaryOptions {
        public final HashMap<String, String> mAttributes;

        /**
         * Wraps the given attribute map. The map is stored as is, not copied.
         *
         * @param attributes the attribute key/value pairs
         */
        public DictionaryOptions(final HashMap<String, String> attributes) {
            mAttributes = attributes;
        }

        @Override
        public String toString() { // Convenience method
            return toString(0, false);
        }

        /**
         * Renders the attributes as one {@code key = value} line per attribute.
         *
         * <p>The {@code "date"} attribute is rendered through {@link Date#toString()} unless
         * {@code plumbing} is set; if its value is not a parsable number it is printed raw
         * instead of throwing.
         *
         * @param indentCount the number of spaces to prefix each line with; ignored when
         *        {@code plumbing} is true
         * @param plumbing when true, each line is prefixed with {@code "H:"} and the date is
         *        left in its stored form
         * @return the rendered attributes, each line terminated by a newline
         */
        public String toString(final int indentCount, final boolean plumbing) {
            final StringBuilder indent = new StringBuilder();
            if (plumbing) {
                indent.append("H:");
            } else {
                for (int i = 0; i < indentCount; ++i) {
                    indent.append(" ");
                }
            }
            final StringBuilder s = new StringBuilder();
            for (final String optionKey : mAttributes.keySet()) {
                final String value = mAttributes.get(optionKey);
                s.append(indent);
                s.append(optionKey);
                s.append(" = ");
                if ("date".equals(optionKey) && !plumbing) {
                    s.append(formatDate(value));
                } else {
                    s.append(value);
                }
                s.append("\n");
            }
            return s.toString();
        }

        /**
         * Converts the stored date attribute into a human-readable string.
         *
         * @param secondsText the raw attribute value, expected to be a decimal number of seconds
         * @return the {@link Date#toString()} rendering, or {@code secondsText} unchanged when
         *         it cannot be parsed as a {@code long}
         */
        private static String formatDate(final String secondsText) {
            try {
                // Date needs a number of milliseconds, but the dictionary contains seconds
                return new Date(1000 * Long.parseLong(secondsText)).toString();
            } catch (final NumberFormatException ignored) {
                // This is only a display helper: a malformed or missing date in the header
                // must not make toString() throw, so fall back to the raw value.
                return secondsText;
            }
        }
    }

    private FormatSpec() {
        // This utility class is not publicly instantiable.
    }
}