194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada/* 294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * Copyright (C) 2013 The Android Open Source Project 394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * Licensed under the Apache License, Version 2.0 (the "License"); 594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * you may not use this file except in compliance with the License. 694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * You may obtain a copy of the License at 794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * http://www.apache.org/licenses/LICENSE-2.0 994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 1094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * Unless required by applicable law or agreed to in writing, software 1194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * distributed under the License is distributed on an "AS IS" BASIS, 1294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * See the License for the specific language governing permissions and 1494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * limitations under the License. 1594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 1694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 1794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanadapackage com.android.inputmethod.latin.makedict; 1894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 1977bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanadaimport com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; 2094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanadaimport com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; 21576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanadaimport com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; 22af30cbf0ee8370763edf22822ea34a282e882084Jean Chalardimport com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; 2394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 2494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanadaimport java.io.ByteArrayOutputStream; 2594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanadaimport java.io.IOException; 2694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanadaimport java.io.OutputStream; 2794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanadaimport java.util.ArrayList; 288a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimotoimport java.util.HashMap; 298a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimotoimport java.util.Map.Entry; 3094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 3194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada/** 3294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * Encodes binary files for a FusionDictionary. 3394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 3494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * All the methods in this class are static. 35576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * 36576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * TODO: Rename this class to DictEncoderUtils. 3794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 38a306e087536ea82c97deb4a022730e2cdf5d2c35Yuichiro Hanadapublic class BinaryDictEncoderUtils { 3994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 4094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada private static final boolean DBG = MakedictLog.DBG; 4194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 42a306e087536ea82c97deb4a022730e2cdf5d2c35Yuichiro Hanada private BinaryDictEncoderUtils() { 4394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // This utility class is not publicly instantiable. 4494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 4594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 4694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Arbitrary limit to how much passes we consider address size compression should 4794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // terminate in. At the time of this writing, our largest dictionary completes 4894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // compression in five passes. 4994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // If the number of passes exceeds this number, makedict bails with an exception on 5094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // suspicion that a bug might be causing an infinite loop. 5194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada private static final int MAX_PASSES = 24; 5294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 5394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 5494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * Compute the binary size of the character array. 5594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 5694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * If only one character, this is the size of this character. If many, it's the sum of their 5794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * sizes + 1 byte for the terminator. 5894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 5994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param characters the character array 6094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @return the size of the char array, including the terminator if any 6194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 629168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto static int getPtNodeCharactersSize(final int[] characters, 639168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 649168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto int size = CharEncoding.getCharArraySize(characters, codePointToOneByteCodeMap); 65576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE; 6694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return size; 6794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 6894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 6994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 70576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Compute the binary size of the character array in a PtNode 7194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 7294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * If only one character, this is the size of this character. If many, it's the sum of their 7394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * sizes + 1 byte for the terminator. 7494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 75576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param ptNode the PtNode 7694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @return the size of the char array, including the terminator if any 7794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 789168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto private static int getPtNodeCharactersSize(final PtNode ptNode, 799168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 809168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto return getPtNodeCharactersSize(ptNode.mChars, codePointToOneByteCodeMap); 8194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 8294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 8394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 84576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Compute the binary size of the PtNode count for a node array. 85af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param nodeArray the nodeArray 86576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @return the size of the PtNode count, either 1 or 2 bytes. 8794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 88576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada private static int getPtNodeCountSize(final PtNodeArray nodeArray) { 89576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada return BinaryDictIOUtils.getPtNodeCountSize(nodeArray.mData.size()); 9094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 9194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 9294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 93576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Compute the maximum size of a PtNode, assuming 3-byte addresses for everything. 9494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 95576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param ptNode the PtNode to compute the size of. 96576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @return the maximum size of the PtNode. 9794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 989168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto private static int getPtNodeMaximumSize(final PtNode ptNode, 999168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 1009168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto int size = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); 101a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada if (ptNode.isTerminal()) { 10295d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi // If terminal, one byte for the frequency. 10395d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi size += FormatSpec.PTNODE_FREQUENCY_SIZE; 104a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada } 105576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address 106576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != ptNode.mBigrams) { 107576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE 108576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE) 109576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * ptNode.mBigrams.size(); 11094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 11194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return size; 11294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 11394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 11494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 115576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Compute the maximum size of each PtNode of a PtNode array, assuming 3-byte addresses for 116af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * everything, and caches it in the `mCachedSize' member of the nodes; deduce the size of 117af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * the containing node array, and cache it it its 'mCachedSize' member. 11894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 119576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param ptNodeArray the node array to compute the maximum size of. 12094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 1219168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray, 1229168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 123576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int size = getPtNodeCountSize(ptNodeArray); 124576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (PtNode node : ptNodeArray.mData) { 1259168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final int nodeSize = getPtNodeMaximumSize(node, codePointToOneByteCodeMap); 126576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada node.mCachedSize = nodeSize; 127576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada size += nodeSize; 12894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 129576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNodeArray.mCachedSize = size; 13094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 13194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 13294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 133576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Compute the size of the header (flag + [parent address] + characters size) of a PtNode. 13494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 135576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param ptNode the PtNode of which to compute the size of the header 13694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 1379168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto private static int getNodeHeaderSize(final PtNode ptNode, 1389168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 1399168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode, 1409168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto codePointToOneByteCodeMap); 14194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 14294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 14394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 14494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * Compute the size, in bytes, that an address will occupy. 14594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 14694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * This can be used either for children addresses (which are always positive) or for 14794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * attribute, which may be positive or negative but 14894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * store their sign bit separately. 14994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 15094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param address the address 15194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @return the byte size. 15294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 15394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada static int getByteSize(final int address) { 15494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada assert(address <= FormatSpec.UINT24_MAX); 15594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada if (!BinaryDictIOUtils.hasChildrenAddress(address)) { 15694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 0; 15794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } else if (Math.abs(address) <= FormatSpec.UINT8_MAX) { 15894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 1; 15994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } else if (Math.abs(address) <= FormatSpec.UINT16_MAX) { 16094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 2; 16194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } else { 16294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 3; 16394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 16494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 16594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 1665f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka static int writeUIntToBuffer(final byte[] buffer, final int fromPosition, final int value, 1677547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada final int size) { 1685f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka int position = fromPosition; 1697547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada switch(size) { 1707547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada case 4: 1717547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada buffer[position++] = (byte) ((value >> 24) & 0xFF); 1727547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada /* fall through */ 1737547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada case 3: 1747547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada buffer[position++] = (byte) ((value >> 16) & 0xFF); 1757547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada /* fall through */ 1767547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada case 2: 1777547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada buffer[position++] = (byte) ((value >> 8) & 0xFF); 1787547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada /* fall through */ 1797547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada case 1: 1807547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada buffer[position++] = (byte) (value & 0xFF); 1817547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada break; 1827547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada default: 1837547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada /* nop */ 1847547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada } 1857547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada return position; 1867547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada } 1877547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada 188d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada static void writeUIntToStream(final OutputStream stream, final int value, final int size) 189d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada throws IOException { 190d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada switch(size) { 191d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada case 4: 192d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada stream.write((value >> 24) & 0xFF); 193d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada /* fall through */ 194d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada case 3: 195d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada stream.write((value >> 16) & 0xFF); 196d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada /* fall through */ 197d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada case 2: 198d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada stream.write((value >> 8) & 0xFF); 199d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada /* fall through */ 200d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada case 1: 201d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada stream.write(value & 0xFF); 202d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada break; 203d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada default: 204d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada /* nop */ 205d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada } 206d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada } 207d188af70227ae7003fa410ccf4038a57825ae385Yuichiro Hanada 20894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // End utility methods 20994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 21094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // This method is responsible for finding a nice ordering of the nodes that favors run-time 21194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // cache performance and dictionary size. 212af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard /* package for tests */ static ArrayList<PtNodeArray> flattenTree( 213af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final PtNodeArray rootNodeArray) { 214576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final int treeSize = FusionDictionary.countPtNodes(rootNodeArray); 21594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada MakedictLog.i("Counted nodes : " + treeSize); 216a91561aa58db1c43092c1caecc051a11fa5391c7Tadashi G. Takaoka final ArrayList<PtNodeArray> flatTree = new ArrayList<>(treeSize); 217af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return flattenTreeInner(flatTree, rootNodeArray); 21894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 21994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 220af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static ArrayList<PtNodeArray> flattenTreeInner(final ArrayList<PtNodeArray> list, 221576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNodeArray ptNodeArray) { 22294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Removing the node is necessary if the tails are merged, because we would then 22394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // add the same node several times when we only want it once. A number of places in 22494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // the code also depends on any node being only once in the list. 22594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Merging tails can only be done if there are no attributes. Searching for attributes 22694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // in LatinIME code depends on a total breadth-first ordering, which merging tails 22794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // breaks. If there are no attributes, it should be fine (and reduce the file size) 22894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // to merge tails, and removing the node from the list would be necessary. However, 22994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // we don't merge tails because breaking the breadth-first ordering would result in 23094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // extreme overhead at bigram lookup time (it would make the search function O(n) instead 23194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty 23294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // high). 23394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // If no nodes are ever merged, we can't have the same node twice in the list, hence 23494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // searching for duplicates in unnecessary. It is also very performance consuming, 23594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making 23694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // this simple list.remove operation O(n*n) overall. On Android this overhead is very 23794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // high. 23894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // For future reference, the code to remove duplicate is a simple : list.remove(node); 239576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada list.add(ptNodeArray); 240576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final ArrayList<PtNode> branches = ptNodeArray.mData; 241576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (PtNode ptNode : branches) { 242576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (null != ptNode.mChildren) flattenTreeInner(list, ptNode.mChildren); 24394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 24494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return list; 24594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 24694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 24794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 248af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Get the offset from a position inside a current node array to a target node array, during 249af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * update. 250af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * 251af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * If the current node array is before the target node array, the target node array has not 252af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * been updated yet, so we should return the offset from the old position of the current node 253af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * array to the old position of the target node array. If on the other hand the target is 254af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * before the current node array, it already has been updated, so we should return the offset 255af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * from the new position in the current node array to the new position in the target node 256af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * array. 257af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * 258576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param currentNodeArray node array containing the PtNode where the offset will be written 259af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray 260af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param targetNodeArray the target node array to get the offset to 261af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @return the offset to the target node array 26294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 263af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard private static int getOffsetToTargetNodeArrayDuringUpdate(final PtNodeArray currentNodeArray, 264af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int offsetFromStartOfCurrentNodeArray, final PtNodeArray targetNodeArray) { 265af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final boolean isTargetBeforeCurrent = (targetNodeArray.mCachedAddressBeforeUpdate 266af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard < currentNodeArray.mCachedAddressBeforeUpdate); 26794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada if (isTargetBeforeCurrent) { 268af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return targetNodeArray.mCachedAddressAfterUpdate 269af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard - (currentNodeArray.mCachedAddressAfterUpdate 270af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard + offsetFromStartOfCurrentNodeArray); 27194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 2725f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka return targetNodeArray.mCachedAddressBeforeUpdate 2735f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka - (currentNodeArray.mCachedAddressBeforeUpdate + offsetFromStartOfCurrentNodeArray); 27494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 27594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 27694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 277576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Get the offset from a position inside a current node array to a target PtNode, during 278af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * update. 279af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * 280576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param currentNodeArray node array containing the PtNode where the offset will be written 281af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray 282576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param targetPtNode the target PtNode to get the offset to 283576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @return the offset to the target PtNode 28494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 28594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // TODO: is there any way to factorize this method with the one above? 286576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada private static int getOffsetToTargetPtNodeDuringUpdate(final PtNodeArray currentNodeArray, 287576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final int offsetFromStartOfCurrentNodeArray, final PtNode targetPtNode) { 288af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int oldOffsetBasePoint = currentNodeArray.mCachedAddressBeforeUpdate 289af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard + offsetFromStartOfCurrentNodeArray; 290576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final boolean isTargetBeforeCurrent = (targetPtNode.mCachedAddressBeforeUpdate 29194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada < oldOffsetBasePoint); 292af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard // If the target is before the current node array, then its address has already been 293af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard // updated. We can use the AfterUpdate member, and compare it to our own member after 294af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard // update. Otherwise, the AfterUpdate member is not updated yet, so we need to use the 295af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard // BeforeUpdate member, and of course we have to compare this to our own address before 296af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard // update. 29794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada if (isTargetBeforeCurrent) { 298af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard final int newOffsetBasePoint = currentNodeArray.mCachedAddressAfterUpdate 299af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard + offsetFromStartOfCurrentNodeArray; 300576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada return targetPtNode.mCachedAddressAfterUpdate - newOffsetBasePoint; 30194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 3025f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka return targetPtNode.mCachedAddressBeforeUpdate - oldOffsetBasePoint; 30394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 30494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 30594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 306af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Computes the actual node array size, based on the cached addresses of the children nodes. 30794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 308af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Each node array stores its tentative address. During dictionary address computing, these 309af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * are not final, but they can be used to compute the node array size (the node array size 310af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * depends on the address of the children because the number of bytes necessary to store an 311af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * address depends on its numeric value. The return value indicates whether the node array 31294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * contents (as in, any of the addresses stored in the cache fields) have changed with 31394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * respect to their previous value. 31494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 315576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param ptNodeArray the node array to compute the size of. 31694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param dict the dictionary in which the word/attributes are to be found. 317af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @return false if none of the cached addresses inside the node array changed, true otherwise. 31894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 319576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray, 3209168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final FusionDictionary dict, 3219168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 32294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada boolean changed = false; 323576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int size = getPtNodeCountSize(ptNodeArray); 324576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (PtNode ptNode : ptNodeArray.mData) { 325576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNode.mCachedAddressAfterUpdate = ptNodeArray.mCachedAddressAfterUpdate + size; 326576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) { 32794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada changed = true; 32894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 3299168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto int nodeSize = getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); 330a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada if (ptNode.isTerminal()) { 33195d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; 332a141d8ef7dcf8f942eb7bd4ca006f63da1744319Yuichiro Hanada } 33395d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi if (null != ptNode.mChildren) { 334b50a7a4bc718f3611ac1e97a940d5a59b4b0133eYuichiro Hanada nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, 335b50a7a4bc718f3611ac1e97a940d5a59b4b0133eYuichiro Hanada nodeSize + size, ptNode.mChildren)); 33694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 33795d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi if (null != ptNode.mBigrams) { 33895d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi for (WeightedString bigram : ptNode.mBigrams) { 33995d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, 34095d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE, 34195d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord)); 34295d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; 34394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 34494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 345576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNode.mCachedSize = nodeSize; 346576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada size += nodeSize; 34794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 348576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNodeArray.mCachedSize != size) { 349576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNodeArray.mCachedSize = size; 35094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada changed = true; 35194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 35294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return changed; 35394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 35494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 35594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 356af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Initializes the cached addresses of node arrays and their containing nodes from their size. 35794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 358af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param flatNodes the list of node arrays. 35994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @return the byte size of the entire stack. 36094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 36195d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi private static int initializePtNodeArraysCachedAddresses( 36295d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi final ArrayList<PtNodeArray> flatNodes) { 363af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard int nodeArrayOffset = 0; 364af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard for (final PtNodeArray nodeArray : flatNodes) { 365af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard nodeArray.mCachedAddressBeforeUpdate = nodeArrayOffset; 366576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int nodeCountSize = getPtNodeCountSize(nodeArray); 367576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int nodeffset = 0; 368576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (final PtNode ptNode : nodeArray.mData) { 369576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate = 370576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada nodeCountSize + nodeArrayOffset + nodeffset; 371576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada nodeffset += ptNode.mCachedSize; 37294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 373af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard nodeArrayOffset += nodeArray.mCachedSize; 37494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 375af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard return nodeArrayOffset; 37694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 37794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 37894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 379af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Updates the cached addresses of node arrays after recomputing their new positions. 38094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 381af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param flatNodes the list of node arrays. 38294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 383576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada private static void updatePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes) { 384af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard for (final PtNodeArray nodeArray : flatNodes) { 385af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard nodeArray.mCachedAddressBeforeUpdate = nodeArray.mCachedAddressAfterUpdate; 386576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (final PtNode ptNode : nodeArray.mData) { 387576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate; 38894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 38994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 39094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 39194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 39294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 393576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Compute the addresses and sizes of an ordered list of PtNode arrays. 39494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 395576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * This method takes a list of PtNode arrays and will update their cached address and size 396af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * values so that they can be written into a file. It determines the smallest size each of the 397576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * PtNode arrays can be given the addresses of its children and attributes, and store that into 398576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * each PtNode. 399576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * The order of the PtNode is given by the order of the array. This method makes no effort 40094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * to find a good order; it only mechanically computes the size this order results in. 40194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 40294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param dict the dictionary 403576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param flatNodes the ordered list of PtNode arrays 40494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @return the same array it was passed. The nodes have been updated for address and size. 40594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 40670e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict, 4079168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final ArrayList<PtNodeArray> flatNodes, 4089168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 40994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // First get the worst possible sizes and offsets 41095d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi for (final PtNodeArray n : flatNodes) { 4119168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto calculatePtNodeArrayMaximumSize(n, codePointToOneByteCodeMap); 41295d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi } 41395d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi final int offset = initializePtNodeArraysCachedAddresses(flatNodes); 41494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 41594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada MakedictLog.i("Compressing the array addresses. Original size : " + offset); 41694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada MakedictLog.i("(Recursively seen size : " + offset + ")"); 41794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 41894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada int passes = 0; 41994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada boolean changesDone = false; 42094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada do { 42194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada changesDone = false; 422576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int ptNodeArrayStartOffset = 0; 423576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (final PtNodeArray ptNodeArray : flatNodes) { 424576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset; 425576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final int oldNodeArraySize = ptNodeArray.mCachedSize; 4269168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict, 4279168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto codePointToOneByteCodeMap); 428576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final int newNodeArraySize = ptNodeArray.mCachedSize; 429af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard if (oldNodeArraySize < newNodeArraySize) { 430af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard throw new RuntimeException("Increased size ?!"); 431af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard } 432576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNodeArrayStartOffset += newNodeArraySize; 43394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada changesDone |= changed; 43494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 435576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada updatePtNodeArraysCachedAddresses(flatNodes); 43694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada ++passes; 43794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug"); 43894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } while (changesDone); 43994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 440576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNodeArray lastPtNodeArray = flatNodes.get(flatNodes.size() - 1); 44194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada MakedictLog.i("Compression complete in " + passes + " passes."); 44294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada MakedictLog.i("After address compression : " 443576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada + (lastPtNodeArray.mCachedAddressAfterUpdate + lastPtNodeArray.mCachedSize)); 44494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 44594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return flatNodes; 44694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 44794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 44894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 44994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * Sanity-checking method. 45094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 451576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * This method checks a list of PtNode arrays for juxtaposition, that is, it will do 452af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * nothing if each node array's cached address is actually the previous node array's address 45394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * plus the previous node's size. 45494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * If this is not the case, it will throw an exception. 45594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 456af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param arrays the list of node arrays to check 45794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 45870e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada /* package */ static void checkFlatPtNodeArrayList(final ArrayList<PtNodeArray> arrays) { 45994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada int offset = 0; 46094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada int index = 0; 461576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (final PtNodeArray ptNodeArray : arrays) { 46294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // BeforeUpdate and AfterUpdate addresses are the same here, so it does not matter 46394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // which we use. 464576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNodeArray.mCachedAddressAfterUpdate != offset) { 46594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada throw new RuntimeException("Wrong address for node " + index 466576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada + " : expected " + offset + ", got " + 467576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ptNodeArray.mCachedAddressAfterUpdate); 46894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 46994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada ++index; 470576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada offset += ptNodeArray.mCachedSize; 47194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 47294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 47394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 47494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 475563bcb511771579d272108f9659f85a71db98dabYuichiro Hanada * Helper method to write a children position to a file. 47694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 47794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param buffer the buffer to write to. 478d3a4c5132422b189c8dbb94dbbe84a9b9761b0a8Tadashi G. Takaoka * @param fromIndex the index in the buffer to write the address to. 479563bcb511771579d272108f9659f85a71db98dabYuichiro Hanada * @param position the position to write. 48094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @return the size in bytes the address actually took. 48194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 4825f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka /* package */ static int writeChildrenPosition(final byte[] buffer, final int fromIndex, 48370e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada final int position) { 4845f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka int index = fromIndex; 485563bcb511771579d272108f9659f85a71db98dabYuichiro Hanada switch (getByteSize(position)) { 48694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada case 1: 487563bcb511771579d272108f9659f85a71db98dabYuichiro Hanada buffer[index++] = (byte)position; 48894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 1; 48994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada case 2: 490563bcb511771579d272108f9659f85a71db98dabYuichiro Hanada buffer[index++] = (byte)(0xFF & (position >> 8)); 491563bcb511771579d272108f9659f85a71db98dabYuichiro Hanada buffer[index++] = (byte)(0xFF & position); 49294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 2; 49394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada case 3: 494563bcb511771579d272108f9659f85a71db98dabYuichiro Hanada buffer[index++] = (byte)(0xFF & (position >> 16)); 495563bcb511771579d272108f9659f85a71db98dabYuichiro Hanada buffer[index++] = (byte)(0xFF & (position >> 8)); 496563bcb511771579d272108f9659f85a71db98dabYuichiro Hanada buffer[index++] = (byte)(0xFF & position); 49794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 3; 49894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada case 0: 49994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 0; 50094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada default: 501563bcb511771579d272108f9659f85a71db98dabYuichiro Hanada throw new RuntimeException("Position " + position + " has a strange size"); 50294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 50394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 50494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 50594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 506576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Makes the flag value for a PtNode. 50794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 508576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param hasMultipleChars whether the PtNode has multiple chars. 509576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param isTerminal whether the PtNode is terminal. 51094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param childrenAddressSize the size of a children address. 511576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param hasBigrams whether the PtNode has bigrams. 512576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param isNotAWord whether the PtNode is not a word. 51305172bf1a5693c2e108e91436b98ecd35d2dadadAdrian Velicu * @param isPossiblyOffensive whether the PtNode is a possibly offensive entry. 51494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @return the flags 51594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 516576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal, 51712d80ebead6a1d7f704a5a3af3b6fe3313ceab05Dan Zivkovic final int childrenAddressSize, final boolean hasBigrams, 51805172bf1a5693c2e108e91436b98ecd35d2dadadAdrian Velicu final boolean isNotAWord, final boolean isPossiblyOffensive) { 51994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada byte flags = 0; 52094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; 52194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL; 52295d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi switch (childrenAddressSize) { 52395d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi case 1: 52495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE; 52595d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi break; 52695d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi case 2: 52795d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES; 52895d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi break; 52995d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi case 3: 53095d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES; 53195d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi break; 53295d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi case 0: 53395d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS; 53495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi break; 53595d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi default: 53695d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi throw new RuntimeException("Node with a strange address"); 53794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 53894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS; 53994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD; 54005172bf1a5693c2e108e91436b98ecd35d2dadadAdrian Velicu if (isPossiblyOffensive) flags |= FormatSpec.FLAG_IS_POSSIBLY_OFFENSIVE; 54194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return flags; 54294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 54394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 54495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi /* package */ static byte makePtNodeFlags(final PtNode node, final int childrenOffset) { 5458ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi return (byte) makePtNodeFlags(node.mChars.length > 1, node.isTerminal(), 546aa4168ee09e8bff6d4a27041566fe79f71cdbcf5Yuichiro Hanada getByteSize(childrenOffset), 54795d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi node.mBigrams != null && !node.mBigrams.isEmpty(), 54805172bf1a5693c2e108e91436b98ecd35d2dadadAdrian Velicu node.mIsNotAWord, node.mIsPossiblyOffensive); 54994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 55094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 55194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 55294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * Makes the flag value for a bigram. 55394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 55494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param more whether there are more bigrams after this one. 55594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param offset the offset of the bigram. 55694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param bigramFrequency the frequency of the bigram, 0..255. 55794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param unigramFrequency the unigram frequency of the same word, 0..255. 55894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param word the second bigram, for debugging purposes 55994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @return the flags 56094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 56112d80ebead6a1d7f704a5a3af3b6fe3313ceab05Dan Zivkovic /* package */ static int makeBigramFlags(final boolean more, final int offset, 5625f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka final int bigramFrequency, final int unigramFrequency, final String word) { 563576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int bigramFlags = (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0) 564576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada + (offset < 0 ? FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE : 0); 56594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada switch (getByteSize(offset)) { 56694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada case 1: 567576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE; 56894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada break; 56994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada case 2: 570576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES; 57194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada break; 57294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada case 3: 573576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES; 57494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada break; 57594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada default: 57694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada throw new RuntimeException("Strange offset size"); 57794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 5785f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka final int frequency; 57994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada if (unigramFrequency > bigramFrequency) { 58094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word 58194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada + "\". Bigram freq is " + bigramFrequency + ", unigram freq for " 58294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada + word + " is " + unigramFrequency); 5835f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka frequency = unigramFrequency; 5845f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka } else { 5855f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka frequency = bigramFrequency; 58694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 5875f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka bigramFlags += getBigramFrequencyDiff(unigramFrequency, frequency) 5882fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY; 5892fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return bigramFlags; 5902fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 5912fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 5922fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa public static int getBigramFrequencyDiff(final int unigramFrequency, 5932fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa final int bigramFrequency) { 59494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // We compute the difference between 255 (which means probability = 1) and the 59594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // unigram score. We split this into a number of discrete steps. 59694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15 59794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // represents an increase of 16 steps: a value of 15 will be interpreted as the median 59894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // value of the 16th step. In all justice, if the bigram frequency is low enough to be 59994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // rounded below the first step (which means it is less than half a step higher than the 60094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // unigram frequency) then the unigram frequency itself is the best approximation of the 60194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // bigram freq that we could possibly supply, hence we should *not* include this bigram 60294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // in the file at all. 60394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // until this is done, we'll write 0 and slightly overestimate this case. 60494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step 60594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to 60694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the 60794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // step size. Then we compute the start of the first step (the one where value 0 starts) 60894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // by adding half-a-step to the unigramFrequency. From there, we compute the integer 60994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // number of steps to the bigramFrequency. One last thing: we want our steps to include 61094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // their lower bound and exclude their higher bound so we need to have the first step 61194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // start at exactly 1 unit higher than floor(unigramFreq + half a step). 61294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Note : to reconstruct the score, the dictionary reader will need to divide 61394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step, 61494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best 61594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the 61694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // step pointed by the discretized frequency. 61794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada final float stepSize = 61894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) 61994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); 62094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f); 62194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); 62294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 62394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // here. The best approximation would be the unigram freq itself, so we should not 62494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // include this bigram in the dictionary. For now, register as 0, and live with the 62594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // small over-estimation that we get in this case. TODO: actually remove this bigram 62694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // if discretizedFrequency < 0. 6272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return discretizedFrequency > 0 ? discretizedFrequency : 0; 62894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 62994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 63012d80ebead6a1d7f704a5a3af3b6fe3313ceab05Dan Zivkovic /* package */ static int getChildrenPosition(final PtNode ptNode, 6319168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 6326156892b7e19aed29475c3ff6e439b05cf0b14c4Yuichiro Hanada int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate 6339168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto + getNodeHeaderSize(ptNode, codePointToOneByteCodeMap); 634d0c87576ffad858b13063506b15ca96293319bdbYuichiro Hanada if (ptNode.isTerminal()) { 63595d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi // A terminal node has the frequency. 636d0c87576ffad858b13063506b15ca96293319bdbYuichiro Hanada // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children 637d0c87576ffad858b13063506b15ca96293319bdbYuichiro Hanada // position. 63895d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi positionOfChildrenPosField += FormatSpec.PTNODE_FREQUENCY_SIZE; 6396156892b7e19aed29475c3ff6e439b05cf0b14c4Yuichiro Hanada } 6406156892b7e19aed29475c3ff6e439b05cf0b14c4Yuichiro Hanada return null == ptNode.mChildren ? FormatSpec.NO_CHILDREN_ADDRESS 6416156892b7e19aed29475c3ff6e439b05cf0b14c4Yuichiro Hanada : ptNode.mChildren.mCachedAddressAfterUpdate - positionOfChildrenPosField; 6426156892b7e19aed29475c3ff6e439b05cf0b14c4Yuichiro Hanada } 6436156892b7e19aed29475c3ff6e439b05cf0b14c4Yuichiro Hanada 64494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 6457547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada * Write a PtNodeArray. The PtNodeArray is expected to have its final position cached. 64694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 647af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @param dict the dictionary the node array is a part of (for relative offsets). 64870e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada * @param dictEncoder the dictionary encoder. 649576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param ptNodeArray the node array to write. 6509168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto * @param codePointToOneByteCodeMap the map to convert the code points. 65194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 6527547a7042c2f38adbf880637af07b8d559488799Yuichiro Hanada /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict, 6539168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final DictEncoder dictEncoder, final PtNodeArray ptNodeArray, 6549168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final HashMap<Integer, Integer> codePointToOneByteCodeMap) { 655576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // TODO: Make the code in common with BinaryDictIOUtils#writePtNode 65670e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate); 65794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 658576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final int ptNodeCount = ptNodeArray.mData.size(); 65970e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada dictEncoder.writePtNodeCount(ptNodeCount); 660576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (int i = 0; i < ptNodeCount; ++i) { 661576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final PtNode ptNode = ptNodeArray.mData.get(i); 66270e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada if (dictEncoder.getPosition() != ptNode.mCachedAddressAfterUpdate) { 66394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada throw new RuntimeException("Bug: write index is not the same as the cached address " 66470e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada + "of the node : " + dictEncoder.getPosition() + " <> " 66570e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada + ptNode.mCachedAddressAfterUpdate); 66694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 66794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Sanity checks. 6688ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi if (DBG && ptNode.getProbability() > FormatSpec.MAX_TERMINAL_FREQUENCY) { 66994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada throw new RuntimeException("A node has a frequency > " 67094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada + FormatSpec.MAX_TERMINAL_FREQUENCY 6718ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi + " : " + ptNode.mProbabilityInfo.toString()); 67294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 6739168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto dictEncoder.writePtNode(ptNode, dict, codePointToOneByteCodeMap); 67494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 67570e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate 67670e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada + ptNodeArray.mCachedSize) { 67770e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada throw new RuntimeException("Not the same size : written " 67870e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada + (dictEncoder.getPosition() - ptNodeArray.mCachedAddressAfterUpdate) 679576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada + " bytes from a node that should have " + ptNodeArray.mCachedSize + " bytes"); 680af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard } 68194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 68294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 68394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 684576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Dumps a collection of useful statistics about a list of PtNode arrays. 68594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 68694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * This prints purely informative stuff, like the total estimated file size, the 687576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * number of PtNode arrays, of PtNodes, the repartition of each address size, etc 68894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 689576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param ptNodeArrays the list of PtNode arrays. 69094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 69170e81af72678d95c2a38412c478e06837a51c7cbYuichiro Hanada /* package */ static void showStatistics(ArrayList<PtNodeArray> ptNodeArrays) { 69294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada int firstTerminalAddress = Integer.MAX_VALUE; 69394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada int lastTerminalAddress = Integer.MIN_VALUE; 69494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada int size = 0; 695576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int ptNodes = 0; 696576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada int maxNodes = 0; 69794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada int maxRuns = 0; 698576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (final PtNodeArray ptNodeArray : ptNodeArrays) { 699576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (maxNodes < ptNodeArray.mData.size()) maxNodes = ptNodeArray.mData.size(); 700576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (final PtNode ptNode : ptNodeArray.mData) { 701576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ++ptNodes; 702576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNode.mChars.length > maxRuns) maxRuns = ptNode.mChars.length; 7038ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi if (ptNode.isTerminal()) { 704576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNodeArray.mCachedAddressAfterUpdate < firstTerminalAddress) 705576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada firstTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate; 706576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNodeArray.mCachedAddressAfterUpdate > lastTerminalAddress) 707576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada lastTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate; 70894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 70994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 710576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize > size) { 711576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada size = ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize; 71294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 71394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 714576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada final int[] ptNodeCounts = new int[maxNodes + 1]; 71594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada final int[] runCounts = new int[maxRuns + 1]; 716576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (final PtNodeArray ptNodeArray : ptNodeArrays) { 717576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ++ptNodeCounts[ptNodeArray.mData.size()]; 718576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (final PtNode ptNode : ptNodeArray.mData) { 719576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada ++runCounts[ptNode.mChars.length]; 72094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 72194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 72294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 72394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada MakedictLog.i("Statistics:\n" 724c21619fe739216af3b134308d20669ad7431c250Akifumi Yoshimoto + " Total file size " + size + "\n" 725576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada + " " + ptNodeArrays.size() + " node arrays\n" 726576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada + " " + ptNodes + " PtNodes (" + ((float)ptNodes / ptNodeArrays.size()) 727576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada + " PtNodes per node)\n" 728c21619fe739216af3b134308d20669ad7431c250Akifumi Yoshimoto + " First terminal at " + firstTerminalAddress + "\n" 729c21619fe739216af3b134308d20669ad7431c250Akifumi Yoshimoto + " Last terminal at " + lastTerminalAddress + "\n" 730576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada + " PtNode stats : max = " + maxNodes); 73194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 73294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 73394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada /** 73455f5f7a005c00ec764ed19647b245e48636a0440Yuichiro Hanada * Writes a file header to an output stream. 73594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * 73655f5f7a005c00ec764ed19647b245e48636a0440Yuichiro Hanada * @param destination the stream to write the file header to. 73794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param dict the dictionary to write. 73894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada * @param formatOptions file format options. 7398a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto * @param codePointOccurrenceArray code points ordered by occurrence count. 74022c5c450fecb856100059f4e5b34b847fb0acfa7Yuichiro Hanada * @return the size of the header. 74194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada */ 74222c5c450fecb856100059f4e5b34b847fb0acfa7Yuichiro Hanada /* package */ static int writeDictionaryHeader(final OutputStream destination, 7438a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto final FusionDictionary dict, final FormatOptions formatOptions, 7448a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) 74555f5f7a005c00ec764ed19647b245e48636a0440Yuichiro Hanada throws IOException, UnsupportedFormatException { 74694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada final int version = formatOptions.mVersion; 74788480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu if ((version >= FormatSpec.MINIMUM_SUPPORTED_STATIC_VERSION && 74888480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu version <= FormatSpec.MAXIMUM_SUPPORTED_STATIC_VERSION) || ( 74988480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu version >= FormatSpec.MINIMUM_SUPPORTED_DYNAMIC_VERSION && 75088480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu version <= FormatSpec.MAXIMUM_SUPPORTED_DYNAMIC_VERSION)) { 75188480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu // Dictionary is valid 75288480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu } else { 75394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada throw new UnsupportedFormatException("Requested file format version " + version 75488480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu + ", but this implementation only supports static versions " 75588480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu + FormatSpec.MINIMUM_SUPPORTED_STATIC_VERSION + " through " 75688480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu + FormatSpec.MAXIMUM_SUPPORTED_STATIC_VERSION + " and dynamic versions " 75788480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu + FormatSpec.MINIMUM_SUPPORTED_DYNAMIC_VERSION + " through " 75888480f39f6f5d1d678983d1e828bfe7196b021c4Adrian Velicu + FormatSpec.MAXIMUM_SUPPORTED_DYNAMIC_VERSION); 75994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 76094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 76194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256); 76294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 76394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // The magic number in big-endian order. 76494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Magic number for all versions. 76594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 24))); 76694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 16))); 76794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 8))); 76894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada headerBuffer.write((byte) (0xFF & FormatSpec.MAGIC_NUMBER)); 76994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Dictionary version. 77094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada headerBuffer.write((byte) (0xFF & (version >> 8))); 77194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada headerBuffer.write((byte) (0xFF & version)); 77294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 77394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Options flags 77426bd46095a05843e7574dfcf7db53406f215525dKeisuke Kuroyanagi // TODO: Remove this field. 77526bd46095a05843e7574dfcf7db53406f215525dKeisuke Kuroyanagi final int options = 0; 77694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada headerBuffer.write((byte) (0xFF & (options >> 8))); 77794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada headerBuffer.write((byte) (0xFF & options)); 77894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada final int headerSizeOffset = headerBuffer.size(); 77994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Placeholder to be written later with header size. 78094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada for (int i = 0; i < 4; ++i) { 78194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada headerBuffer.write(0); 78294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 78394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Write out the options. 78494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada for (final String key : dict.mOptions.mAttributes.keySet()) { 78594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada final String value = dict.mOptions.mAttributes.get(key); 7869168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto CharEncoding.writeString(headerBuffer, key, null); 7879168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto CharEncoding.writeString(headerBuffer, value, null); 7889168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto } 7899168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto // Write out the codePointTable if there is codePointOccurrenceArray. 7909168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto if (codePointOccurrenceArray != null) { 7919168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final String codePointTableString = 7929168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto encodeCodePointTable(codePointOccurrenceArray); 7939168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto CharEncoding.writeString(headerBuffer, DictionaryHeader.CODE_POINT_TABLE_KEY, null); 7949168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto CharEncoding.writeString(headerBuffer, codePointTableString, null); 79594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 79694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada final int size = headerBuffer.size(); 79794460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada final byte[] bytes = headerBuffer.toByteArray(); 79894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada // Write out the header size. 79994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24)); 80094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16)); 80194460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8)); 80294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0)); 80394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada destination.write(bytes); 80494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 80594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada headerBuffer.close(); 80622c5c450fecb856100059f4e5b34b847fb0acfa7Yuichiro Hanada return size; 80755f5f7a005c00ec764ed19647b245e48636a0440Yuichiro Hanada } 8088a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto 8098a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto static final class CodePointTable { 8108a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto final HashMap<Integer, Integer> mCodePointToOneByteCodeMap; 8118a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto final ArrayList<Entry<Integer, Integer>> mCodePointOccurrenceArray; 8128a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto 8139168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto // Let code point table empty for version 200 dictionary which used in test 8149168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto CodePointTable() { 8159168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto mCodePointToOneByteCodeMap = null; 8169168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto mCodePointOccurrenceArray = null; 8179168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto } 8189168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto 8198a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto CodePointTable(final HashMap<Integer, Integer> codePointToOneByteCodeMap, 8208a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { 8218a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto mCodePointToOneByteCodeMap = codePointToOneByteCodeMap; 8228a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto mCodePointOccurrenceArray = codePointOccurrenceArray; 8238a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto } 8248a6e96d28645ce325a38423af6967a011edefc9dAkifumi Yoshimoto } 8259168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto 8269168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto private static String encodeCodePointTable( 8279168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final ArrayList<Entry<Integer, Integer>> codePointOccurrenceArray) { 8289168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto final StringBuilder codePointTableString = new StringBuilder(); 8299168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto int currentCodePointTableIndex = FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE; 8309168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto for (final Entry<Integer, Integer> entry : codePointOccurrenceArray) { 8319168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto // Native reads the table as a string 8329168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto codePointTableString.appendCodePoint(entry.getKey()); 8339168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto if (FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE < ++currentCodePointTableIndex) { 8349168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto break; 8359168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto } 8369168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto } 8379168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto return codePointTableString.toString(); 8389168ab60cf08385554a7a8255e40698988ee37f6Akifumi Yoshimoto } 83994460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada} 840