165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada/*
265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * Copyright (C) 2012 The Android Open Source Project
365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada *
48aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * Licensed under the Apache License, Version 2.0 (the "License");
58aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * you may not use this file except in compliance with the License.
68aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * You may obtain a copy of the License at
765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada *
88aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka *      http://www.apache.org/licenses/LICENSE-2.0
965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada *
1065feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * Unless required by applicable law or agreed to in writing, software
118aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * distributed under the License is distributed on an "AS IS" BASIS,
128aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
138aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * See the License for the specific language governing permissions and
148aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * limitations under the License.
1565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada */
1665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
1765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanadapackage com.android.inputmethod.latin.makedict;
1865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
1915f6d4ae34664ea3d92827a2c3003198c0bac70bTadashi G. Takaokaimport com.android.inputmethod.annotations.UsedForTesting;
200f7d881dc72132dfd75c8b4fe61a69fc5cdcd460Mohammadinamul Sheikimport com.android.inputmethod.latin.define.DecoderSpecificConstants;
2193cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagiimport com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
2265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
2393cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagiimport java.io.File;
2465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanadaimport java.io.IOException;
25fb7e08ea8f40ebb4ac57e443f837043e7b57fd87Yuichiro Hanadaimport java.io.OutputStream;
2665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanadaimport java.util.ArrayList;
2765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanadaimport java.util.Map;
2865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanadaimport java.util.Stack;
2965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
30a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaokapublic final class BinaryDictIOUtils {
3165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada    private static final boolean DBG = false;
32c3a98ca306d5d6a3dfce3585b73f7431dbf90bfcYuichiro Hanada
33c3a98ca306d5d6a3dfce3585b73f7431dbf90bfcYuichiro Hanada    private BinaryDictIOUtils() {
34c3a98ca306d5d6a3dfce3585b73f7431dbf90bfcYuichiro Hanada        // This utility class is not publicly instantiable.
35c3a98ca306d5d6a3dfce3585b73f7431dbf90bfcYuichiro Hanada    }
3665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
3793cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi    /**
3893cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi     * Returns new dictionary decoder.
3993cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi     *
4093cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi     * @param dictFile the dictionary file.
4193cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi     * @param bufferType The type of buffer, as one of USE_* in DictDecoder.
4293cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi     * @return new dictionary decoder if the dictionary file exists, otherwise null.
4393cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi     */
4493cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi    public static DictDecoder getDictDecoder(final File dictFile, final long offset,
4593cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi            final long length, final int bufferType) {
46c15bbb52a37be751fed2ba7e765dfd7727306308Dan Zivkovic        return new Ver4DictDecoder(dictFile);
4793cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi    }
4893cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi
4993cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi    public static DictDecoder getDictDecoder(final File dictFile, final long offset,
5093cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi            final long length, final DictionaryBufferFactory factory) {
51c15bbb52a37be751fed2ba7e765dfd7727306308Dan Zivkovic        return new Ver4DictDecoder(dictFile);
5293cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi    }
5393cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi
5493cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi    public static DictDecoder getDictDecoder(final File dictFile, final long offset,
5593cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi            final long length) {
5693cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi        return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER);
5793cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi    }
5893cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi
59a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka    private static final class Position {
60576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada        public static final int NOT_READ_PTNODE_COUNT = -1;
6165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
6265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        public int mAddress;
63576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada        public int mNumOfPtNode;
6465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        public int mPosition;
6565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        public int mLength;
6665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
6765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        public Position(int address, int length) {
6865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            mAddress = address;
6965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            mLength = length;
70576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada            mNumOfPtNode = NOT_READ_PTNODE_COUNT;
7165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        }
7265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada    }
7365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
7465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada    /**
75af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard     * Retrieves all node arrays without recursive call.
7665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada     */
77be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada    private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder,
782fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            final int bodyOffset, final Map<Integer, String> words,
79be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada            final Map<Integer, Integer> frequencies,
8095d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi            final Map<Integer, ArrayList<PendingAttribute>> bigrams) {
8165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
8265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
83a91561aa58db1c43092c1caecc051a11fa5391c7Tadashi G. Takaoka        Stack<Position> stack = new Stack<>();
8465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        int index = 0;
8565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
862fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        Position initPos = new Position(bodyOffset, 0);
8765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        stack.push(initPos);
8865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
8965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        while (!stack.empty()) {
9065feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            Position p = stack.peek();
9165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
9265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            if (DBG) {
93576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada                MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" +
94576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada                        p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength);
9565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            }
9665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
97be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada            if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress);
9865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            if (index != p.mLength) index = p.mLength;
9965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
100576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada            if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) {
101be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada                p.mNumOfPtNode = dictDecoder.readPtNodeCount();
10256e7e38d372c726724c80246e6187dc1241ecfeaKeisuke Kuroyanagi                p.mAddress = dictDecoder.getPosition();
10365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada                p.mPosition = 0;
10465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            }
105576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada            if (p.mNumOfPtNode == 0) {
10684d858ed5e187eb9d4b56b593e1d9287f762bbcaYuichiro Hanada                stack.pop();
10784d858ed5e187eb9d4b56b593e1d9287f762bbcaYuichiro Hanada                continue;
10884d858ed5e187eb9d4b56b593e1d9287f762bbcaYuichiro Hanada            }
10995d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi            final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress);
1108ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi            for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) {
1118ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi                pushedChars[index++] = ptNodeInfo.mCharacters[i];
11265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            }
11365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            p.mPosition++;
11495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi            if (ptNodeInfo.isTerminal()) {// found word
1158ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi                words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index));
1168ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi                frequencies.put(
1178ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi                        ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability);
1188ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi                if (ptNodeInfo.mBigrams != null) {
1198ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi                    bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams);
1208ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi                }
12165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            }
12265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
123576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada            if (p.mPosition == p.mNumOfPtNode) {
12495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi                stack.pop();
12565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            } else {
12695d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi                // The PtNode array has more PtNodes.
127be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada                p.mAddress = dictDecoder.getPosition();
12865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            }
12965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
13095d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi            if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) {
1318ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi                final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index);
13265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada                stack.push(childrenPos);
13365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            }
13465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        }
13565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada    }
13665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada
13765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada    /**
13865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada     * Reads unigrams and bigrams from the binary file.
139af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard     * Doesn't store a full memory representation of the dictionary.
14065feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada     *
14177bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada     * @param dictDecoder the dict decoder.
14265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada     * @param words the map to store the address as a key and the word as a value.
14365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada     * @param frequencies the map to store the address as a key and the frequency as a value.
14465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada     * @param bigrams the map to store the address as a key and the list of address as a value.
145af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard     * @throws IOException if the file can't be read.
146af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard     * @throws UnsupportedFormatException if the format of the file is not recognized.
14765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada     */
1480e40cd0c40f2c731f91ccd0561e251262e5a2614Yuichiro Hanada    /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder,
14965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
15065feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
15165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada            UnsupportedFormatException {
15265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada        // Read header
153b986f78ba826fa360304a69565f1880bdd7ce0c5Keisuke Kuroyanagi        final DictionaryHeader header = dictDecoder.readHeader();
1542fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words,
15595d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi            frequencies, bigrams);
15665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada    }
157d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada
158d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada    /**
159576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada     * Gets the address of the last PtNode of the exact matching word in the dictionary.
160d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada     * If no match is found, returns NOT_VALID_WORD.
161d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada     *
16277bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada     * @param dictDecoder the dict decoder.
163d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada     * @param word the word we search for.
164d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada     * @return the address of the terminal node.
165af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard     * @throws IOException if the file can't be read.
166af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard     * @throws UnsupportedFormatException if the format of the file is not recognized.
167d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada     */
16815f6d4ae34664ea3d92827a2c3003198c0bac70bTadashi G. Takaoka    @UsedForTesting
1690e40cd0c40f2c731f91ccd0561e251262e5a2614Yuichiro Hanada    /* package */ static int getTerminalPosition(final DictDecoder dictDecoder,
170d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada            final String word) throws IOException, UnsupportedFormatException {
171d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada        if (word == null) return FormatSpec.NOT_VALID_WORD;
172be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada        dictDecoder.setPosition(0);
17395d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi        dictDecoder.readHeader();
174d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada        int wordPos = 0;
175d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada        final int wordLen = word.codePointCount(0, word.length());
1760f7d881dc72132dfd75c8b4fe61a69fc5cdcd460Mohammadinamul Sheik        for (int depth = 0; depth < DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH; ++depth) {
177d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada            if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
17893d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada
17993d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada            do {
180be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada                final int ptNodeCount = dictDecoder.readPtNodeCount();
181576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada                boolean foundNextPtNode = false;
182576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada                for (int i = 0; i < ptNodeCount; ++i) {
183be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada                    final int ptNodePos = dictDecoder.getPosition();
18495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi                    final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos);
18593d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                    boolean same = true;
18693d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                    for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
18793d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                            p < currentInfo.mCharacters.length;
18893d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                            ++p, j = word.offsetByCodePoints(j, 1)) {
18993d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                        if (wordPos + p >= wordLen
19093d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                                || word.codePointAt(j) != currentInfo.mCharacters[p]) {
19193d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                            same = false;
19293d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                            break;
19393d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                        }
194d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada                    }
195d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada
19693d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                    if (same) {
197576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada                        // found the PtNode matches the word.
19893d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                        if (wordPos + currentInfo.mCharacters.length == wordLen) {
1995f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka                            return currentInfo.isTerminal() ? ptNodePos : FormatSpec.NOT_VALID_WORD;
20093d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                        }
20193d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                        wordPos += currentInfo.mCharacters.length;
20293d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                        if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
203d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada                            return FormatSpec.NOT_VALID_WORD;
204d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada                        }
205576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada                        foundNextPtNode = true;
206be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada                        dictDecoder.setPosition(currentInfo.mChildrenAddress);
20793d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada                        break;
208d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada                    }
209d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada                }
210576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada                if (foundNextPtNode) break;
21195d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi                return FormatSpec.NOT_VALID_WORD;
21293d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada            } while(true);
213d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada        }
214d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada        return FormatSpec.NOT_VALID_WORD;
215d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada    }
21666597f5e5f3249f418665c1990fb539d2f5565d5Yuichiro Hanada
2177f438aa12f683ac15c8c8e60ca852412d00128dbYuichiro Hanada    /**
2182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa     * Writes a PtNodeCount to the stream.
2192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa     *
2202fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa     * @param destination the stream to write.
2212fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa     * @param ptNodeCount the count.
2222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa     * @return the size written in bytes.
2232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa     */
2244fdcefe504a4f8e832a75be2c7280ea8a5e390d3Keisuke Kuroyanagi    @UsedForTesting
2252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    static int writePtNodeCount(final OutputStream destination, final int ptNodeCount)
2262fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            throws IOException {
2272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
2282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        // the count must fit on one byte or two bytes.
2292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        // Please see comments in FormatSpec.
2302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        if (countSize != 1 && countSize != 2) {
2312fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
2322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        }
2332fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        final int encodedPtNodeCount = (countSize == 2) ?
2342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa                (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
2352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize);
2362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa        return countSize;
2372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa    }
2382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa
2391db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada    /**
2401db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada     * Helper method to hide the actual value of the no children address.
2411db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada     */
2421db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada    public static boolean hasChildrenAddress(final int address) {
2431db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada        return FormatSpec.NO_CHILDREN_ADDRESS != address;
2441db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada    }
2451db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada
2461db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada    /**
247576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada     * Compute the binary size of the node count
248576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada     * @param count the node count
249576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada     * @return the size of the node count, either 1 or 2 bytes.
2501db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada     */
251576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada    public static int getPtNodeCountSize(final int count) {
252576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada        if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) {
2531db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada            return 1;
254576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada        } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) {
2551db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada            return 2;
2561db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada        } else {
2571db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada            throw new RuntimeException("Can't have more than "
258576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada                    + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found "
259af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard                    + count + ")");
2601db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada        }
2611db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada    }
26294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada
26395d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi    static int getChildrenAddressSize(final int optionFlags) {
264576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada        switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
265576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
26694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada                return 1;
267576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
26894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada                return 2;
269576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
27094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada                return 3;
271576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
27294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada            default:
27394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada                return 0;
27494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada        }
27594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada    }
27677bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada
27777bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada    /**
27877bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada     * Calculate bigram frequency from compressed value
27977bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada     *
28077bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada     * @param unigramFrequency
28177bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada     * @param bigramFrequency compressed frequency
28277bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada     * @return approximate bigram frequency
28377bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada     */
28495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi    @UsedForTesting
28577bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada    public static int reconstructBigramFrequency(final int unigramFrequency,
28677bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada            final int bigramFrequency) {
28777bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada        final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
28877bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada                / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
28977bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada        final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f);
29077bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada        return (int)resultFreqFloat;
29177bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada    }
29265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada}
293