165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada/* 265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * Copyright (C) 2012 The Android Open Source Project 365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * 48aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * Licensed under the Apache License, Version 2.0 (the "License"); 58aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * you may not use this file except in compliance with the License. 68aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * You may obtain a copy of the License at 765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * 88aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * http://www.apache.org/licenses/LICENSE-2.0 965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * 1065feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * Unless required by applicable law or agreed to in writing, software 118aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * distributed under the License is distributed on an "AS IS" BASIS, 128aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 138aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * See the License for the specific language governing permissions and 148aa9963a895f9dd5bb1bc92ab2e4f461e058f87aTadashi G. Takaoka * limitations under the License. 1565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada */ 1665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 1765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanadapackage com.android.inputmethod.latin.makedict; 1865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 1915f6d4ae34664ea3d92827a2c3003198c0bac70bTadashi G. Takaokaimport com.android.inputmethod.annotations.UsedForTesting; 200f7d881dc72132dfd75c8b4fe61a69fc5cdcd460Mohammadinamul Sheikimport com.android.inputmethod.latin.define.DecoderSpecificConstants; 2193cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagiimport com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory; 2265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 2393cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagiimport java.io.File; 2465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanadaimport java.io.IOException; 25fb7e08ea8f40ebb4ac57e443f837043e7b57fd87Yuichiro Hanadaimport java.io.OutputStream; 2665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanadaimport java.util.ArrayList; 2765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanadaimport java.util.Map; 2865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanadaimport java.util.Stack; 2965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 30a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaokapublic final class BinaryDictIOUtils { 3165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada private static final boolean DBG = false; 32c3a98ca306d5d6a3dfce3585b73f7431dbf90bfcYuichiro Hanada 33c3a98ca306d5d6a3dfce3585b73f7431dbf90bfcYuichiro Hanada private BinaryDictIOUtils() { 34c3a98ca306d5d6a3dfce3585b73f7431dbf90bfcYuichiro Hanada // This utility class is not publicly instantiable. 35c3a98ca306d5d6a3dfce3585b73f7431dbf90bfcYuichiro Hanada } 3665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 3793cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi /** 3893cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi * Returns new dictionary decoder. 3993cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi * 4093cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi * @param dictFile the dictionary file. 4193cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi * @param bufferType The type of buffer, as one of USE_* in DictDecoder. 4293cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi * @return new dictionary decoder if the dictionary file exists, otherwise null. 4393cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi */ 4493cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi public static DictDecoder getDictDecoder(final File dictFile, final long offset, 4593cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi final long length, final int bufferType) { 46c15bbb52a37be751fed2ba7e765dfd7727306308Dan Zivkovic return new Ver4DictDecoder(dictFile); 4793cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi } 4893cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi 4993cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi public static DictDecoder getDictDecoder(final File dictFile, final long offset, 5093cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi final long length, final DictionaryBufferFactory factory) { 51c15bbb52a37be751fed2ba7e765dfd7727306308Dan Zivkovic return new Ver4DictDecoder(dictFile); 5293cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi } 5393cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi 5493cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi public static DictDecoder getDictDecoder(final File dictFile, final long offset, 5593cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi final long length) { 5693cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER); 5793cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi } 5893cda5bb396c22f1781e390debaf75d54cf7c0dcKeisuke Kuroyanagi 59a28a05e971cc242b338331a3b78276fa95188d19Tadashi G. Takaoka private static final class Position { 60576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static final int NOT_READ_PTNODE_COUNT = -1; 6165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 6265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada public int mAddress; 63576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public int mNumOfPtNode; 6465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada public int mPosition; 6565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada public int mLength; 6665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 6765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada public Position(int address, int length) { 6865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada mAddress = address; 6965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada mLength = length; 70576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada mNumOfPtNode = NOT_READ_PTNODE_COUNT; 7165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 7265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 7365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 7465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada /** 75af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Retrieves all node arrays without recursive call. 7665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada */ 77be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, 782fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa final int bodyOffset, final Map<Integer, String> words, 79be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada final Map<Integer, Integer> frequencies, 8095d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi final Map<Integer, ArrayList<PendingAttribute>> bigrams) { 8165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1]; 8265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 83a91561aa58db1c43092c1caecc051a11fa5391c7Tadashi G. Takaoka Stack<Position> stack = new Stack<>(); 8465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada int index = 0; 8565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 862fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa Position initPos = new Position(bodyOffset, 0); 8765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada stack.push(initPos); 8865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 8965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada while (!stack.empty()) { 9065feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada Position p = stack.peek(); 9165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 9265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada if (DBG) { 93576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" + 94576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength); 9565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 9665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 97be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress); 9865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada if (index != p.mLength) index = p.mLength; 9965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 100576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) { 101be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada p.mNumOfPtNode = dictDecoder.readPtNodeCount(); 10256e7e38d372c726724c80246e6187dc1241ecfeaKeisuke Kuroyanagi p.mAddress = dictDecoder.getPosition(); 10365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada p.mPosition = 0; 10465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 105576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (p.mNumOfPtNode == 0) { 10684d858ed5e187eb9d4b56b593e1d9287f762bbcaYuichiro Hanada stack.pop(); 10784d858ed5e187eb9d4b56b593e1d9287f762bbcaYuichiro Hanada continue; 10884d858ed5e187eb9d4b56b593e1d9287f762bbcaYuichiro Hanada } 10995d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress); 1108ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) { 1118ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi pushedChars[index++] = ptNodeInfo.mCharacters[i]; 11265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 11365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada p.mPosition++; 11495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi if (ptNodeInfo.isTerminal()) {// found word 1158ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index)); 1168ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi frequencies.put( 1178ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability); 1188ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi if (ptNodeInfo.mBigrams != null) { 1198ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams); 1208ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi } 12165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 12265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 123576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (p.mPosition == p.mNumOfPtNode) { 12495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi stack.pop(); 12565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } else { 12695d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi // The PtNode array has more PtNodes. 127be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada p.mAddress = dictDecoder.getPosition(); 12865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 12965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 13095d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) { 1318ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index); 13265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada stack.push(childrenPos); 13365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 13465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 13565feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 13665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada 13765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada /** 13865feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * Reads unigrams and bigrams from the binary file. 139af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * Doesn't store a full memory representation of the dictionary. 14065feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * 14177bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada * @param dictDecoder the dict decoder. 14265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * @param words the map to store the address as a key and the word as a value. 14365feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * @param frequencies the map to store the address as a key and the frequency as a value. 14465feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada * @param bigrams the map to store the address as a key and the list of address as a value. 145af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @throws IOException if the file can't be read. 146af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @throws UnsupportedFormatException if the format of the file is not recognized. 14765feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada */ 1480e40cd0c40f2c731f91ccd0561e251262e5a2614Yuichiro Hanada /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder, 14965feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 15065feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, 15165feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada UnsupportedFormatException { 15265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada // Read header 153b986f78ba826fa360304a69565f1880bdd7ce0c5Keisuke Kuroyanagi final DictionaryHeader header = dictDecoder.readHeader(); 1542fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words, 15595d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi frequencies, bigrams); 15665feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada } 157d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada 158d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada /** 159576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Gets the address of the last PtNode of the exact matching word in the dictionary. 160d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada * If no match is found, returns NOT_VALID_WORD. 161d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada * 16277bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada * @param dictDecoder the dict decoder. 163d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada * @param word the word we search for. 164d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada * @return the address of the terminal node. 165af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @throws IOException if the file can't be read. 166af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard * @throws UnsupportedFormatException if the format of the file is not recognized. 167d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada */ 16815f6d4ae34664ea3d92827a2c3003198c0bac70bTadashi G. Takaoka @UsedForTesting 1690e40cd0c40f2c731f91ccd0561e251262e5a2614Yuichiro Hanada /* package */ static int getTerminalPosition(final DictDecoder dictDecoder, 170d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada final String word) throws IOException, UnsupportedFormatException { 171d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada if (word == null) return FormatSpec.NOT_VALID_WORD; 172be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada dictDecoder.setPosition(0); 17395d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi dictDecoder.readHeader(); 174d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada int wordPos = 0; 175d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada final int wordLen = word.codePointCount(0, word.length()); 1760f7d881dc72132dfd75c8b4fe61a69fc5cdcd460Mohammadinamul Sheik for (int depth = 0; depth < DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH; ++depth) { 177d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; 17893d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada 17993d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada do { 180be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada final int ptNodeCount = dictDecoder.readPtNodeCount(); 181576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada boolean foundNextPtNode = false; 182576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada for (int i = 0; i < ptNodeCount; ++i) { 183be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada final int ptNodePos = dictDecoder.getPosition(); 18495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos); 18593d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada boolean same = true; 18693d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada for (int p = 0, j = word.offsetByCodePoints(0, wordPos); 18793d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada p < currentInfo.mCharacters.length; 18893d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada ++p, j = word.offsetByCodePoints(j, 1)) { 18993d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada if (wordPos + p >= wordLen 19093d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada || word.codePointAt(j) != currentInfo.mCharacters[p]) { 19193d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada same = false; 19293d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada break; 19393d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada } 194d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada } 195d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada 19693d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada if (same) { 197576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada // found the PtNode matches the word. 19893d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada if (wordPos + currentInfo.mCharacters.length == wordLen) { 1995f00fe09e9a611b647592188316e5999465df4d3Tadashi G. Takaoka return currentInfo.isTerminal() ? ptNodePos : FormatSpec.NOT_VALID_WORD; 20093d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada } 20193d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada wordPos += currentInfo.mCharacters.length; 20293d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { 203d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada return FormatSpec.NOT_VALID_WORD; 204d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada } 205576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada foundNextPtNode = true; 206be470f06e48e40a0def32e0f34e3ca48113937b5Yuichiro Hanada dictDecoder.setPosition(currentInfo.mChildrenAddress); 20793d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada break; 208d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada } 209d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada } 210576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (foundNextPtNode) break; 21195d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi return FormatSpec.NOT_VALID_WORD; 21293d7c6233fb6b867d51a9eeb54b951fe3a377ea8Yuichiro Hanada } while(true); 213d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada } 214d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada return FormatSpec.NOT_VALID_WORD; 215d36245fad292ea660ca49f38a3ec36e07727dda5Yuichiro Hanada } 21666597f5e5f3249f418665c1990fb539d2f5565d5Yuichiro Hanada 2177f438aa12f683ac15c8c8e60ca852412d00128dbYuichiro Hanada /** 2182fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * Writes a PtNodeCount to the stream. 2192fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * 2202fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * @param destination the stream to write. 2212fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * @param ptNodeCount the count. 2222fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa * @return the size written in bytes. 2232fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa */ 2244fdcefe504a4f8e832a75be2c7280ea8a5e390d3Keisuke Kuroyanagi @UsedForTesting 2252fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa static int writePtNodeCount(final OutputStream destination, final int ptNodeCount) 2262fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa throws IOException { 2272fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount); 2282fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // the count must fit on one byte or two bytes. 2292fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa // Please see comments in FormatSpec. 2302fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa if (countSize != 1 && countSize != 2) { 2312fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize); 2322fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 2332fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa final int encodedPtNodeCount = (countSize == 2) ? 2342fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount; 2352fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize); 2362fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa return countSize; 2372fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa } 2382fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa 2391db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada /** 2401db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada * Helper method to hide the actual value of the no children address. 2411db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada */ 2421db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada public static boolean hasChildrenAddress(final int address) { 2431db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada return FormatSpec.NO_CHILDREN_ADDRESS != address; 2441db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada } 2451db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada 2461db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada /** 247576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * Compute the binary size of the node count 248576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @param count the node count 249576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada * @return the size of the node count, either 1 or 2 bytes. 2501db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada */ 251576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada public static int getPtNodeCountSize(final int count) { 252576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) { 2531db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada return 1; 254576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) { 2551db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada return 2; 2561db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada } else { 2571db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada throw new RuntimeException("Can't have more than " 258576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found " 259af30cbf0ee8370763edf22822ea34a282e882084Jean Chalard + count + ")"); 2601db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada } 2611db93c9c0420d7d944e0ddef95d25de0738c3030Yuichiro Hanada } 26294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada 26395d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi static int getChildrenAddressSize(final int optionFlags) { 264576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { 265576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: 26694460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 1; 267576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: 26894460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 2; 269576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: 27094460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 3; 271576f625ee1b22e26baab46cc4ad3138e901383e2Yuichiro Hanada case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: 27294460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada default: 27394460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada return 0; 27494460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 27594460eba11019ec4658c42b4bcc0379d70f41770Yuichiro Hanada } 27677bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada 27777bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada /** 27877bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada * Calculate bigram frequency from compressed value 27977bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada * 28077bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada * @param unigramFrequency 28177bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada * @param bigramFrequency compressed frequency 28277bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada * @return approximate bigram frequency 28377bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada */ 28495d16561e0e6c38dbd99c893f09c5dbe9d4a465dKeisuke Kuroyanagi @UsedForTesting 28577bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada public static int reconstructBigramFrequency(final int unigramFrequency, 28677bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada final int bigramFrequency) { 28777bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) 28877bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); 28977bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f); 29077bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada return (int)resultFreqFloat; 29177bce05e6f6e3a988253f9305ae22e51f56f5b1aYuichiro Hanada } 29265feee12e5889601e375d92dfdf5f8e8fbb05092Yuichiro Hanada} 293