1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin.makedict;
18
19import com.android.inputmethod.annotations.UsedForTesting;
20import com.android.inputmethod.latin.define.DecoderSpecificConstants;
21import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
22
23import java.io.File;
24import java.io.IOException;
25import java.io.OutputStream;
26import java.util.ArrayList;
27import java.util.Map;
28import java.util.Stack;
29
30public final class BinaryDictIOUtils {
31    private static final boolean DBG = false;
32
33    private BinaryDictIOUtils() {
34        // This utility class is not publicly instantiable.
35    }
36
37    /**
38     * Returns new dictionary decoder.
39     *
40     * @param dictFile the dictionary file.
41     * @param bufferType The type of buffer, as one of USE_* in DictDecoder.
42     * @return new dictionary decoder if the dictionary file exists, otherwise null.
43     */
44    public static DictDecoder getDictDecoder(final File dictFile, final long offset,
45            final long length, final int bufferType) {
46        return new Ver4DictDecoder(dictFile);
47    }
48
49    public static DictDecoder getDictDecoder(final File dictFile, final long offset,
50            final long length, final DictionaryBufferFactory factory) {
51        return new Ver4DictDecoder(dictFile);
52    }
53
54    public static DictDecoder getDictDecoder(final File dictFile, final long offset,
55            final long length) {
56        return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER);
57    }
58
59    private static final class Position {
60        public static final int NOT_READ_PTNODE_COUNT = -1;
61
62        public int mAddress;
63        public int mNumOfPtNode;
64        public int mPosition;
65        public int mLength;
66
67        public Position(int address, int length) {
68            mAddress = address;
69            mLength = length;
70            mNumOfPtNode = NOT_READ_PTNODE_COUNT;
71        }
72    }
73
74    /**
75     * Retrieves all node arrays without recursive call.
76     */
77    private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder,
78            final int bodyOffset, final Map<Integer, String> words,
79            final Map<Integer, Integer> frequencies,
80            final Map<Integer, ArrayList<PendingAttribute>> bigrams) {
81        int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
82
83        Stack<Position> stack = new Stack<>();
84        int index = 0;
85
86        Position initPos = new Position(bodyOffset, 0);
87        stack.push(initPos);
88
89        while (!stack.empty()) {
90            Position p = stack.peek();
91
92            if (DBG) {
93                MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" +
94                        p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength);
95            }
96
97            if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress);
98            if (index != p.mLength) index = p.mLength;
99
100            if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) {
101                p.mNumOfPtNode = dictDecoder.readPtNodeCount();
102                p.mAddress = dictDecoder.getPosition();
103                p.mPosition = 0;
104            }
105            if (p.mNumOfPtNode == 0) {
106                stack.pop();
107                continue;
108            }
109            final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress);
110            for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) {
111                pushedChars[index++] = ptNodeInfo.mCharacters[i];
112            }
113            p.mPosition++;
114            if (ptNodeInfo.isTerminal()) {// found word
115                words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index));
116                frequencies.put(
117                        ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability);
118                if (ptNodeInfo.mBigrams != null) {
119                    bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams);
120                }
121            }
122
123            if (p.mPosition == p.mNumOfPtNode) {
124                stack.pop();
125            } else {
126                // The PtNode array has more PtNodes.
127                p.mAddress = dictDecoder.getPosition();
128            }
129
130            if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) {
131                final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index);
132                stack.push(childrenPos);
133            }
134        }
135    }
136
137    /**
138     * Reads unigrams and bigrams from the binary file.
139     * Doesn't store a full memory representation of the dictionary.
140     *
141     * @param dictDecoder the dict decoder.
142     * @param words the map to store the address as a key and the word as a value.
143     * @param frequencies the map to store the address as a key and the frequency as a value.
144     * @param bigrams the map to store the address as a key and the list of address as a value.
145     * @throws IOException if the file can't be read.
146     * @throws UnsupportedFormatException if the format of the file is not recognized.
147     */
148    /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder,
149            final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
150            final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
151            UnsupportedFormatException {
152        // Read header
153        final DictionaryHeader header = dictDecoder.readHeader();
154        readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words,
155            frequencies, bigrams);
156    }
157
158    /**
159     * Gets the address of the last PtNode of the exact matching word in the dictionary.
160     * If no match is found, returns NOT_VALID_WORD.
161     *
162     * @param dictDecoder the dict decoder.
163     * @param word the word we search for.
164     * @return the address of the terminal node.
165     * @throws IOException if the file can't be read.
166     * @throws UnsupportedFormatException if the format of the file is not recognized.
167     */
168    @UsedForTesting
169    /* package */ static int getTerminalPosition(final DictDecoder dictDecoder,
170            final String word) throws IOException, UnsupportedFormatException {
171        if (word == null) return FormatSpec.NOT_VALID_WORD;
172        dictDecoder.setPosition(0);
173        dictDecoder.readHeader();
174        int wordPos = 0;
175        final int wordLen = word.codePointCount(0, word.length());
176        for (int depth = 0; depth < DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH; ++depth) {
177            if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
178
179            do {
180                final int ptNodeCount = dictDecoder.readPtNodeCount();
181                boolean foundNextPtNode = false;
182                for (int i = 0; i < ptNodeCount; ++i) {
183                    final int ptNodePos = dictDecoder.getPosition();
184                    final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos);
185                    boolean same = true;
186                    for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
187                            p < currentInfo.mCharacters.length;
188                            ++p, j = word.offsetByCodePoints(j, 1)) {
189                        if (wordPos + p >= wordLen
190                                || word.codePointAt(j) != currentInfo.mCharacters[p]) {
191                            same = false;
192                            break;
193                        }
194                    }
195
196                    if (same) {
197                        // found the PtNode matches the word.
198                        if (wordPos + currentInfo.mCharacters.length == wordLen) {
199                            return currentInfo.isTerminal() ? ptNodePos : FormatSpec.NOT_VALID_WORD;
200                        }
201                        wordPos += currentInfo.mCharacters.length;
202                        if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
203                            return FormatSpec.NOT_VALID_WORD;
204                        }
205                        foundNextPtNode = true;
206                        dictDecoder.setPosition(currentInfo.mChildrenAddress);
207                        break;
208                    }
209                }
210                if (foundNextPtNode) break;
211                return FormatSpec.NOT_VALID_WORD;
212            } while(true);
213        }
214        return FormatSpec.NOT_VALID_WORD;
215    }
216
217    /**
218     * Writes a PtNodeCount to the stream.
219     *
220     * @param destination the stream to write.
221     * @param ptNodeCount the count.
222     * @return the size written in bytes.
223     */
224    @UsedForTesting
225    static int writePtNodeCount(final OutputStream destination, final int ptNodeCount)
226            throws IOException {
227        final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
228        // the count must fit on one byte or two bytes.
229        // Please see comments in FormatSpec.
230        if (countSize != 1 && countSize != 2) {
231            throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
232        }
233        final int encodedPtNodeCount = (countSize == 2) ?
234                (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
235        BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize);
236        return countSize;
237    }
238
239    /**
240     * Helper method to hide the actual value of the no children address.
241     */
242    public static boolean hasChildrenAddress(final int address) {
243        return FormatSpec.NO_CHILDREN_ADDRESS != address;
244    }
245
246    /**
247     * Compute the binary size of the node count
248     * @param count the node count
249     * @return the size of the node count, either 1 or 2 bytes.
250     */
251    public static int getPtNodeCountSize(final int count) {
252        if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) {
253            return 1;
254        } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) {
255            return 2;
256        } else {
257            throw new RuntimeException("Can't have more than "
258                    + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found "
259                    + count + ")");
260        }
261    }
262
263    static int getChildrenAddressSize(final int optionFlags) {
264        switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
265            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
266                return 1;
267            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
268                return 2;
269            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
270                return 3;
271            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
272            default:
273                return 0;
274        }
275    }
276
277    /**
278     * Calculate bigram frequency from compressed value
279     *
280     * @param unigramFrequency
281     * @param bigramFrequency compressed frequency
282     * @return approximate bigram frequency
283     */
284    @UsedForTesting
285    public static int reconstructBigramFrequency(final int unigramFrequency,
286            final int bigramFrequency) {
287        final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
288                / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
289        final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f);
290        return (int)resultFreqFloat;
291    }
292}
293