1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin.makedict;
18
19import com.android.inputmethod.annotations.UsedForTesting;
20import com.android.inputmethod.latin.Constants;
21import com.android.inputmethod.latin.makedict.DictDecoder.DictionaryBufferFactory;
22
23import java.io.File;
24import java.io.IOException;
25import java.io.OutputStream;
26import java.util.ArrayList;
27import java.util.Map;
28import java.util.Stack;
29
30public final class BinaryDictIOUtils {
31    private static final boolean DBG = false;
32
33    private BinaryDictIOUtils() {
34        // This utility class is not publicly instantiable.
35    }
36
37    /**
38     * Returns new dictionary decoder.
39     *
40     * @param dictFile the dictionary file.
41     * @param bufferType The type of buffer, as one of USE_* in DictDecoder.
42     * @return new dictionary decoder if the dictionary file exists, otherwise null.
43     */
44    public static DictDecoder getDictDecoder(final File dictFile, final long offset,
45            final long length, final int bufferType) {
46        if (dictFile.isDirectory()) {
47            return new Ver4DictDecoder(dictFile, bufferType);
48        } else if (dictFile.isFile()) {
49            return new Ver2DictDecoder(dictFile, offset, length, bufferType);
50        }
51        return null;
52    }
53
54    public static DictDecoder getDictDecoder(final File dictFile, final long offset,
55            final long length, final DictionaryBufferFactory factory) {
56        if (dictFile.isDirectory()) {
57            return new Ver4DictDecoder(dictFile, factory);
58        } else if (dictFile.isFile()) {
59            return new Ver2DictDecoder(dictFile, offset, length, factory);
60        }
61        return null;
62    }
63
64    public static DictDecoder getDictDecoder(final File dictFile, final long offset,
65            final long length) {
66        return getDictDecoder(dictFile, offset, length, DictDecoder.USE_READONLY_BYTEBUFFER);
67    }
68
69    private static final class Position {
70        public static final int NOT_READ_PTNODE_COUNT = -1;
71
72        public int mAddress;
73        public int mNumOfPtNode;
74        public int mPosition;
75        public int mLength;
76
77        public Position(int address, int length) {
78            mAddress = address;
79            mLength = length;
80            mNumOfPtNode = NOT_READ_PTNODE_COUNT;
81        }
82    }
83
84    /**
85     * Retrieves all node arrays without recursive call.
86     */
87    private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder,
88            final int bodyOffset, final Map<Integer, String> words,
89            final Map<Integer, Integer> frequencies,
90            final Map<Integer, ArrayList<PendingAttribute>> bigrams) {
91        int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
92
93        Stack<Position> stack = new Stack<>();
94        int index = 0;
95
96        Position initPos = new Position(bodyOffset, 0);
97        stack.push(initPos);
98
99        while (!stack.empty()) {
100            Position p = stack.peek();
101
102            if (DBG) {
103                MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" +
104                        p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength);
105            }
106
107            if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress);
108            if (index != p.mLength) index = p.mLength;
109
110            if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) {
111                p.mNumOfPtNode = dictDecoder.readPtNodeCount();
112                p.mAddress = dictDecoder.getPosition();
113                p.mPosition = 0;
114            }
115            if (p.mNumOfPtNode == 0) {
116                stack.pop();
117                continue;
118            }
119            final PtNodeInfo ptNodeInfo = dictDecoder.readPtNode(p.mAddress);
120            for (int i = 0; i < ptNodeInfo.mCharacters.length; ++i) {
121                pushedChars[index++] = ptNodeInfo.mCharacters[i];
122            }
123            p.mPosition++;
124            if (ptNodeInfo.isTerminal()) {// found word
125                words.put(ptNodeInfo.mOriginalAddress, new String(pushedChars, 0, index));
126                frequencies.put(
127                        ptNodeInfo.mOriginalAddress, ptNodeInfo.mProbabilityInfo.mProbability);
128                if (ptNodeInfo.mBigrams != null) {
129                    bigrams.put(ptNodeInfo.mOriginalAddress, ptNodeInfo.mBigrams);
130                }
131            }
132
133            if (p.mPosition == p.mNumOfPtNode) {
134                stack.pop();
135            } else {
136                // The PtNode array has more PtNodes.
137                p.mAddress = dictDecoder.getPosition();
138            }
139
140            if (hasChildrenAddress(ptNodeInfo.mChildrenAddress)) {
141                final Position childrenPos = new Position(ptNodeInfo.mChildrenAddress, index);
142                stack.push(childrenPos);
143            }
144        }
145    }
146
147    /**
148     * Reads unigrams and bigrams from the binary file.
149     * Doesn't store a full memory representation of the dictionary.
150     *
151     * @param dictDecoder the dict decoder.
152     * @param words the map to store the address as a key and the word as a value.
153     * @param frequencies the map to store the address as a key and the frequency as a value.
154     * @param bigrams the map to store the address as a key and the list of address as a value.
155     * @throws IOException if the file can't be read.
156     * @throws UnsupportedFormatException if the format of the file is not recognized.
157     */
158    /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder,
159            final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
160            final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
161            UnsupportedFormatException {
162        // Read header
163        final DictionaryHeader header = dictDecoder.readHeader();
164        readUnigramsAndBigramsBinaryInner(dictDecoder, header.mBodyOffset, words,
165            frequencies, bigrams);
166    }
167
168    /**
169     * Gets the address of the last PtNode of the exact matching word in the dictionary.
170     * If no match is found, returns NOT_VALID_WORD.
171     *
172     * @param dictDecoder the dict decoder.
173     * @param word the word we search for.
174     * @return the address of the terminal node.
175     * @throws IOException if the file can't be read.
176     * @throws UnsupportedFormatException if the format of the file is not recognized.
177     */
178    @UsedForTesting
179    /* package */ static int getTerminalPosition(final DictDecoder dictDecoder,
180            final String word) throws IOException, UnsupportedFormatException {
181        if (word == null) return FormatSpec.NOT_VALID_WORD;
182        dictDecoder.setPosition(0);
183        dictDecoder.readHeader();
184        int wordPos = 0;
185        final int wordLen = word.codePointCount(0, word.length());
186        for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) {
187            if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
188
189            do {
190                final int ptNodeCount = dictDecoder.readPtNodeCount();
191                boolean foundNextPtNode = false;
192                for (int i = 0; i < ptNodeCount; ++i) {
193                    final int ptNodePos = dictDecoder.getPosition();
194                    final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos);
195                    boolean same = true;
196                    for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
197                            p < currentInfo.mCharacters.length;
198                            ++p, j = word.offsetByCodePoints(j, 1)) {
199                        if (wordPos + p >= wordLen
200                                || word.codePointAt(j) != currentInfo.mCharacters[p]) {
201                            same = false;
202                            break;
203                        }
204                    }
205
206                    if (same) {
207                        // found the PtNode matches the word.
208                        if (wordPos + currentInfo.mCharacters.length == wordLen) {
209                            if (!currentInfo.isTerminal()) {
210                                return FormatSpec.NOT_VALID_WORD;
211                            } else {
212                                return ptNodePos;
213                            }
214                        }
215                        wordPos += currentInfo.mCharacters.length;
216                        if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
217                            return FormatSpec.NOT_VALID_WORD;
218                        }
219                        foundNextPtNode = true;
220                        dictDecoder.setPosition(currentInfo.mChildrenAddress);
221                        break;
222                    }
223                }
224                if (foundNextPtNode) break;
225                return FormatSpec.NOT_VALID_WORD;
226            } while(true);
227        }
228        return FormatSpec.NOT_VALID_WORD;
229    }
230
231    /**
232     * Writes a PtNodeCount to the stream.
233     *
234     * @param destination the stream to write.
235     * @param ptNodeCount the count.
236     * @return the size written in bytes.
237     */
238    @UsedForTesting
239    static int writePtNodeCount(final OutputStream destination, final int ptNodeCount)
240            throws IOException {
241        final int countSize = BinaryDictIOUtils.getPtNodeCountSize(ptNodeCount);
242        // the count must fit on one byte or two bytes.
243        // Please see comments in FormatSpec.
244        if (countSize != 1 && countSize != 2) {
245            throw new RuntimeException("Strange size from getPtNodeCountSize : " + countSize);
246        }
247        final int encodedPtNodeCount = (countSize == 2) ?
248                (ptNodeCount | FormatSpec.LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG) : ptNodeCount;
249        BinaryDictEncoderUtils.writeUIntToStream(destination, encodedPtNodeCount, countSize);
250        return countSize;
251    }
252
253    /**
254     * Helper method to hide the actual value of the no children address.
255     */
256    public static boolean hasChildrenAddress(final int address) {
257        return FormatSpec.NO_CHILDREN_ADDRESS != address;
258    }
259
260    /**
261     * Compute the binary size of the node count
262     * @param count the node count
263     * @return the size of the node count, either 1 or 2 bytes.
264     */
265    public static int getPtNodeCountSize(final int count) {
266        if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) {
267            return 1;
268        } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) {
269            return 2;
270        } else {
271            throw new RuntimeException("Can't have more than "
272                    + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found "
273                    + count + ")");
274        }
275    }
276
277    static int getChildrenAddressSize(final int optionFlags) {
278        switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
279            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
280                return 1;
281            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
282                return 2;
283            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
284                return 3;
285            case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
286            default:
287                return 0;
288        }
289    }
290
291    /**
292     * Calculate bigram frequency from compressed value
293     *
294     * @param unigramFrequency
295     * @param bigramFrequency compressed frequency
296     * @return approximate bigram frequency
297     */
298    @UsedForTesting
299    public static int reconstructBigramFrequency(final int unigramFrequency,
300            final int bigramFrequency) {
301        final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
302                / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
303        final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f);
304        return (int)resultFreqFloat;
305    }
306}
307