1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin.makedict;
18
19import com.android.inputmethod.latin.Constants;
20import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
21import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
22import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
23import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
24
25import java.io.IOException;
26import java.util.ArrayList;
27import java.util.Map;
28import java.util.Stack;
29
30public final class BinaryDictIOUtils {
31    private static final boolean DBG = false;
32
33    private static final class Position {
34        public static final int NOT_READ_GROUPCOUNT = -1;
35
36        public int mAddress;
37        public int mNumOfCharGroup;
38        public int mPosition;
39        public int mLength;
40
41        public Position(int address, int length) {
42            mAddress = address;
43            mLength = length;
44            mNumOfCharGroup = NOT_READ_GROUPCOUNT;
45        }
46    }
47
48    /**
49     * Tours all node without recursive call.
50     */
51    private static void readUnigramsAndBigramsBinaryInner(
52            final FusionDictionaryBufferInterface buffer, final int headerSize,
53            final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
54            final Map<Integer, ArrayList<PendingAttribute>> bigrams,
55            final FormatOptions formatOptions) {
56        int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
57
58        Stack<Position> stack = new Stack<Position>();
59        int index = 0;
60
61        Position initPos = new Position(headerSize, 0);
62        stack.push(initPos);
63
64        while (!stack.empty()) {
65            Position p = stack.peek();
66
67            if (DBG) {
68                MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" +
69                        p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength);
70            }
71
72            if (buffer.position() != p.mAddress) buffer.position(p.mAddress);
73            if (index != p.mLength) index = p.mLength;
74
75            if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) {
76                p.mNumOfCharGroup = BinaryDictInputOutput.readCharGroupCount(buffer);
77                p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup);
78                p.mPosition = 0;
79            }
80            if (p.mNumOfCharGroup == 0) {
81                stack.pop();
82                continue;
83            }
84            CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer,
85                    p.mAddress - headerSize, formatOptions);
86            for (int i = 0; i < info.mCharacters.length; ++i) {
87                pushedChars[index++] = info.mCharacters[i];
88            }
89            p.mPosition++;
90
91            final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags,
92                    formatOptions);
93            if (!isMovedGroup
94                    && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word
95                words.put(info.mOriginalAddress, new String(pushedChars, 0, index));
96                frequencies.put(info.mOriginalAddress, info.mFrequency);
97                if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams);
98            }
99
100            if (p.mPosition == p.mNumOfCharGroup) {
101                if (formatOptions.mSupportsDynamicUpdate) {
102                    final int forwardLinkAddress = buffer.readUnsignedInt24();
103                    if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) {
104                        // the node has a forward link.
105                        p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT;
106                        p.mAddress = forwardLinkAddress;
107                    } else {
108                        stack.pop();
109                    }
110                } else {
111                    stack.pop();
112                }
113            } else {
114                // the node has more groups.
115                p.mAddress = buffer.position();
116            }
117
118            if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) {
119                Position childrenPos = new Position(info.mChildrenAddress + headerSize, index);
120                stack.push(childrenPos);
121            }
122        }
123    }
124
125    /**
126     * Reads unigrams and bigrams from the binary file.
127     * Doesn't make the memory representation of the dictionary.
128     *
129     * @param buffer the buffer to read.
130     * @param words the map to store the address as a key and the word as a value.
131     * @param frequencies the map to store the address as a key and the frequency as a value.
132     * @param bigrams the map to store the address as a key and the list of address as a value.
133     * @throws IOException
134     * @throws UnsupportedFormatException
135     */
136    public static void readUnigramsAndBigramsBinary(final FusionDictionaryBufferInterface buffer,
137            final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
138            final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
139            UnsupportedFormatException {
140        // Read header
141        final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
142        readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams,
143                header.mFormatOptions);
144    }
145
146    /**
147     * Gets the address of the last CharGroup of the exact matching word in the dictionary.
148     * If no match is found, returns NOT_VALID_WORD.
149     *
150     * @param buffer the buffer to read.
151     * @param word the word we search for.
152     * @return the address of the terminal node.
153     * @throws IOException
154     * @throws UnsupportedFormatException
155     */
156    public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer,
157            final String word) throws IOException, UnsupportedFormatException {
158        if (word == null) return FormatSpec.NOT_VALID_WORD;
159        if (buffer.position() != 0) buffer.position(0);
160
161        final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
162        int wordPos = 0;
163        final int wordLen = word.codePointCount(0, word.length());
164        for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
165            if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
166
167            do {
168                int groupOffset = buffer.position() - header.mHeaderSize;
169                final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
170                groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount);
171
172                boolean foundNextCharGroup = false;
173                for (int i = 0; i < charGroupCount; ++i) {
174                    final int charGroupPos = buffer.position();
175                    final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
176                            buffer.position(), header.mFormatOptions);
177                    if (BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
178                            header.mFormatOptions)) {
179                        continue;
180                    }
181                    boolean same = true;
182                    for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
183                            p < currentInfo.mCharacters.length;
184                            ++p, j = word.offsetByCodePoints(j, 1)) {
185                        if (wordPos + p >= wordLen
186                                || word.codePointAt(j) != currentInfo.mCharacters[p]) {
187                            same = false;
188                            break;
189                        }
190                    }
191
192                    if (same) {
193                        // found the group matches the word.
194                        if (wordPos + currentInfo.mCharacters.length == wordLen) {
195                            if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) {
196                                return FormatSpec.NOT_VALID_WORD;
197                            } else {
198                                return charGroupPos;
199                            }
200                        }
201                        wordPos += currentInfo.mCharacters.length;
202                        if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
203                            return FormatSpec.NOT_VALID_WORD;
204                        }
205                        foundNextCharGroup = true;
206                        buffer.position(currentInfo.mChildrenAddress);
207                        break;
208                    }
209                    groupOffset = currentInfo.mEndAddress;
210                }
211
212                // If we found the next char group, it is under the file pointer.
213                // But if not, we are at the end of this node so we expect to have
214                // a forward link address that we need to consult and possibly resume
215                // search on the next node in the linked list.
216                if (foundNextCharGroup) break;
217                if (!header.mFormatOptions.mSupportsDynamicUpdate) {
218                    return FormatSpec.NOT_VALID_WORD;
219                }
220
221                final int forwardLinkAddress = buffer.readUnsignedInt24();
222                if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
223                    return FormatSpec.NOT_VALID_WORD;
224                }
225                buffer.position(forwardLinkAddress);
226            } while(true);
227        }
228        return FormatSpec.NOT_VALID_WORD;
229    }
230
231    /**
232     * Delete the word from the binary file.
233     *
234     * @param buffer the buffer to write.
235     * @param word the word we delete
236     * @throws IOException
237     * @throws UnsupportedFormatException
238     */
239    public static void deleteWord(final FusionDictionaryBufferInterface buffer,
240            final String word) throws IOException, UnsupportedFormatException {
241        buffer.position(0);
242        final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
243        final int wordPosition = getTerminalPosition(buffer, word);
244        if (wordPosition == FormatSpec.NOT_VALID_WORD) return;
245
246        buffer.position(wordPosition);
247        final int flags = buffer.readUnsignedByte();
248        final int newFlags = flags ^ FormatSpec.FLAG_IS_TERMINAL;
249        buffer.position(wordPosition);
250        buffer.put((byte)newFlags);
251    }
252
253    private static void putSInt24(final FusionDictionaryBufferInterface buffer,
254            final int value) {
255        final int absValue = Math.abs(value);
256        buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
257        buffer.put((byte)((absValue >> 8) & 0xFF));
258        buffer.put((byte)(absValue & 0xFF));
259    }
260
261    /**
262     * Update a parent address in a CharGroup that is addressed by groupOriginAddress.
263     *
264     * @param buffer the buffer to write.
265     * @param groupOriginAddress the address of the group.
266     * @param newParentAddress the absolute address of the parent.
267     * @param formatOptions file format options.
268     */
269    public static void updateParentAddress(final FusionDictionaryBufferInterface buffer,
270            final int groupOriginAddress, final int newParentAddress,
271            final FormatOptions formatOptions) {
272        final int originalPosition = buffer.position();
273        buffer.position(groupOriginAddress);
274        if (!formatOptions.mSupportsDynamicUpdate) {
275            throw new RuntimeException("this file format does not support parent addresses");
276        }
277        final int flags = buffer.readUnsignedByte();
278        final int parentOffset = newParentAddress - groupOriginAddress;
279        putSInt24(buffer, parentOffset);
280        buffer.position(originalPosition);
281    }
282}
283