BinaryDictIOUtils.java revision 7f438aa12f683ac15c8c8e60ca852412d00128db
1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin.makedict;
18
19import com.android.inputmethod.latin.Constants;
20import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding;
21import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
22import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
23import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
24import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
25import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
26
27import java.io.IOException;
28import java.io.OutputStream;
29import java.util.ArrayList;
30import java.util.Iterator;
31import java.util.List;
32import java.util.Map;
33import java.util.Stack;
34
35public final class BinaryDictIOUtils {
36    private static final boolean DBG = false;
37
38    private static final class Position {
39        public static final int NOT_READ_GROUPCOUNT = -1;
40
41        public int mAddress;
42        public int mNumOfCharGroup;
43        public int mPosition;
44        public int mLength;
45
46        public Position(int address, int length) {
47            mAddress = address;
48            mLength = length;
49            mNumOfCharGroup = NOT_READ_GROUPCOUNT;
50        }
51    }
52
53    /**
54     * Tours all node without recursive call.
55     */
56    private static void readUnigramsAndBigramsBinaryInner(
57            final FusionDictionaryBufferInterface buffer, final int headerSize,
58            final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
59            final Map<Integer, ArrayList<PendingAttribute>> bigrams,
60            final FormatOptions formatOptions) {
61        int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
62
63        Stack<Position> stack = new Stack<Position>();
64        int index = 0;
65
66        Position initPos = new Position(headerSize, 0);
67        stack.push(initPos);
68
69        while (!stack.empty()) {
70            Position p = stack.peek();
71
72            if (DBG) {
73                MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" +
74                        p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength);
75            }
76
77            if (buffer.position() != p.mAddress) buffer.position(p.mAddress);
78            if (index != p.mLength) index = p.mLength;
79
80            if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) {
81                p.mNumOfCharGroup = BinaryDictInputOutput.readCharGroupCount(buffer);
82                p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup);
83                p.mPosition = 0;
84            }
85            if (p.mNumOfCharGroup == 0) {
86                stack.pop();
87                continue;
88            }
89            CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer,
90                    p.mAddress - headerSize, formatOptions);
91            for (int i = 0; i < info.mCharacters.length; ++i) {
92                pushedChars[index++] = info.mCharacters[i];
93            }
94            p.mPosition++;
95
96            final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags,
97                    formatOptions);
98            if (!isMovedGroup
99                    && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word
100                words.put(info.mOriginalAddress, new String(pushedChars, 0, index));
101                frequencies.put(info.mOriginalAddress, info.mFrequency);
102                if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams);
103            }
104
105            if (p.mPosition == p.mNumOfCharGroup) {
106                if (formatOptions.mSupportsDynamicUpdate) {
107                    final int forwardLinkAddress = buffer.readUnsignedInt24();
108                    if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) {
109                        // the node has a forward link.
110                        p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT;
111                        p.mAddress = forwardLinkAddress;
112                    } else {
113                        stack.pop();
114                    }
115                } else {
116                    stack.pop();
117                }
118            } else {
119                // the node has more groups.
120                p.mAddress = buffer.position();
121            }
122
123            if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) {
124                Position childrenPos = new Position(info.mChildrenAddress + headerSize, index);
125                stack.push(childrenPos);
126            }
127        }
128    }
129
130    /**
131     * Reads unigrams and bigrams from the binary file.
132     * Doesn't make the memory representation of the dictionary.
133     *
134     * @param buffer the buffer to read.
135     * @param words the map to store the address as a key and the word as a value.
136     * @param frequencies the map to store the address as a key and the frequency as a value.
137     * @param bigrams the map to store the address as a key and the list of address as a value.
138     * @throws IOException
139     * @throws UnsupportedFormatException
140     */
141    public static void readUnigramsAndBigramsBinary(final FusionDictionaryBufferInterface buffer,
142            final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
143            final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
144            UnsupportedFormatException {
145        // Read header
146        final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
147        readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams,
148                header.mFormatOptions);
149    }
150
151    /**
152     * Gets the address of the last CharGroup of the exact matching word in the dictionary.
153     * If no match is found, returns NOT_VALID_WORD.
154     *
155     * @param buffer the buffer to read.
156     * @param word the word we search for.
157     * @return the address of the terminal node.
158     * @throws IOException
159     * @throws UnsupportedFormatException
160     */
161    public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer,
162            final String word) throws IOException, UnsupportedFormatException {
163        if (word == null) return FormatSpec.NOT_VALID_WORD;
164        if (buffer.position() != 0) buffer.position(0);
165
166        final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
167        int wordPos = 0;
168        final int wordLen = word.codePointCount(0, word.length());
169        for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
170            if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
171
172            do {
173                int groupOffset = buffer.position() - header.mHeaderSize;
174                final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
175                groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount);
176
177                boolean foundNextCharGroup = false;
178                for (int i = 0; i < charGroupCount; ++i) {
179                    final int charGroupPos = buffer.position();
180                    final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
181                            buffer.position(), header.mFormatOptions);
182                    if (BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
183                            header.mFormatOptions)) {
184                        continue;
185                    }
186                    boolean same = true;
187                    for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
188                            p < currentInfo.mCharacters.length;
189                            ++p, j = word.offsetByCodePoints(j, 1)) {
190                        if (wordPos + p >= wordLen
191                                || word.codePointAt(j) != currentInfo.mCharacters[p]) {
192                            same = false;
193                            break;
194                        }
195                    }
196
197                    if (same) {
198                        // found the group matches the word.
199                        if (wordPos + currentInfo.mCharacters.length == wordLen) {
200                            if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) {
201                                return FormatSpec.NOT_VALID_WORD;
202                            } else {
203                                return charGroupPos;
204                            }
205                        }
206                        wordPos += currentInfo.mCharacters.length;
207                        if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
208                            return FormatSpec.NOT_VALID_WORD;
209                        }
210                        foundNextCharGroup = true;
211                        buffer.position(currentInfo.mChildrenAddress);
212                        break;
213                    }
214                    groupOffset = currentInfo.mEndAddress;
215                }
216
217                // If we found the next char group, it is under the file pointer.
218                // But if not, we are at the end of this node so we expect to have
219                // a forward link address that we need to consult and possibly resume
220                // search on the next node in the linked list.
221                if (foundNextCharGroup) break;
222                if (!header.mFormatOptions.mSupportsDynamicUpdate) {
223                    return FormatSpec.NOT_VALID_WORD;
224                }
225
226                final int forwardLinkAddress = buffer.readUnsignedInt24();
227                if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
228                    return FormatSpec.NOT_VALID_WORD;
229                }
230                buffer.position(forwardLinkAddress);
231            } while(true);
232        }
233        return FormatSpec.NOT_VALID_WORD;
234    }
235
236    /**
237     * Delete the word from the binary file.
238     *
239     * @param buffer the buffer to write.
240     * @param word the word we delete
241     * @throws IOException
242     * @throws UnsupportedFormatException
243     */
244    public static void deleteWord(final FusionDictionaryBufferInterface buffer,
245            final String word) throws IOException, UnsupportedFormatException {
246        buffer.position(0);
247        final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
248        final int wordPosition = getTerminalPosition(buffer, word);
249        if (wordPosition == FormatSpec.NOT_VALID_WORD) return;
250
251        buffer.position(wordPosition);
252        final int flags = buffer.readUnsignedByte();
253        final int newFlags = flags ^ FormatSpec.FLAG_IS_TERMINAL;
254        buffer.position(wordPosition);
255        buffer.put((byte)newFlags);
256    }
257
258    /**
259     * @return the size written, in bytes. Always 3 bytes.
260     */
261    private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer,
262            final int value) {
263        final int absValue = Math.abs(value);
264        buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
265        buffer.put((byte)((absValue >> 8) & 0xFF));
266        buffer.put((byte)(absValue & 0xFF));
267        return 3;
268    }
269
270    /**
271     * @return the size written, in bytes. Always 3 bytes.
272     */
273    private static int writeSInt24ToStream(final OutputStream destination, final int value)
274            throws IOException {
275        final int absValue = Math.abs(value);
276        destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
277        destination.write((byte)((absValue >> 8) & 0xFF));
278        destination.write((byte)(absValue & 0xFF));
279        return 3;
280    }
281
282    /**
283     * @return the size written, in bytes. 1, 2, or 3 bytes.
284     */
285    private static int writeVariableAddress(final OutputStream destination, final int value)
286            throws IOException {
287        switch (BinaryDictInputOutput.getByteSize(value)) {
288        case 1:
289            destination.write((byte)value);
290            break;
291        case 2:
292            destination.write((byte)(0xFF & (value >> 8)));
293            destination.write((byte)(0xFF & value));
294            break;
295        case 3:
296            destination.write((byte)(0xFF & (value >> 16)));
297            destination.write((byte)(0xFF & (value >> 8)));
298            destination.write((byte)(0xFF & value));
299            break;
300        }
301        return BinaryDictInputOutput.getByteSize(value);
302    }
303
304    /**
305     * Update a parent address in a CharGroup that is addressed by groupOriginAddress.
306     *
307     * @param buffer the buffer to write.
308     * @param groupOriginAddress the address of the group.
309     * @param newParentAddress the absolute address of the parent.
310     * @param formatOptions file format options.
311     */
312    public static void updateParentAddress(final FusionDictionaryBufferInterface buffer,
313            final int groupOriginAddress, final int newParentAddress,
314            final FormatOptions formatOptions) {
315        final int originalPosition = buffer.position();
316        buffer.position(groupOriginAddress);
317        if (!formatOptions.mSupportsDynamicUpdate) {
318            throw new RuntimeException("this file format does not support parent addresses");
319        }
320        final int flags = buffer.readUnsignedByte();
321        final int parentOffset = newParentAddress - groupOriginAddress;
322        writeSInt24ToBuffer(buffer, parentOffset);
323        buffer.position(originalPosition);
324    }
325
326    private static void skipString(final FusionDictionaryBufferInterface buffer,
327            final boolean hasMultipleChars) {
328        if (hasMultipleChars) {
329            int character = CharEncoding.readChar(buffer);
330            while (character != FormatSpec.INVALID_CHARACTER) {
331                character = CharEncoding.readChar(buffer);
332            }
333        } else {
334            CharEncoding.readChar(buffer);
335        }
336    }
337
338    /**
339     * Write a string to a stream.
340     *
341     * @param destination the stream to write.
342     * @param word the string to be written.
343     * @return the size written, in bytes.
344     * @throws IOException
345     */
346    private static int writeString(final OutputStream destination, final String word)
347            throws IOException {
348        int size = 0;
349        final int length = word.length();
350        for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
351            final int codePoint = word.codePointAt(i);
352            if (CharEncoding.getCharSize(codePoint) == 1) {
353                destination.write((byte)codePoint);
354                size++;
355            } else {
356                destination.write((byte)(0xFF & (codePoint >> 16)));
357                destination.write((byte)(0xFF & (codePoint >> 8)));
358                destination.write((byte)(0xFF & codePoint));
359                size += 3;
360            }
361        }
362        destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
363        size++;
364        return size;
365    }
366
367    /**
368     * Update a children address in a CharGroup that is addressed by groupOriginAddress.
369     *
370     * @param buffer the buffer to write.
371     * @param groupOriginAddress the address of the group.
372     * @param newChildrenAddress the absolute address of the child.
373     * @param formatOptions file format options.
374     */
375    public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer,
376            final int groupOriginAddress, final int newChildrenAddress,
377            final FormatOptions formatOptions) {
378        final int originalPosition = buffer.position();
379        buffer.position(groupOriginAddress);
380        final int flags = buffer.readUnsignedByte();
381        final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions);
382        skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
383        if ((FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte();
384        final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS
385                ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position();
386        writeSInt24ToBuffer(buffer, childrenOffset);
387        buffer.position(originalPosition);
388    }
389
390    /**
391     * Write a char group to an output stream.
392     * A char group is an in-memory representation of a node in trie.
393     * A char group info is an on-disk representation of a node.
394     *
395     * @param destination the stream to write.
396     * @param info the char group info to be written.
397     * @return the size written, in bytes.
398     */
399    public static int writeCharGroup(final OutputStream destination, final CharGroupInfo info)
400            throws IOException {
401        int size = 1;
402        destination.write((byte)info.mFlags);
403        final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ?
404                FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress;
405        size += writeSInt24ToStream(destination, parentOffset);
406
407        for (int i = 0; i < info.mCharacters.length; ++i) {
408            if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) {
409                destination.write((byte)info.mCharacters[i]);
410                size++;
411            } else {
412                size += writeSInt24ToStream(destination, info.mCharacters[i]);
413            }
414        }
415        if (info.mCharacters.length > 1) {
416            destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
417            size++;
418        }
419
420        if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
421            destination.write((byte)info.mFrequency);
422            size++;
423        }
424
425        final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ?
426                0 : info.mChildrenAddress - info.mOriginalAddress;
427        writeSInt24ToStream(destination, childrenOffset);
428
429        if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) {
430            final int shortcutListSize =
431                    BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets);
432            destination.write((byte)(shortcutListSize >> 8));
433            destination.write((byte)(shortcutListSize & 0xFF));
434            size += 2;
435            final Iterator<WeightedString> shortcutIterator = info.mShortcutTargets.iterator();
436            while (shortcutIterator.hasNext()) {
437                final WeightedString target = shortcutIterator.next();
438                destination.write((byte)BinaryDictInputOutput.makeShortcutFlags(
439                        shortcutIterator.hasNext(), target.mFrequency));
440                size++;
441                size += writeString(destination, target.mWord);
442            }
443        }
444
445        if (info.mBigrams != null) {
446            // TODO: Consolidate this code with the code that computes the size of the bigram list
447            //        in BinaryDictionaryInputOutput#computeActualNodeSize
448            for (int i = 0; i < info.mBigrams.size(); ++i) {
449                final int bigramOffset = info.mBigrams.get(i).mAddress - info.mOriginalAddress;
450                final int bigramFrequency = info.mBigrams.get(i).mFrequency;
451                int bigramFlags = (i < info.mBigrams.size() - 1)
452                        ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0;
453                bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0;
454                switch (BinaryDictInputOutput.getByteSize(bigramOffset)) {
455                case 1:
456                    bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
457                    break;
458                case 2:
459                    bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
460                    break;
461                case 3:
462                    bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
463                    break;
464                }
465                bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY;
466                destination.write((byte)bigramFlags);
467                size++;
468                size += writeVariableAddress(destination, Math.abs(bigramOffset));
469            }
470        }
471        return size;
472    }
473}
474