1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin.makedict;
18
19import com.android.inputmethod.annotations.UsedForTesting;
20
21import java.io.File;
22import java.io.IOException;
23import java.io.OutputStream;
24import java.nio.ByteBuffer;
25
26/**
27 * Decodes binary files for a FusionDictionary.
28 *
29 * All the methods in this class are static.
30 *
31 * TODO: Move this file to makedict/internal.
32 * TODO: Rename this class to DictDecoderUtils.
33 */
34public final class BinaryDictDecoderUtils {
35    private BinaryDictDecoderUtils() {
36        // This utility class is not publicly instantiable.
37    }
38
39    @UsedForTesting
40    public interface DictBuffer {
41        public int readUnsignedByte();
42        public int readUnsignedShort();
43        public int readUnsignedInt24();
44        public int readInt();
45        public int position();
46        public void position(int newPosition);
47        @UsedForTesting
48        public void put(final byte b);
49        public int limit();
50        @UsedForTesting
51        public int capacity();
52    }
53
54    public static final class ByteBufferDictBuffer implements DictBuffer {
55        private ByteBuffer mBuffer;
56
57        public ByteBufferDictBuffer(final ByteBuffer buffer) {
58            mBuffer = buffer;
59        }
60
61        @Override
62        public int readUnsignedByte() {
63            return mBuffer.get() & 0xFF;
64        }
65
66        @Override
67        public int readUnsignedShort() {
68            return mBuffer.getShort() & 0xFFFF;
69        }
70
71        @Override
72        public int readUnsignedInt24() {
73            final int retval = readUnsignedByte();
74            return (retval << 16) + readUnsignedShort();
75        }
76
77        @Override
78        public int readInt() {
79            return mBuffer.getInt();
80        }
81
82        @Override
83        public int position() {
84            return mBuffer.position();
85        }
86
87        @Override
88        public void position(int newPos) {
89            mBuffer.position(newPos);
90        }
91
92        @Override
93        public void put(final byte b) {
94            mBuffer.put(b);
95        }
96
97        @Override
98        public int limit() {
99            return mBuffer.limit();
100        }
101
102        @Override
103        public int capacity() {
104            return mBuffer.capacity();
105        }
106    }
107
108    /**
109     * A class grouping utility function for our specific character encoding.
110     */
111    static final class CharEncoding {
112        private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
113        private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;
114
115        /**
116         * Helper method to find out whether this code fits on one byte
117         */
118        private static boolean fitsOnOneByte(final int character) {
119            return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE
120                    && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
121        }
122
123        /**
124         * Compute the size of a character given its character code.
125         *
126         * Char format is:
127         * 1 byte = bbbbbbbb match
128         * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
129         * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
130         *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
131         *       00011111 would be outside unicode.
132         * else: iso-latin-1 code
133         * This allows for the whole unicode range to be encoded, including chars outside of
134         * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
135         * characters which should never happen anyway (and still work, but take 3 bytes).
136         *
137         * @param character the character code.
138         * @return the size in binary encoded-form, either 1 or 3 bytes.
139         */
140        static int getCharSize(final int character) {
141            // See char encoding in FusionDictionary.java
142            if (fitsOnOneByte(character)) return 1;
143            if (FormatSpec.INVALID_CHARACTER == character) return 1;
144            return 3;
145        }
146
147        /**
148         * Compute the byte size of a character array.
149         */
150        static int getCharArraySize(final int[] chars) {
151            int size = 0;
152            for (int character : chars) size += getCharSize(character);
153            return size;
154        }
155
156        /**
157         * Writes a char array to a byte buffer.
158         *
159         * @param codePoints the code point array to write.
160         * @param buffer the byte buffer to write to.
161         * @param index the index in buffer to write the character array to.
162         * @return the index after the last character.
163         */
164        static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) {
165            for (int codePoint : codePoints) {
166                if (1 == getCharSize(codePoint)) {
167                    buffer[index++] = (byte)codePoint;
168                } else {
169                    buffer[index++] = (byte)(0xFF & (codePoint >> 16));
170                    buffer[index++] = (byte)(0xFF & (codePoint >> 8));
171                    buffer[index++] = (byte)(0xFF & codePoint);
172                }
173            }
174            return index;
175        }
176
177        /**
178         * Writes a string with our character format to a byte buffer.
179         *
180         * This will also write the terminator byte.
181         *
182         * @param buffer the byte buffer to write to.
183         * @param origin the offset to write from.
184         * @param word the string to write.
185         * @return the size written, in bytes.
186         */
187        static int writeString(final byte[] buffer, final int origin, final String word) {
188            final int length = word.length();
189            int index = origin;
190            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
191                final int codePoint = word.codePointAt(i);
192                if (1 == getCharSize(codePoint)) {
193                    buffer[index++] = (byte)codePoint;
194                } else {
195                    buffer[index++] = (byte)(0xFF & (codePoint >> 16));
196                    buffer[index++] = (byte)(0xFF & (codePoint >> 8));
197                    buffer[index++] = (byte)(0xFF & codePoint);
198                }
199            }
200            buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
201            return index - origin;
202        }
203
204        /**
205         * Writes a string with our character format to an OutputStream.
206         *
207         * This will also write the terminator byte.
208         *
209         * @param stream the OutputStream to write to.
210         * @param word the string to write.
211         * @return the size written, in bytes.
212         */
213        static int writeString(final OutputStream stream, final String word) throws IOException {
214            final int length = word.length();
215            int written = 0;
216            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
217                final int codePoint = word.codePointAt(i);
218                final int charSize = getCharSize(codePoint);
219                if (1 == charSize) {
220                    stream.write((byte) codePoint);
221                } else {
222                    stream.write((byte) (0xFF & (codePoint >> 16)));
223                    stream.write((byte) (0xFF & (codePoint >> 8)));
224                    stream.write((byte) (0xFF & codePoint));
225                }
226                written += charSize;
227            }
228            stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
229            written += FormatSpec.PTNODE_TERMINATOR_SIZE;
230            return written;
231        }
232
233        /**
234         * Reads a string from a DictBuffer. This is the converse of the above method.
235         */
236        static String readString(final DictBuffer dictBuffer) {
237            final StringBuilder s = new StringBuilder();
238            int character = readChar(dictBuffer);
239            while (character != FormatSpec.INVALID_CHARACTER) {
240                s.appendCodePoint(character);
241                character = readChar(dictBuffer);
242            }
243            return s.toString();
244        }
245
246        /**
247         * Reads a character from the buffer.
248         *
249         * This follows the character format documented earlier in this source file.
250         *
251         * @param dictBuffer the buffer, positioned over an encoded character.
252         * @return the character code.
253         */
254        static int readChar(final DictBuffer dictBuffer) {
255            int character = dictBuffer.readUnsignedByte();
256            if (!fitsOnOneByte(character)) {
257                if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
258                    return FormatSpec.INVALID_CHARACTER;
259                }
260                character <<= 16;
261                character += dictBuffer.readUnsignedShort();
262            }
263            return character;
264        }
265    }
266
267    /**
268     * Reads and returns the PtNode count out of a buffer and forwards the pointer.
269     */
270    /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) {
271        final int msb = dictBuffer.readUnsignedByte();
272        if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) {
273            return msb;
274        } else {
275            return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8)
276                    + dictBuffer.readUnsignedByte();
277        }
278    }
279
280    /**
281     * Finds, as a string, the word at the position passed as an argument.
282     *
283     * @param dictDecoder the dict decoder.
284     * @param headerSize the size of the header.
285     * @param pos the position to seek.
286     * @return the word with its frequency, as a weighted string.
287     */
288    @UsedForTesting
289    /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder,
290            final int headerSize, final int pos) {
291        final WeightedString result;
292        final int originalPos = dictDecoder.getPosition();
293        dictDecoder.setPosition(pos);
294        result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos);
295        dictDecoder.setPosition(originalPos);
296        return result;
297    }
298
299    private static WeightedString getWordAtPositionWithoutParentAddress(
300            final DictDecoder dictDecoder, final int headerSize, final int pos) {
301        dictDecoder.setPosition(headerSize);
302        final int count = dictDecoder.readPtNodeCount();
303        int groupPos = dictDecoder.getPosition();
304        final StringBuilder builder = new StringBuilder();
305        WeightedString result = null;
306
307        PtNodeInfo last = null;
308        for (int i = count - 1; i >= 0; --i) {
309            PtNodeInfo info = dictDecoder.readPtNode(groupPos);
310            groupPos = info.mEndAddress;
311            if (info.mOriginalAddress == pos) {
312                builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
313                result = new WeightedString(builder.toString(), info.mProbabilityInfo);
314                break; // and return
315            }
316            if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
317                if (info.mChildrenAddress > pos) {
318                    if (null == last) continue;
319                    builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
320                    dictDecoder.setPosition(last.mChildrenAddress);
321                    i = dictDecoder.readPtNodeCount();
322                    groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
323                    last = null;
324                    continue;
325                }
326                last = info;
327            }
328            if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
329                builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
330                dictDecoder.setPosition(last.mChildrenAddress);
331                i = dictDecoder.readPtNodeCount();
332                groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
333                last = null;
334                continue;
335            }
336        }
337        return result;
338    }
339
340    /**
341     * Helper method to pass a file name instead of a File object to isBinaryDictionary.
342     */
343    public static boolean isBinaryDictionary(final String filename) {
344        final File file = new File(filename);
345        return isBinaryDictionary(file);
346    }
347
348    /**
349     * Basic test to find out whether the file is a binary dictionary or not.
350     *
351     * @param file The file to test.
352     * @return true if it's a binary dictionary, false otherwise
353     */
354    public static boolean isBinaryDictionary(final File file) {
355        final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
356        if (dictDecoder == null) {
357            return false;
358        }
359        return dictDecoder.hasValidRawBinaryDictionary();
360    }
361}
362