1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin.makedict;
18
19import com.android.inputmethod.annotations.UsedForTesting;
20import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
21
22import java.io.File;
23import java.io.IOException;
24import java.io.OutputStream;
25import java.nio.ByteBuffer;
26import java.util.HashMap;
27import java.util.LinkedList;
28
29import javax.annotation.Nonnull;
30
31/**
32 * Decodes binary files for a FusionDictionary.
33 *
34 * All the methods in this class are static.
35 *
36 * TODO: Move this file to makedict/internal.
37 * TODO: Rename this class to DictDecoderUtils.
38 */
39public final class BinaryDictDecoderUtils {
    /**
     * Private constructor: this class only exposes static members and nested types, so it is
     * never meant to be instantiated.
     */
    private BinaryDictDecoderUtils() {
        // This utility class is not publicly instantiable.
    }
43
44    @UsedForTesting
45    public interface DictBuffer {
46        public int readUnsignedByte();
47        public int readUnsignedShort();
48        public int readUnsignedInt24();
49        public int readInt();
50        public int position();
51        public void position(int newPosition);
52        @UsedForTesting
53        public void put(final byte b);
54        public int limit();
55        @UsedForTesting
56        public int capacity();
57    }
58
59    public static final class ByteBufferDictBuffer implements DictBuffer {
60        private ByteBuffer mBuffer;
61
62        public ByteBufferDictBuffer(final ByteBuffer buffer) {
63            mBuffer = buffer;
64        }
65
66        @Override
67        public int readUnsignedByte() {
68            return mBuffer.get() & 0xFF;
69        }
70
71        @Override
72        public int readUnsignedShort() {
73            return mBuffer.getShort() & 0xFFFF;
74        }
75
76        @Override
77        public int readUnsignedInt24() {
78            final int retval = readUnsignedByte();
79            return (retval << 16) + readUnsignedShort();
80        }
81
82        @Override
83        public int readInt() {
84            return mBuffer.getInt();
85        }
86
87        @Override
88        public int position() {
89            return mBuffer.position();
90        }
91
92        @Override
93        public void position(int newPos) {
94            mBuffer.position(newPos);
95        }
96
97        @Override
98        public void put(final byte b) {
99            mBuffer.put(b);
100        }
101
102        @Override
103        public int limit() {
104            return mBuffer.limit();
105        }
106
107        @Override
108        public int capacity() {
109            return mBuffer.capacity();
110        }
111    }
112
113    /**
114     * A class grouping utility function for our specific character encoding.
115     */
116    static final class CharEncoding {
117
118        /**
119         * Helper method to find out whether this code fits on one byte
120         */
121        private static boolean fitsOnOneByte(final int character,
122                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
123            int codePoint = character;
124            if (codePointToOneByteCodeMap != null) {
125                if (codePointToOneByteCodeMap.containsKey(character)) {
126                    codePoint = codePointToOneByteCodeMap.get(character);
127                }
128            }
129            return codePoint >= FormatSpec.MINIMAL_ONE_BYTE_CHARACTER_VALUE
130                    && codePoint <= FormatSpec.MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
131        }
132
133        /**
134         * Compute the size of a character given its character code.
135         *
136         * Char format is:
137         * 1 byte = bbbbbbbb match
138         * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
139         * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
140         *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
141         *       00011111 would be outside unicode.
142         * else: iso-latin-1 code
143         * This allows for the whole unicode range to be encoded, including chars outside of
144         * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
145         * characters which should never happen anyway (and still work, but take 3 bytes).
146         *
147         * @param character the character code.
148         * @return the size in binary encoded-form, either 1 or 3 bytes.
149         */
150        static int getCharSize(final int character,
151                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
152            // See char encoding in FusionDictionary.java
153            if (fitsOnOneByte(character, codePointToOneByteCodeMap)) return 1;
154            if (FormatSpec.INVALID_CHARACTER == character) return 1;
155            return 3;
156        }
157
158        /**
159         * Compute the byte size of a character array.
160         */
161        static int getCharArraySize(final int[] chars,
162                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
163            int size = 0;
164            for (int character : chars) size += getCharSize(character, codePointToOneByteCodeMap);
165            return size;
166        }
167
168        /**
169         * Writes a char array to a byte buffer.
170         *
171         * @param codePoints the code point array to write.
172         * @param buffer the byte buffer to write to.
173         * @param fromIndex the index in buffer to write the character array to.
174         * @param codePointToOneByteCodeMap the map to convert the code point.
175         * @return the index after the last character.
176         */
177        static int writeCharArray(final int[] codePoints, final byte[] buffer, final int fromIndex,
178                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
179            int index = fromIndex;
180            for (int codePoint : codePoints) {
181                if (codePointToOneByteCodeMap != null) {
182                    if (codePointToOneByteCodeMap.containsKey(codePoint)) {
183                        // Convert code points
184                        codePoint = codePointToOneByteCodeMap.get(codePoint);
185                    }
186                }
187                if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
188                    buffer[index++] = (byte)codePoint;
189                } else {
190                    buffer[index++] = (byte)(0xFF & (codePoint >> 16));
191                    buffer[index++] = (byte)(0xFF & (codePoint >> 8));
192                    buffer[index++] = (byte)(0xFF & codePoint);
193                }
194            }
195            return index;
196        }
197
198        /**
199         * Writes a string with our character format to a byte buffer.
200         *
201         * This will also write the terminator byte.
202         *
203         * @param buffer the byte buffer to write to.
204         * @param origin the offset to write from.
205         * @param word the string to write.
206         * @return the size written, in bytes.
207         */
208        static int writeString(final byte[] buffer, final int origin, final String word,
209                final HashMap<Integer, Integer> codePointToOneByteCodeMap) {
210            final int length = word.length();
211            int index = origin;
212            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
213                int codePoint = word.codePointAt(i);
214                if (codePointToOneByteCodeMap != null) {
215                    if (codePointToOneByteCodeMap.containsKey(codePoint)) {
216                        // Convert code points
217                        codePoint = codePointToOneByteCodeMap.get(codePoint);
218                    }
219                }
220                if (1 == getCharSize(codePoint, codePointToOneByteCodeMap)) {
221                    buffer[index++] = (byte)codePoint;
222                } else {
223                    buffer[index++] = (byte)(0xFF & (codePoint >> 16));
224                    buffer[index++] = (byte)(0xFF & (codePoint >> 8));
225                    buffer[index++] = (byte)(0xFF & codePoint);
226                }
227            }
228            buffer[index++] = FormatSpec.PTNODE_CHARACTERS_TERMINATOR;
229            return index - origin;
230        }
231
232        /**
233         * Writes a string with our character format to an OutputStream.
234         *
235         * This will also write the terminator byte.
236         *
237         * @param stream the OutputStream to write to.
238         * @param word the string to write.
239         * @return the size written, in bytes.
240         */
241        static int writeString(final OutputStream stream, final String word,
242                final HashMap<Integer, Integer> codePointToOneByteCodeMap) throws IOException {
243            final int length = word.length();
244            int written = 0;
245            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
246                final int codePoint = word.codePointAt(i);
247                final int charSize = getCharSize(codePoint, codePointToOneByteCodeMap);
248                if (1 == charSize) {
249                    stream.write((byte) codePoint);
250                } else {
251                    stream.write((byte) (0xFF & (codePoint >> 16)));
252                    stream.write((byte) (0xFF & (codePoint >> 8)));
253                    stream.write((byte) (0xFF & codePoint));
254                }
255                written += charSize;
256            }
257            stream.write(FormatSpec.PTNODE_CHARACTERS_TERMINATOR);
258            written += FormatSpec.PTNODE_TERMINATOR_SIZE;
259            return written;
260        }
261
262        /**
263         * Reads a string from a DictBuffer. This is the converse of the above method.
264         */
265        static String readString(final DictBuffer dictBuffer) {
266            final StringBuilder s = new StringBuilder();
267            int character = readChar(dictBuffer);
268            while (character != FormatSpec.INVALID_CHARACTER) {
269                s.appendCodePoint(character);
270                character = readChar(dictBuffer);
271            }
272            return s.toString();
273        }
274
275        /**
276         * Reads a character from the buffer.
277         *
278         * This follows the character format documented earlier in this source file.
279         *
280         * @param dictBuffer the buffer, positioned over an encoded character.
281         * @return the character code.
282         */
283        static int readChar(final DictBuffer dictBuffer) {
284            int character = dictBuffer.readUnsignedByte();
285            if (!fitsOnOneByte(character, null)) {
286                if (FormatSpec.PTNODE_CHARACTERS_TERMINATOR == character) {
287                    return FormatSpec.INVALID_CHARACTER;
288                }
289                character <<= 16;
290                character += dictBuffer.readUnsignedShort();
291            }
292            return character;
293        }
294    }
295
296    /**
297     * Reads and returns the PtNode count out of a buffer and forwards the pointer.
298     */
299    /* package */ static int readPtNodeCount(final DictBuffer dictBuffer) {
300        final int msb = dictBuffer.readUnsignedByte();
301        if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= msb) {
302            return msb;
303        }
304        return ((FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT & msb) << 8)
305                + dictBuffer.readUnsignedByte();
306    }
307
308    /**
309     * Finds, as a string, the word at the position passed as an argument.
310     *
311     * @param dictDecoder the dict decoder.
312     * @param headerSize the size of the header.
313     * @param pos the position to seek.
314     * @return the word with its frequency, as a weighted string.
315     */
316    @UsedForTesting
317    /* package for tests */ static WeightedString getWordAtPosition(final DictDecoder dictDecoder,
318            final int headerSize, final int pos) {
319        final WeightedString result;
320        final int originalPos = dictDecoder.getPosition();
321        dictDecoder.setPosition(pos);
322        result = getWordAtPositionWithoutParentAddress(dictDecoder, headerSize, pos);
323        dictDecoder.setPosition(originalPos);
324        return result;
325    }
326
327    private static WeightedString getWordAtPositionWithoutParentAddress(
328            final DictDecoder dictDecoder, final int headerSize, final int pos) {
329        dictDecoder.setPosition(headerSize);
330        final int count = dictDecoder.readPtNodeCount();
331        int groupPos = dictDecoder.getPosition();
332        final StringBuilder builder = new StringBuilder();
333        WeightedString result = null;
334
335        PtNodeInfo last = null;
336        for (int i = count - 1; i >= 0; --i) {
337            PtNodeInfo info = dictDecoder.readPtNode(groupPos);
338            groupPos = info.mEndAddress;
339            if (info.mOriginalAddress == pos) {
340                builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
341                result = new WeightedString(builder.toString(), info.mProbabilityInfo);
342                break; // and return
343            }
344            if (BinaryDictIOUtils.hasChildrenAddress(info.mChildrenAddress)) {
345                if (info.mChildrenAddress > pos) {
346                    if (null == last) continue;
347                    builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
348                    dictDecoder.setPosition(last.mChildrenAddress);
349                    i = dictDecoder.readPtNodeCount();
350                    groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
351                    last = null;
352                    continue;
353                }
354                last = info;
355            }
356            if (0 == i && BinaryDictIOUtils.hasChildrenAddress(last.mChildrenAddress)) {
357                builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
358                dictDecoder.setPosition(last.mChildrenAddress);
359                i = dictDecoder.readPtNodeCount();
360                groupPos = last.mChildrenAddress + BinaryDictIOUtils.getPtNodeCountSize(i);
361                last = null;
362                continue;
363            }
364        }
365        return result;
366    }
367
368    /**
369     * Helper method that brutally decodes a header from a byte array.
370     *
371     * @param headerBuffer a buffer containing the bytes of the header.
372     * @return a hashmap of the attributes stored in the header
373     */
374    @Nonnull
375    public static HashMap<String, String> decodeHeaderAttributes(@Nonnull final byte[] headerBuffer)
376            throws UnsupportedFormatException {
377        final StringBuilder sb = new StringBuilder();
378        final LinkedList<String> keyValues = new LinkedList<>();
379        int index = 0;
380        while (index < headerBuffer.length) {
381            if (headerBuffer[index] == FormatSpec.PTNODE_CHARACTERS_TERMINATOR) {
382                keyValues.add(sb.toString());
383                sb.setLength(0);
384            } else if (CharEncoding.fitsOnOneByte(headerBuffer[index] & 0xFF,
385                    null /* codePointTable */)) {
386                sb.appendCodePoint(headerBuffer[index] & 0xFF);
387            } else {
388                sb.appendCodePoint(((headerBuffer[index] & 0xFF) << 16)
389                        + ((headerBuffer[index + 1] & 0xFF) << 8)
390                        + (headerBuffer[index + 2] & 0xFF));
391                index += 2;
392            }
393            index += 1;
394        }
395        if ((keyValues.size() & 1) != 0) {
396            throw new UnsupportedFormatException("Odd number of attributes");
397        }
398        final HashMap<String, String> attributes = new HashMap<>();
399        for (int i = 0; i < keyValues.size(); i += 2) {
400            attributes.put(keyValues.get(i), keyValues.get(i + 1));
401        }
402        return attributes;
403    }
404
405    /**
406     * Helper method to pass a file name instead of a File object to isBinaryDictionary.
407     */
408    public static boolean isBinaryDictionary(final String filename) {
409        final File file = new File(filename);
410        return isBinaryDictionary(file);
411    }
412
413    /**
414     * Basic test to find out whether the file is a binary dictionary or not.
415     *
416     * @param file The file to test.
417     * @return true if it's a binary dictionary, false otherwise
418     */
419    public static boolean isBinaryDictionary(final File file) {
420        final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
421        if (dictDecoder == null) {
422            return false;
423        }
424        return dictDecoder.hasValidRawBinaryDictionary();
425    }
426}
427