1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin.makedict;
18
19import com.android.inputmethod.annotations.UsedForTesting;
20import com.android.inputmethod.latin.BinaryDictionary;
21import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
22import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
23
24import java.io.File;
25import java.io.FileNotFoundException;
26import java.io.IOException;
27import java.util.ArrayList;
28import java.util.Arrays;
29
30/**
31 * An implementation of DictDecoder for version 2 binary dictionary.
32 */
33// TODO: Separate logics that are used only for testing.
34@UsedForTesting
35public class Ver2DictDecoder extends AbstractDictDecoder {
36    /**
37     * A utility class for reading a PtNode.
38     */
39    protected static class PtNodeReader {
40        private static ProbabilityInfo readProbabilityInfo(final DictBuffer dictBuffer) {
41            // Ver2 dicts don't contain historical information.
42            return new ProbabilityInfo(dictBuffer.readUnsignedByte());
43        }
44
45        protected static int readPtNodeOptionFlags(final DictBuffer dictBuffer) {
46            return dictBuffer.readUnsignedByte();
47        }
48
49        protected static int readChildrenAddress(final DictBuffer dictBuffer,
50                final int ptNodeFlags) {
51            switch (ptNodeFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) {
52                case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE:
53                    return dictBuffer.readUnsignedByte();
54                case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES:
55                    return dictBuffer.readUnsignedShort();
56                case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES:
57                    return dictBuffer.readUnsignedInt24();
58                case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS:
59                default:
60                    return FormatSpec.NO_CHILDREN_ADDRESS;
61            }
62        }
63
64        // Reads shortcuts and returns the read length.
65        protected static int readShortcut(final DictBuffer dictBuffer,
66                final ArrayList<WeightedString> shortcutTargets) {
67            final int pointerBefore = dictBuffer.position();
68            dictBuffer.readUnsignedShort(); // skip the size
69            while (true) {
70                final int targetFlags = dictBuffer.readUnsignedByte();
71                final String word = CharEncoding.readString(dictBuffer);
72                shortcutTargets.add(new WeightedString(word,
73                        targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY));
74                if (0 == (targetFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
75            }
76            return dictBuffer.position() - pointerBefore;
77        }
78
79        protected static int readBigramAddresses(final DictBuffer dictBuffer,
80                final ArrayList<PendingAttribute> bigrams, final int baseAddress) {
81            int readLength = 0;
82            int bigramCount = 0;
83            while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
84                final int bigramFlags = dictBuffer.readUnsignedByte();
85                ++readLength;
86                final int sign = 0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE)
87                        ? 1 : -1;
88                int bigramAddress = baseAddress + readLength;
89                switch (bigramFlags & FormatSpec.MASK_BIGRAM_ATTR_ADDRESS_TYPE) {
90                    case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE:
91                        bigramAddress += sign * dictBuffer.readUnsignedByte();
92                        readLength += 1;
93                        break;
94                    case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES:
95                        bigramAddress += sign * dictBuffer.readUnsignedShort();
96                        readLength += 2;
97                        break;
98                    case FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES:
99                        bigramAddress += sign * dictBuffer.readUnsignedInt24();
100                        readLength += 3;
101                        break;
102                    default:
103                        throw new RuntimeException("Has bigrams with no address");
104                }
105                bigrams.add(new PendingAttribute(
106                        bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY,
107                        bigramAddress));
108                if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break;
109            }
110            return readLength;
111        }
112    }
113
114    protected final File mDictionaryBinaryFile;
115    protected final long mOffset;
116    protected final long mLength;
117    // TODO: Remove mBufferFactory and mDictBuffer from this class members because they are now
118    // used only for testing.
119    private final DictionaryBufferFactory mBufferFactory;
120    protected DictBuffer mDictBuffer;
121
122    @UsedForTesting
123    /* package */ Ver2DictDecoder(final File file, final long offset, final long length,
124            final int factoryFlag) {
125        mDictionaryBinaryFile = file;
126        mOffset = offset;
127        mLength = length;
128        mDictBuffer = null;
129        if ((factoryFlag & MASK_DICTBUFFER) == USE_READONLY_BYTEBUFFER) {
130            mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
131        } else if ((factoryFlag  & MASK_DICTBUFFER) == USE_BYTEARRAY) {
132            mBufferFactory = new DictionaryBufferFromByteArrayFactory();
133        } else if ((factoryFlag & MASK_DICTBUFFER) == USE_WRITABLE_BYTEBUFFER) {
134            mBufferFactory = new DictionaryBufferFromWritableByteBufferFactory();
135        } else {
136            mBufferFactory = new DictionaryBufferFromReadOnlyByteBufferFactory();
137        }
138    }
139
140    /* package */ Ver2DictDecoder(final File file, final long offset, final long length,
141            final DictionaryBufferFactory factory) {
142        mDictionaryBinaryFile = file;
143        mOffset = offset;
144        mLength = length;
145        mBufferFactory = factory;
146    }
147
148    @Override
149    public void openDictBuffer() throws FileNotFoundException, IOException {
150        mDictBuffer = mBufferFactory.getDictionaryBuffer(mDictionaryBinaryFile);
151    }
152
153    @Override
154    public boolean isDictBufferOpen() {
155        return mDictBuffer != null;
156    }
157
158    /* package */ DictBuffer getDictBuffer() {
159        return mDictBuffer;
160    }
161
162    @UsedForTesting
163    /* package */ DictBuffer openAndGetDictBuffer() throws FileNotFoundException, IOException {
164        openDictBuffer();
165        return getDictBuffer();
166    }
167
168    @Override
169    public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException {
170        // dictType is not being used in dicttool. Passing an empty string.
171        final BinaryDictionary binaryDictionary = new BinaryDictionary(
172                mDictionaryBinaryFile.getAbsolutePath(), mOffset, mLength,
173                true /* useFullEditDistance */, null /* locale */, "" /* dictType */,
174                false /* isUpdatable */);
175        final DictionaryHeader header = binaryDictionary.getHeader();
176        binaryDictionary.close();
177        if (header == null) {
178            throw new IOException("Cannot read the dictionary header.");
179        }
180        if (header.mFormatOptions.mVersion != FormatSpec.VERSION2) {
181            throw new UnsupportedFormatException("File header has a wrong version : "
182                    + header.mFormatOptions.mVersion);
183        }
184        if (!isDictBufferOpen()) {
185            openDictBuffer();
186        }
187        // Advance buffer reading position to the head of dictionary body.
188        setPosition(header.mBodyOffset);
189        return header;
190    }
191
192    // TODO: Make this buffer multi thread safe.
193    private final int[] mCharacterBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
194    @Override
195    public PtNodeInfo readPtNode(final int ptNodePos) {
196        int addressPointer = ptNodePos;
197        final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
198        addressPointer += FormatSpec.PTNODE_FLAGS_SIZE;
199        final int characters[];
200        if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) {
201            int index = 0;
202            int character = CharEncoding.readChar(mDictBuffer);
203            addressPointer += CharEncoding.getCharSize(character);
204            while (FormatSpec.INVALID_CHARACTER != character) {
205                // FusionDictionary is making sure that the length of the word is smaller than
206                // MAX_WORD_LENGTH.
207                // So we'll never write past the end of mCharacterBuffer.
208                mCharacterBuffer[index++] = character;
209                character = CharEncoding.readChar(mDictBuffer);
210                addressPointer += CharEncoding.getCharSize(character);
211            }
212            characters = Arrays.copyOfRange(mCharacterBuffer, 0, index);
213        } else {
214            final int character = CharEncoding.readChar(mDictBuffer);
215            addressPointer += CharEncoding.getCharSize(character);
216            characters = new int[] { character };
217        }
218        final ProbabilityInfo probabilityInfo;
219        if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) {
220            probabilityInfo = PtNodeReader.readProbabilityInfo(mDictBuffer);
221            addressPointer += FormatSpec.PTNODE_FREQUENCY_SIZE;
222        } else {
223            probabilityInfo = null;
224        }
225        int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags);
226        if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
227            childrenAddress += addressPointer;
228        }
229        addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags);
230        final ArrayList<WeightedString> shortcutTargets;
231        if (0 != (flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)) {
232            // readShortcut will add shortcuts to shortcutTargets.
233            shortcutTargets = new ArrayList<>();
234            addressPointer += PtNodeReader.readShortcut(mDictBuffer, shortcutTargets);
235        } else {
236            shortcutTargets = null;
237        }
238
239        final ArrayList<PendingAttribute> bigrams;
240        if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) {
241            bigrams = new ArrayList<>();
242            addressPointer += PtNodeReader.readBigramAddresses(mDictBuffer, bigrams,
243                    addressPointer);
244            if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) {
245                throw new RuntimeException("Too many bigrams in a PtNode (" + bigrams.size()
246                        + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")");
247            }
248        } else {
249            bigrams = null;
250        }
251        return new PtNodeInfo(ptNodePos, addressPointer, flags, characters, probabilityInfo,
252                childrenAddress, shortcutTargets, bigrams);
253    }
254
255    @Override
256    public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken)
257            throws FileNotFoundException, IOException, UnsupportedFormatException {
258        // dictType is not being used in dicttool. Passing an empty string.
259        final BinaryDictionary binaryDictionary = new BinaryDictionary(
260                mDictionaryBinaryFile.getAbsolutePath(), 0 /* offset */,
261                mDictionaryBinaryFile.length() /* length */, true /* useFullEditDistance */,
262                null /* locale */, "" /* dictType */, false /* isUpdatable */);
263        final DictionaryHeader header = readHeader();
264        final FusionDictionary fusionDict =
265                new FusionDictionary(new FusionDictionary.PtNodeArray(), header.mDictionaryOptions);
266        int token = 0;
267        final ArrayList<WordProperty> wordProperties = new ArrayList<>();
268        do {
269            final BinaryDictionary.GetNextWordPropertyResult result =
270                    binaryDictionary.getNextWordProperty(token);
271            final WordProperty wordProperty = result.mWordProperty;
272            if (wordProperty == null) {
273                binaryDictionary.close();
274                if (deleteDictIfBroken) {
275                    mDictionaryBinaryFile.delete();
276                }
277                return null;
278            }
279            wordProperties.add(wordProperty);
280            token = result.mNextToken;
281        } while (token != 0);
282
283        // Insert unigrams into the fusion dictionary.
284        for (final WordProperty wordProperty : wordProperties) {
285            if (wordProperty.mIsBlacklistEntry) {
286                fusionDict.addBlacklistEntry(wordProperty.mWord, wordProperty.mShortcutTargets,
287                        wordProperty.mIsNotAWord);
288            } else {
289                fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
290                        wordProperty.mShortcutTargets, wordProperty.mIsNotAWord);
291            }
292        }
293        // Insert bigrams into the fusion dictionary.
294        for (final WordProperty wordProperty : wordProperties) {
295            if (wordProperty.mBigrams == null) {
296                continue;
297            }
298            final String word0 = wordProperty.mWord;
299            for (final WeightedString bigram : wordProperty.mBigrams) {
300                fusionDict.setBigram(word0, bigram.mWord, bigram.mProbabilityInfo);
301            }
302        }
303        binaryDictionary.close();
304        return fusionDict;
305    }
306
307    @Override
308    public void setPosition(int newPos) {
309        mDictBuffer.position(newPos);
310    }
311
312    @Override
313    public int getPosition() {
314        return mDictBuffer.position();
315    }
316
317    @Override
318    public int readPtNodeCount() {
319        return BinaryDictDecoderUtils.readPtNodeCount(mDictBuffer);
320    }
321}
322