/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1614087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada
1714087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanadapackage com.android.inputmethod.latin.makedict;
1814087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada
1914087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanadaimport com.android.inputmethod.annotations.UsedForTesting;
20ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagiimport com.android.inputmethod.latin.BinaryDictionary;
215b91b551e5ffaf2c2e691dfbd434f21c82293986Jean Chalardimport com.android.inputmethod.latin.common.FileUtils;
2214087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada
2314087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanadaimport java.io.File;
2414087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanadaimport java.io.FileNotFoundException;
2514087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanadaimport java.io.IOException;
2614087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanadaimport java.util.ArrayList;
2714087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada
2814087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada/**
2914087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada * An implementation of binary dictionary decoder for version 4 binary dictionary.
3014087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada */
3114087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada@UsedForTesting
3214d31d464037c31e7f7d382a8a86f6acf4694b06Yuichiro Hanadapublic class Ver4DictDecoder extends AbstractDictDecoder {
33ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi    final File mDictDirectory;
3414087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada
3514087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada    @UsedForTesting
36d3a4c5132422b189c8dbb94dbbe84a9b9761b0a8Tadashi G. Takaoka    /* package */ Ver4DictDecoder(final File dictDirectory) {
3714087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada        mDictDirectory = dictDirectory;
38d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi
3914087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada    }
4014087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada
4114087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada    @Override
42b986f78ba826fa360304a69565f1880bdd7ce0c5Keisuke Kuroyanagi    public DictionaryHeader readHeader() throws IOException, UnsupportedFormatException {
43d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi        // dictType is not being used in dicttool. Passing an empty string.
44d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi        final BinaryDictionary binaryDictionary= new BinaryDictionary(
45d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi              mDictDirectory.getAbsolutePath(), 0 /* offset */, 0 /* length */,
46d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi              true /* useFullEditDistance */, null /* locale */,
47d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi              "" /* dictType */, true /* isUpdatable */);
48d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi        final DictionaryHeader header = binaryDictionary.getHeader();
49d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi        binaryDictionary.close();
50afd9b62f00cd4557b32dae5bed6ed40320f86857Keisuke Kuroyanagi        if (header == null) {
51afd9b62f00cd4557b32dae5bed6ed40320f86857Keisuke Kuroyanagi            throw new IOException("Cannot read the dictionary header.");
52afd9b62f00cd4557b32dae5bed6ed40320f86857Keisuke Kuroyanagi        }
53d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi        return header;
5414087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada    }
5514087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada
5614087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada    @Override
578e3a1d0f89ac5a0c7d31effb8cbb447f93f70310Keisuke Kuroyanagi    public FusionDictionary readDictionaryBinary(final boolean deleteDictIfBroken)
5814087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada            throws FileNotFoundException, IOException, UnsupportedFormatException {
59d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi        // dictType is not being used in dicttool. Passing an empty string.
60d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi        final BinaryDictionary binaryDictionary = new BinaryDictionary(
61d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi              mDictDirectory.getAbsolutePath(), 0 /* offset */, 0 /* length */,
62d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi              true /* useFullEditDistance */, null /* locale */,
63d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi              "" /* dictType */, true /* isUpdatable */);
64ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi        final DictionaryHeader header = readHeader();
658e3a1d0f89ac5a0c7d31effb8cbb447f93f70310Keisuke Kuroyanagi        final FusionDictionary fusionDict =
66ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi                new FusionDictionary(new FusionDictionary.PtNodeArray(), header.mDictionaryOptions);
67ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi        int token = 0;
68a91561aa58db1c43092c1caecc051a11fa5391c7Tadashi G. Takaoka        final ArrayList<WordProperty> wordProperties = new ArrayList<>();
69ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi        do {
70ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi            final BinaryDictionary.GetNextWordPropertyResult result =
71d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi                    binaryDictionary.getNextWordProperty(token);
72ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi            final WordProperty wordProperty = result.mWordProperty;
73ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi            if (wordProperty == null) {
74d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi                binaryDictionary.close();
75ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi                if (deleteDictIfBroken) {
76ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi                    FileUtils.deleteRecursively(mDictDirectory);
77ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi                }
78ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi                return null;
7914087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada            }
80ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi            wordProperties.add(wordProperty);
81ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi            token = result.mNextToken;
82ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi        } while (token != 0);
83ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi
840fc93fe4455f24809f6c9baf0d3b936519779cfbKeisuke Kuroyanagi        // Insert unigrams into the fusion dictionary.
85ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi        for (final WordProperty wordProperty : wordProperties) {
8605172bf1a5693c2e108e91436b98ecd35d2dadadAdrian Velicu            fusionDict.add(wordProperty.mWord, wordProperty.mProbabilityInfo,
8712d80ebead6a1d7f704a5a3af3b6fe3313ceab05Dan Zivkovic                    wordProperty.mIsNotAWord,
8805172bf1a5693c2e108e91436b98ecd35d2dadadAdrian Velicu                    wordProperty.mIsPossiblyOffensive);
8914087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada        }
900fc93fe4455f24809f6c9baf0d3b936519779cfbKeisuke Kuroyanagi        // Insert bigrams into the fusion dictionary.
91c6a6f6a9905ab98516d944ac85933d016e4147fbKeisuke Kuroyanagi        // TODO: Support ngrams.
92ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi        for (final WordProperty wordProperty : wordProperties) {
93c6a6f6a9905ab98516d944ac85933d016e4147fbKeisuke Kuroyanagi            if (!wordProperty.mHasNgrams) {
94ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi                continue;
95ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi            }
96ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi            final String word0 = wordProperty.mWord;
97c6a6f6a9905ab98516d944ac85933d016e4147fbKeisuke Kuroyanagi            for (final WeightedString bigram : wordProperty.getBigrams()) {
988ffc631826b108423f98e3ff4d987f067cbc4e0cKeisuke Kuroyanagi                fusionDict.setBigram(word0, bigram.mWord, bigram.mProbabilityInfo);
992fa3693c264a4c150ac307d9bb7f6f8f18cc4ffcKen Wakasa            }
10014087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada        }
101d24a99cff6da3a7121a507e77409261e4f6704dcKeisuke Kuroyanagi        binaryDictionary.close();
102ab6a93773ba3cbe93002bc37b6b61f874fc09144Keisuke Kuroyanagi        return fusionDict;
103bc4926235dfff4758ca435362fe7a880d11b4f3bYuichiro Hanada    }
10414087ba52c6b5b7acd25ee4a1ef1663ceb72bbf4Yuichiro Hanada}
105