DictionaryInfoUtils.java revision d9a8f2a82f6a0157c48ff1d0f8b2e05d40618426
1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.inputmethod.latin.utils;
18
19import android.content.ContentValues;
20import android.content.Context;
21import android.content.res.AssetManager;
22import android.content.res.Resources;
23import android.text.TextUtils;
24import android.util.Log;
25
26import com.android.inputmethod.latin.AssetFileAddress;
27import com.android.inputmethod.latin.BinaryDictionaryGetter;
28import com.android.inputmethod.latin.Constants;
29import com.android.inputmethod.latin.R;
30import com.android.inputmethod.latin.makedict.DictionaryHeader;
31import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
32import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
33
34import java.io.File;
35import java.io.IOException;
36import java.util.ArrayList;
37import java.util.Iterator;
38import java.util.Locale;
39import java.util.concurrent.TimeUnit;
40
41/**
42 * This class encapsulates the logic for the Latin-IME side of dictionary information management.
43 */
44public class DictionaryInfoUtils {
45    private static final String TAG = DictionaryInfoUtils.class.getSimpleName();
46    private static final String RESOURCE_PACKAGE_NAME = R.class.getPackage().getName();
47    private static final String DEFAULT_MAIN_DICT = "main";
48    private static final String MAIN_DICT_PREFIX = "main_";
49    // 6 digits - unicode is limited to 21 bits
50    private static final int MAX_HEX_DIGITS_FOR_CODEPOINT = 6;
51
52    public static class DictionaryInfo {
53        private static final String LOCALE_COLUMN = "locale";
54        private static final String WORDLISTID_COLUMN = "id";
55        private static final String LOCAL_FILENAME_COLUMN = "filename";
56        private static final String DESCRIPTION_COLUMN = "description";
57        private static final String DATE_COLUMN = "date";
58        private static final String FILESIZE_COLUMN = "filesize";
59        private static final String VERSION_COLUMN = "version";
60        public final String mId;
61        public final Locale mLocale;
62        public final String mDescription;
63        public final AssetFileAddress mFileAddress;
64        public final int mVersion;
65        public DictionaryInfo(final String id, final Locale locale, final String description,
66                final AssetFileAddress fileAddress, final int version) {
67            mId = id;
68            mLocale = locale;
69            mDescription = description;
70            mFileAddress = fileAddress;
71            mVersion = version;
72        }
73        public ContentValues toContentValues() {
74            final ContentValues values = new ContentValues();
75            values.put(WORDLISTID_COLUMN, mId);
76            values.put(LOCALE_COLUMN, mLocale.toString());
77            values.put(DESCRIPTION_COLUMN, mDescription);
78            values.put(LOCAL_FILENAME_COLUMN, mFileAddress.mFilename);
79            values.put(DATE_COLUMN, TimeUnit.MILLISECONDS.toSeconds(
80                    new File(mFileAddress.mFilename).lastModified()));
81            values.put(FILESIZE_COLUMN, mFileAddress.mLength);
82            values.put(VERSION_COLUMN, mVersion);
83            return values;
84        }
85    }
86
87    private DictionaryInfoUtils() {
88        // Private constructor to forbid instantation of this helper class.
89    }
90
91    /**
92     * Returns whether we may want to use this character as part of a file name.
93     *
94     * This basically only accepts ascii letters and numbers, and rejects everything else.
95     */
96    private static boolean isFileNameCharacter(int codePoint) {
97        if (codePoint >= 0x30 && codePoint <= 0x39) return true; // Digit
98        if (codePoint >= 0x41 && codePoint <= 0x5A) return true; // Uppercase
99        if (codePoint >= 0x61 && codePoint <= 0x7A) return true; // Lowercase
100        return codePoint == '_'; // Underscore
101    }
102
103    /**
104     * Escapes a string for any characters that may be suspicious for a file or directory name.
105     *
106     * Concretely this does a sort of URL-encoding except it will encode everything that's not
107     * alphanumeric or underscore. (true URL-encoding leaves alone characters like '*', which
108     * we cannot allow here)
109     */
110    // TODO: create a unit test for this method
111    public static String replaceFileNameDangerousCharacters(final String name) {
112        // This assumes '%' is fully available as a non-separator, normal
113        // character in a file name. This is probably true for all file systems.
114        final StringBuilder sb = new StringBuilder();
115        final int nameLength = name.length();
116        for (int i = 0; i < nameLength; i = name.offsetByCodePoints(i, 1)) {
117            final int codePoint = name.codePointAt(i);
118            if (DictionaryInfoUtils.isFileNameCharacter(codePoint)) {
119                sb.appendCodePoint(codePoint);
120            } else {
121                sb.append(String.format((Locale)null, "%%%1$0" + MAX_HEX_DIGITS_FOR_CODEPOINT + "x",
122                        codePoint));
123            }
124        }
125        return sb.toString();
126    }
127
128    /**
129     * Helper method to get the top level cache directory.
130     */
131    private static String getWordListCacheDirectory(final Context context) {
132        return context.getFilesDir() + File.separator + "dicts";
133    }
134
135    /**
136     * Helper method to get the top level temp directory.
137     */
138    public static String getWordListTempDirectory(final Context context) {
139        return context.getFilesDir() + File.separator + "tmp";
140    }
141
142    /**
143     * Reverse escaping done by replaceFileNameDangerousCharacters.
144     */
145    public static String getWordListIdFromFileName(final String fname) {
146        final StringBuilder sb = new StringBuilder();
147        final int fnameLength = fname.length();
148        for (int i = 0; i < fnameLength; i = fname.offsetByCodePoints(i, 1)) {
149            final int codePoint = fname.codePointAt(i);
150            if ('%' != codePoint) {
151                sb.appendCodePoint(codePoint);
152            } else {
153                // + 1 to pass the % sign
154                final int encodedCodePoint = Integer.parseInt(
155                        fname.substring(i + 1, i + 1 + MAX_HEX_DIGITS_FOR_CODEPOINT), 16);
156                i += MAX_HEX_DIGITS_FOR_CODEPOINT;
157                sb.appendCodePoint(encodedCodePoint);
158            }
159        }
160        return sb.toString();
161    }
162
163    /**
164     * Helper method to the list of cache directories, one for each distinct locale.
165     */
166    public static File[] getCachedDirectoryList(final Context context) {
167        return new File(DictionaryInfoUtils.getWordListCacheDirectory(context)).listFiles();
168    }
169
170    /**
171     * Returns the category for a given file name.
172     *
173     * This parses the file name, extracts the category, and returns it. See
174     * {@link #getMainDictId(Locale)} and {@link #isMainWordListId(String)}.
175     * @return The category as a string or null if it can't be found in the file name.
176     */
177    public static String getCategoryFromFileName(final String fileName) {
178        final String id = getWordListIdFromFileName(fileName);
179        final String[] idArray = id.split(BinaryDictionaryGetter.ID_CATEGORY_SEPARATOR);
180        // An id is supposed to be in format category:locale, so splitting on the separator
181        // should yield a 2-elements array
182        if (2 != idArray.length) return null;
183        return idArray[0];
184    }
185
186    /**
187     * Find out the cache directory associated with a specific locale.
188     */
189    private static String getCacheDirectoryForLocale(final String locale, final Context context) {
190        final String relativeDirectoryName = replaceFileNameDangerousCharacters(locale);
191        final String absoluteDirectoryName = getWordListCacheDirectory(context) + File.separator
192                + relativeDirectoryName;
193        final File directory = new File(absoluteDirectoryName);
194        if (!directory.exists()) {
195            if (!directory.mkdirs()) {
196                Log.e(TAG, "Could not create the directory for locale" + locale);
197            }
198        }
199        return absoluteDirectoryName;
200    }
201
202    /**
203     * Generates a file name for the id and locale passed as an argument.
204     *
205     * In the current implementation the file name returned will always be unique for
206     * any id/locale pair, but please do not expect that the id can be the same for
207     * different dictionaries with different locales. An id should be unique for any
208     * dictionary.
209     * The file name is pretty much an URL-encoded version of the id inside a directory
210     * named like the locale, except it will also escape characters that look dangerous
211     * to some file systems.
212     * @param id the id of the dictionary for which to get a file name
213     * @param locale the locale for which to get the file name as a string
214     * @param context the context to use for getting the directory
215     * @return the name of the file to be created
216     */
217    public static String getCacheFileName(String id, String locale, Context context) {
218        final String fileName = replaceFileNameDangerousCharacters(id);
219        return getCacheDirectoryForLocale(locale, context) + File.separator + fileName;
220    }
221
222    public static boolean isMainWordListId(final String id) {
223        final String[] idArray = id.split(BinaryDictionaryGetter.ID_CATEGORY_SEPARATOR);
224        // An id is supposed to be in format category:locale, so splitting on the separator
225        // should yield a 2-elements array
226        if (2 != idArray.length) return false;
227        return BinaryDictionaryGetter.MAIN_DICTIONARY_CATEGORY.equals(idArray[0]);
228    }
229
230    /**
231     * Helper method to return a dictionary res id for a locale, or 0 if none.
232     * @param locale dictionary locale
233     * @return main dictionary resource id
234     */
235    public static int getMainDictionaryResourceIdIfAvailableForLocale(final Resources res,
236            final Locale locale) {
237        int resId;
238        // Try to find main_language_country dictionary.
239        if (!locale.getCountry().isEmpty()) {
240            final String dictLanguageCountry =
241                    MAIN_DICT_PREFIX + locale.toString().toLowerCase(Locale.ROOT);
242            if ((resId = res.getIdentifier(
243                    dictLanguageCountry, "raw", RESOURCE_PACKAGE_NAME)) != 0) {
244                return resId;
245            }
246        }
247
248        // Try to find main_language dictionary.
249        final String dictLanguage = MAIN_DICT_PREFIX + locale.getLanguage();
250        if ((resId = res.getIdentifier(dictLanguage, "raw", RESOURCE_PACKAGE_NAME)) != 0) {
251            return resId;
252        }
253
254        // Not found, return 0
255        return 0;
256    }
257
258    /**
259     * Returns a main dictionary resource id
260     * @param locale dictionary locale
261     * @return main dictionary resource id
262     */
263    public static int getMainDictionaryResourceId(final Resources res, final Locale locale) {
264        int resourceId = getMainDictionaryResourceIdIfAvailableForLocale(res, locale);
265        if (0 != resourceId) return resourceId;
266        return res.getIdentifier(DEFAULT_MAIN_DICT, "raw", RESOURCE_PACKAGE_NAME);
267    }
268
269    /**
270     * Returns the id associated with the main word list for a specified locale.
271     *
272     * Word lists stored in Android Keyboard's resources are referred to as the "main"
273     * word lists. Since they can be updated like any other list, we need to assign a
274     * unique ID to them. This ID is just the name of the language (locale-wise) they
275     * are for, and this method returns this ID.
276     */
277    public static String getMainDictId(final Locale locale) {
278        // This works because we don't include by default different dictionaries for
279        // different countries. This actually needs to return the id that we would
280        // like to use for word lists included in resources, and the following is okay.
281        return BinaryDictionaryGetter.MAIN_DICTIONARY_CATEGORY +
282                BinaryDictionaryGetter.ID_CATEGORY_SEPARATOR + locale.getLanguage().toString();
283    }
284
285    public static DictionaryHeader getDictionaryFileHeaderOrNull(final File file) {
286        return getDictionaryFileHeaderOrNull(file, 0, file.length());
287    }
288
289    private static DictionaryHeader getDictionaryFileHeaderOrNull(final File file,
290            final long offset, final long length) {
291        try {
292            final DictionaryHeader header =
293                    BinaryDictionaryUtils.getHeaderWithOffsetAndLength(file, offset, length);
294            return header;
295        } catch (UnsupportedFormatException e) {
296            return null;
297        } catch (IOException e) {
298            return null;
299        }
300    }
301
302    /**
303     * Returns information of the dictionary.
304     *
305     * @param fileAddress the asset dictionary file address.
306     * @return information of the specified dictionary.
307     */
308    private static DictionaryInfo createDictionaryInfoFromFileAddress(
309            final AssetFileAddress fileAddress) {
310        final DictionaryHeader header = getDictionaryFileHeaderOrNull(
311                new File(fileAddress.mFilename), fileAddress.mOffset, fileAddress.mLength);
312        if (header == null) {
313            return null;
314        }
315        final String id = header.getId();
316        final Locale locale = LocaleUtils.constructLocaleFromString(header.getLocaleString());
317        final String description = header.getDescription();
318        final String version = header.getVersion();
319        return new DictionaryInfo(id, locale, description, fileAddress, Integer.parseInt(version));
320    }
321
322    private static void addOrUpdateDictInfo(final ArrayList<DictionaryInfo> dictList,
323            final DictionaryInfo newElement) {
324        final Iterator<DictionaryInfo> iter = dictList.iterator();
325        while (iter.hasNext()) {
326            final DictionaryInfo thisDictInfo = iter.next();
327            if (thisDictInfo.mLocale.equals(newElement.mLocale)) {
328                if (newElement.mVersion <= thisDictInfo.mVersion) {
329                    return;
330                }
331                iter.remove();
332            }
333        }
334        dictList.add(newElement);
335    }
336
337    public static ArrayList<DictionaryInfo> getCurrentDictionaryFileNameAndVersionInfo(
338            final Context context) {
339        final ArrayList<DictionaryInfo> dictList = new ArrayList<>();
340
341        // Retrieve downloaded dictionaries
342        final File[] directoryList = getCachedDirectoryList(context);
343        if (null != directoryList) {
344            for (final File directory : directoryList) {
345                final String localeString = getWordListIdFromFileName(directory.getName());
346                File[] dicts = BinaryDictionaryGetter.getCachedWordLists(localeString, context);
347                for (final File dict : dicts) {
348                    final String wordListId = getWordListIdFromFileName(dict.getName());
349                    if (!DictionaryInfoUtils.isMainWordListId(wordListId)) continue;
350                    final Locale locale = LocaleUtils.constructLocaleFromString(localeString);
351                    final AssetFileAddress fileAddress = AssetFileAddress.makeFromFile(dict);
352                    final DictionaryInfo dictionaryInfo =
353                            createDictionaryInfoFromFileAddress(fileAddress);
354                    // Protect against cases of a less-specific dictionary being found, like an
355                    // en dictionary being used for an en_US locale. In this case, the en dictionary
356                    // should be used for en_US but discounted for listing purposes.
357                    if (dictionaryInfo == null || !dictionaryInfo.mLocale.equals(locale)) continue;
358                    addOrUpdateDictInfo(dictList, dictionaryInfo);
359                }
360            }
361        }
362
363        // Retrieve files from assets
364        final Resources resources = context.getResources();
365        final AssetManager assets = resources.getAssets();
366        for (final String localeString : assets.getLocales()) {
367            final Locale locale = LocaleUtils.constructLocaleFromString(localeString);
368            final int resourceId =
369                    DictionaryInfoUtils.getMainDictionaryResourceIdIfAvailableForLocale(
370                            context.getResources(), locale);
371            if (0 == resourceId) continue;
372            final AssetFileAddress fileAddress =
373                    BinaryDictionaryGetter.loadFallbackResource(context, resourceId);
374            final DictionaryInfo dictionaryInfo = createDictionaryInfoFromFileAddress(fileAddress);
375            // Protect against cases of a less-specific dictionary being found, like an
376            // en dictionary being used for an en_US locale. In this case, the en dictionary
377            // should be used for en_US but discounted for listing purposes.
378            if (!dictionaryInfo.mLocale.equals(locale)) continue;
379            addOrUpdateDictInfo(dictList, dictionaryInfo);
380        }
381
382        return dictList;
383    }
384
385    public static boolean looksValidForDictionaryInsertion(final CharSequence text,
386            final SpacingAndPunctuations spacingAndPunctuations) {
387        if (TextUtils.isEmpty(text)) return false;
388        final int length = text.length();
389        if (length > Constants.DICTIONARY_MAX_WORD_LENGTH) {
390            return false;
391        }
392        int i = 0;
393        int digitCount = 0;
394        while (i < length) {
395            final int codePoint = Character.codePointAt(text, i);
396            final int charCount = Character.charCount(codePoint);
397            i += charCount;
398            if (Character.isDigit(codePoint)) {
399                // Count digits: see below
400                digitCount += charCount;
401                continue;
402            }
403            if (!spacingAndPunctuations.isWordCodePoint(codePoint)) return false;
404        }
405        // We reject strings entirely comprised of digits to avoid using PIN codes or credit
406        // card numbers. It would come in handy for word prediction though; a good example is
407        // when writing one's address where the street number is usually quite discriminative,
408        // as well as the postal code.
409        return digitCount < length;
410    }
411}
412