BinaryDictionaryGetter.java revision 66c90cd2ae49c49da8aeda5ab1d86bd9b76434c7
1/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin;
18
19import com.android.inputmethod.latin.makedict.BinaryDictInputOutput;
20import com.android.inputmethod.latin.makedict.FormatSpec;
21
22import android.content.Context;
23import android.content.SharedPreferences;
24import android.content.pm.PackageManager.NameNotFoundException;
25import android.content.res.AssetFileDescriptor;
26import android.util.Log;
27
28import java.io.File;
29import java.io.FileInputStream;
30import java.io.IOException;
31import java.nio.BufferUnderflowException;
32import java.nio.channels.FileChannel;
33import java.util.ArrayList;
34import java.util.HashMap;
35import java.util.Locale;
36
37/**
38 * Helper class to get the address of a mmap'able dictionary file.
39 */
40final class BinaryDictionaryGetter {
41
42    /**
43     * Used for Log actions from this class
44     */
45    private static final String TAG = BinaryDictionaryGetter.class.getSimpleName();
46
47    /**
48     * Used to return empty lists
49     */
50    private static final File[] EMPTY_FILE_ARRAY = new File[0];
51
52    /**
53     * Name of the common preferences name to know which word list are on and which are off.
54     */
55    private static final String COMMON_PREFERENCES_NAME = "LatinImeDictPrefs";
56
57    // Name of the category for the main dictionary
58    private static final String MAIN_DICTIONARY_CATEGORY = "main";
59    public static final String ID_CATEGORY_SEPARATOR = ":";
60
61    // The key considered to read the version attribute in a dictionary file.
62    private static String VERSION_KEY = "version";
63
64    // Prevents this from being instantiated
65    private BinaryDictionaryGetter() {}
66
67    /**
68     * Returns whether we may want to use this character as part of a file name.
69     *
70     * This basically only accepts ascii letters and numbers, and rejects everything else.
71     */
72    private static boolean isFileNameCharacter(int codePoint) {
73        if (codePoint >= 0x30 && codePoint <= 0x39) return true; // Digit
74        if (codePoint >= 0x41 && codePoint <= 0x5A) return true; // Uppercase
75        if (codePoint >= 0x61 && codePoint <= 0x7A) return true; // Lowercase
76        return codePoint == '_'; // Underscore
77    }
78
79    /**
80     * Escapes a string for any characters that may be suspicious for a file or directory name.
81     *
82     * Concretely this does a sort of URL-encoding except it will encode everything that's not
83     * alphanumeric or underscore. (true URL-encoding leaves alone characters like '*', which
84     * we cannot allow here)
85     */
86    // TODO: create a unit test for this method
87    private static String replaceFileNameDangerousCharacters(final String name) {
88        // This assumes '%' is fully available as a non-separator, normal
89        // character in a file name. This is probably true for all file systems.
90        final StringBuilder sb = new StringBuilder();
91        final int nameLength = name.length();
92        for (int i = 0; i < nameLength; i = name.offsetByCodePoints(i, 1)) {
93            final int codePoint = name.codePointAt(i);
94            if (isFileNameCharacter(codePoint)) {
95                sb.appendCodePoint(codePoint);
96            } else {
97                // 6 digits - unicode is limited to 21 bits
98                sb.append(String.format((Locale)null, "%%%1$06x", codePoint));
99            }
100        }
101        return sb.toString();
102    }
103
104    /**
105     * Reverse escaping done by replaceFileNameDangerousCharacters.
106     */
107    private static String getWordListIdFromFileName(final String fname) {
108        final StringBuilder sb = new StringBuilder();
109        final int fnameLength = fname.length();
110        for (int i = 0; i < fnameLength; i = fname.offsetByCodePoints(i, 1)) {
111            final int codePoint = fname.codePointAt(i);
112            if ('%' != codePoint) {
113                sb.appendCodePoint(codePoint);
114            } else {
115                final int encodedCodePoint = Integer.parseInt(fname.substring(i + 1, i + 7), 16);
116                i += 6;
117                sb.appendCodePoint(encodedCodePoint);
118            }
119        }
120        return sb.toString();
121    }
122
123    /**
124     * Helper method to get the top level cache directory.
125     */
126    private static String getWordListCacheDirectory(final Context context) {
127        return context.getFilesDir() + File.separator + "dicts";
128    }
129
130    /**
131     * Find out the cache directory associated with a specific locale.
132     */
133    private static String getCacheDirectoryForLocale(final String locale, final Context context) {
134        final String relativeDirectoryName = replaceFileNameDangerousCharacters(locale);
135        final String absoluteDirectoryName = getWordListCacheDirectory(context) + File.separator
136                + relativeDirectoryName;
137        final File directory = new File(absoluteDirectoryName);
138        if (!directory.exists()) {
139            if (!directory.mkdirs()) {
140                Log.e(TAG, "Could not create the directory for locale" + locale);
141            }
142        }
143        return absoluteDirectoryName;
144    }
145
146    /**
147     * Generates a file name for the id and locale passed as an argument.
148     *
149     * In the current implementation the file name returned will always be unique for
150     * any id/locale pair, but please do not expect that the id can be the same for
151     * different dictionaries with different locales. An id should be unique for any
152     * dictionary.
153     * The file name is pretty much an URL-encoded version of the id inside a directory
154     * named like the locale, except it will also escape characters that look dangerous
155     * to some file systems.
156     * @param id the id of the dictionary for which to get a file name
157     * @param locale the locale for which to get the file name as a string
158     * @param context the context to use for getting the directory
159     * @return the name of the file to be created
160     */
161    public static String getCacheFileName(String id, String locale, Context context) {
162        final String fileName = replaceFileNameDangerousCharacters(id);
163        return getCacheDirectoryForLocale(locale, context) + File.separator + fileName;
164    }
165
166    /**
167     * Generates a unique temporary file name in the app cache directory.
168     *
169     * This is unique as long as it doesn't get called twice in the same millisecond by the same
170     * thread, which should be more than enough for our purposes.
171     */
172    public static String getTempFileName(String id, Context context) {
173        final String fileName = replaceFileNameDangerousCharacters(id);
174        return context.getCacheDir() + File.separator + fileName + "."
175                + Thread.currentThread().getId() + "." + System.currentTimeMillis();
176    }
177
178    /**
179     * Returns a file address from a resource, or null if it cannot be opened.
180     */
181    private static AssetFileAddress loadFallbackResource(final Context context,
182            final int fallbackResId) {
183        final AssetFileDescriptor afd = context.getResources().openRawResourceFd(fallbackResId);
184        if (afd == null) {
185            Log.e(TAG, "Found the resource but cannot read it. Is it compressed? resId="
186                    + fallbackResId);
187            return null;
188        }
189        return AssetFileAddress.makeFromFileNameAndOffset(
190                context.getApplicationInfo().sourceDir, afd.getStartOffset(), afd.getLength());
191    }
192
193    private static final class DictPackSettings {
194        final SharedPreferences mDictPreferences;
195        public DictPackSettings(final Context context) {
196            Context dictPackContext = null;
197            try {
198                final String dictPackName =
199                        context.getString(R.string.dictionary_pack_package_name);
200                dictPackContext = context.createPackageContext(dictPackName, 0);
201            } catch (NameNotFoundException e) {
202                // The dictionary pack is not installed...
203                // TODO: fallback on the built-in dict, see the TODO above
204                Log.e(TAG, "Could not find a dictionary pack");
205            }
206            mDictPreferences = null == dictPackContext ? null
207                    : dictPackContext.getSharedPreferences(COMMON_PREFERENCES_NAME,
208                            Context.MODE_WORLD_READABLE | Context.MODE_MULTI_PROCESS);
209        }
210        public boolean isWordListActive(final String dictId) {
211            if (null == mDictPreferences) {
212                // If we don't have preferences it basically means we can't find the dictionary
213                // pack - either it's not installed, or it's disabled, or there is some strange
214                // bug. Either way, a word list with no settings should be on by default: default
215                // dictionaries in LatinIME are on if there is no settings at all, and if for some
216                // reason some dictionaries have been installed BUT the dictionary pack can't be
217                // found anymore it's safer to actually supply installed dictionaries.
218                return true;
219            } else {
220                // The default is true here for the same reasons as above. We got the dictionary
221                // pack but if we don't have any settings for it it means the user has never been
222                // to the settings yet. So by default, the main dictionaries should be on.
223                return mDictPreferences.getBoolean(dictId, true);
224            }
225        }
226    }
227
228    /**
229     * Helper method to the list of cache directories, one for each distinct locale.
230     */
231    private static File[] getCachedDirectoryList(final Context context) {
232        return new File(getWordListCacheDirectory(context)).listFiles();
233    }
234
235    /**
236     * Returns the category for a given file name.
237     *
238     * This parses the file name, extracts the category, and returns it. See
239     * {@link #getMainDictId(Locale)} and {@link #isMainWordListId(String)}.
240     * @return The category as a string or null if it can't be found in the file name.
241     */
242    private static String getCategoryFromFileName(final String fileName) {
243        final String id = getWordListIdFromFileName(fileName);
244        final String[] idArray = id.split(ID_CATEGORY_SEPARATOR);
245        if (2 != idArray.length) return null;
246        return idArray[0];
247    }
248
249    /**
250     * Utility class for the {@link #getCachedWordLists} method
251     */
252    private static final class FileAndMatchLevel {
253        final File mFile;
254        final int mMatchLevel;
255        public FileAndMatchLevel(final File file, final int matchLevel) {
256            mFile = file;
257            mMatchLevel = matchLevel;
258        }
259    }
260
261    /**
262     * Returns the list of cached files for a specific locale, one for each category.
263     *
264     * This will return exactly one file for each word list category that matches
265     * the passed locale. If several files match the locale for any given category,
266     * this returns the file with the closest match to the locale. For example, if
267     * the passed word list is en_US, and for a category we have an en and an en_US
268     * word list available, we'll return only the en_US one.
269     * Thus, the list will contain as many files as there are categories.
270     *
271     * @param locale the locale to find the dictionary files for, as a string.
272     * @param context the context on which to open the files upon.
273     * @return an array of binary dictionary files, which may be empty but may not be null.
274     */
275    private static File[] getCachedWordLists(final String locale,
276            final Context context) {
277        final File[] directoryList = getCachedDirectoryList(context);
278        if (null == directoryList) return EMPTY_FILE_ARRAY;
279        final HashMap<String, FileAndMatchLevel> cacheFiles = CollectionUtils.newHashMap();
280        for (File directory : directoryList) {
281            if (!directory.isDirectory()) continue;
282            final String dirLocale = getWordListIdFromFileName(directory.getName());
283            final int matchLevel = LocaleUtils.getMatchLevel(dirLocale, locale);
284            if (LocaleUtils.isMatch(matchLevel)) {
285                final File[] wordLists = directory.listFiles();
286                if (null != wordLists) {
287                    for (File wordList : wordLists) {
288                        final String category = getCategoryFromFileName(wordList.getName());
289                        final FileAndMatchLevel currentBestMatch = cacheFiles.get(category);
290                        if (null == currentBestMatch || currentBestMatch.mMatchLevel < matchLevel) {
291                            cacheFiles.put(category, new FileAndMatchLevel(wordList, matchLevel));
292                        }
293                    }
294                }
295            }
296        }
297        if (cacheFiles.isEmpty()) return EMPTY_FILE_ARRAY;
298        final File[] result = new File[cacheFiles.size()];
299        int index = 0;
300        for (final FileAndMatchLevel entry : cacheFiles.values()) {
301            result[index++] = entry.mFile;
302        }
303        return result;
304    }
305
306    /**
307     * Remove all files with the passed id, except the passed file.
308     *
309     * If a dictionary with a given ID has a metadata change that causes it to change
310     * path, we need to remove the old version. The only way to do this is to check all
311     * installed files for a matching ID in a different directory.
312     */
313    public static void removeFilesWithIdExcept(final Context context, final String id,
314            final File fileToKeep) {
315        try {
316            final File canonicalFileToKeep = fileToKeep.getCanonicalFile();
317            final File[] directoryList = getCachedDirectoryList(context);
318            if (null == directoryList) return;
319            for (File directory : directoryList) {
320                // There is one directory per locale. See #getCachedDirectoryList
321                if (!directory.isDirectory()) continue;
322                final File[] wordLists = directory.listFiles();
323                if (null == wordLists) continue;
324                for (File wordList : wordLists) {
325                    final String fileId = getWordListIdFromFileName(wordList.getName());
326                    if (fileId.equals(id)) {
327                        if (!canonicalFileToKeep.equals(wordList.getCanonicalFile())) {
328                            wordList.delete();
329                        }
330                    }
331                }
332            }
333        } catch (java.io.IOException e) {
334            Log.e(TAG, "IOException trying to cleanup files : " + e);
335        }
336    }
337
338
339    /**
340     * Returns the id associated with the main word list for a specified locale.
341     *
342     * Word lists stored in Android Keyboard's resources are referred to as the "main"
343     * word lists. Since they can be updated like any other list, we need to assign a
344     * unique ID to them. This ID is just the name of the language (locale-wise) they
345     * are for, and this method returns this ID.
346     */
347    private static String getMainDictId(final Locale locale) {
348        // This works because we don't include by default different dictionaries for
349        // different countries. This actually needs to return the id that we would
350        // like to use for word lists included in resources, and the following is okay.
351        return MAIN_DICTIONARY_CATEGORY + ID_CATEGORY_SEPARATOR + locale.getLanguage().toString();
352    }
353
354    private static boolean isMainWordListId(final String id) {
355        final String[] idArray = id.split(ID_CATEGORY_SEPARATOR);
356        if (2 != idArray.length) return false;
357        return MAIN_DICTIONARY_CATEGORY.equals(idArray[0]);
358    }
359
360    // ## HACK ## we prevent usage of a dictionary before version 18 for English only. The reason
361    // for this is, since those do not include whitelist entries, the new code with an old version
362    // of the dictionary would lose whitelist functionality.
363    private static boolean hackCanUseDictionaryFile(final Locale locale, final File f) {
364        // Only for English - other languages didn't have a whitelist, hence this
365        // ad-hoc ## HACK ##
366        if (!Locale.ENGLISH.getLanguage().equals(locale.getLanguage())) return true;
367
368        FileInputStream inStream = null;
369        try {
370            // Read the version of the file
371            inStream = new FileInputStream(f);
372            final BinaryDictInputOutput.ByteBufferWrapper buffer =
373                    new BinaryDictInputOutput.ByteBufferWrapper(inStream.getChannel().map(
374                            FileChannel.MapMode.READ_ONLY, 0, f.length()));
375            final int magic = buffer.readInt();
376            if (magic != FormatSpec.VERSION_2_MAGIC_NUMBER) {
377                return false;
378            }
379            final int formatVersion = buffer.readInt();
380            final int headerSize = buffer.readInt();
381            final HashMap<String, String> options = CollectionUtils.newHashMap();
382            BinaryDictInputOutput.populateOptions(buffer, headerSize, options);
383
384            final String version = options.get(VERSION_KEY);
385            if (null == version) {
386                // No version in the options : the format is unexpected
387                return false;
388            }
389            // Version 18 is the first one to include the whitelist
390            // Obviously this is a big ## HACK ##
391            return Integer.parseInt(version) >= 18;
392        } catch (java.io.FileNotFoundException e) {
393            return false;
394        } catch (java.io.IOException e) {
395            return false;
396        } catch (NumberFormatException e) {
397            return false;
398        } catch (BufferUnderflowException e) {
399            return false;
400        } finally {
401            if (inStream != null) {
402                try {
403                    inStream.close();
404                } catch (IOException e) {
405                    // do nothing
406                }
407            }
408        }
409    }
410
411    /**
412     * Returns a list of file addresses for a given locale, trying relevant methods in order.
413     *
414     * Tries to get binary dictionaries from various sources, in order:
415     * - Uses a content provider to get a public dictionary set, as per the protocol described
416     *   in BinaryDictionaryFileDumper.
417     * If that fails:
418     * - Gets a file name from the built-in dictionary for this locale, if any.
419     * If that fails:
420     * - Returns null.
421     * @return The list of addresses of valid dictionary files, or null.
422     */
423    public static ArrayList<AssetFileAddress> getDictionaryFiles(final Locale locale,
424            final Context context) {
425
426        final boolean hasDefaultWordList = DictionaryFactory.isDictionaryAvailable(context, locale);
427        // cacheWordListsFromContentProvider returns the list of files it copied to local
428        // storage, but we don't really care about what was copied NOW: what we want is the
429        // list of everything we ever cached, so we ignore the return value.
430        BinaryDictionaryFileDumper.cacheWordListsFromContentProvider(locale, context,
431                hasDefaultWordList);
432        final File[] cachedWordLists = getCachedWordLists(locale.toString(), context);
433        final String mainDictId = getMainDictId(locale);
434        final DictPackSettings dictPackSettings = new DictPackSettings(context);
435
436        boolean foundMainDict = false;
437        final ArrayList<AssetFileAddress> fileList = CollectionUtils.newArrayList();
438        // cachedWordLists may not be null, see doc for getCachedDictionaryList
439        for (final File f : cachedWordLists) {
440            final String wordListId = getWordListIdFromFileName(f.getName());
441            final boolean canUse = f.canRead() && hackCanUseDictionaryFile(locale, f);
442            if (canUse && isMainWordListId(wordListId)) {
443                foundMainDict = true;
444            }
445            if (!dictPackSettings.isWordListActive(wordListId)) continue;
446            if (canUse) {
447                fileList.add(AssetFileAddress.makeFromFileName(f.getPath()));
448            } else {
449                Log.e(TAG, "Found a cached dictionary file but cannot read or use it");
450            }
451        }
452
453        if (!foundMainDict && dictPackSettings.isWordListActive(mainDictId)) {
454            final int fallbackResId =
455                    DictionaryFactory.getMainDictionaryResourceId(context.getResources(), locale);
456            final AssetFileAddress fallbackAsset = loadFallbackResource(context, fallbackResId);
457            if (null != fallbackAsset) {
458                fileList.add(fallbackAsset);
459            }
460        }
461
462        return fileList;
463    }
464}
465