1/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin.dicttool;
18
19import com.android.inputmethod.latin.makedict.FusionDictionary;
20import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
21import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
22import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
23import com.android.inputmethod.latin.makedict.Word;
24
25import java.io.BufferedReader;
26import java.io.File;
27import java.io.FileNotFoundException;
28import java.io.FileReader;
29import java.io.IOException;
30import java.io.InputStream;
31import java.io.Writer;
32import java.util.ArrayList;
33import java.util.HashMap;
34import java.util.TreeSet;
35
36import javax.xml.parsers.ParserConfigurationException;
37import javax.xml.parsers.SAXParser;
38import javax.xml.parsers.SAXParserFactory;
39
40import org.xml.sax.Attributes;
41import org.xml.sax.SAXException;
42import org.xml.sax.helpers.DefaultHandler;
43
44/**
45 * Reads and writes XML files for a FusionDictionary.
46 *
47 * All functions in this class are static.
48 */
49public class XmlDictInputOutput {
50
    // Local name of the root element; UnigramHandler creates the dictionary when it sees it.
    private static final String ROOT_TAG = "wordlist";
    // Element holding one word, both when reading unigrams and when writing a dictionary.
    private static final String WORD_TAG = "w";
    // Child elements of a word element emitted by writeDictionaryXml.
    private static final String BIGRAM_TAG = "bigram";
    private static final String SHORTCUT_TAG = "shortcut";
    // Frequency attribute: read from word elements, written on word, shortcut and bigram
    // elements.
    private static final String FREQUENCY_ATTR = "f";
    // Attributes of a word element that are only used by writeDictionaryXml.
    private static final String WORD_ATTR = "word";
    private static final String NOT_A_WORD_ATTR = "not_a_word";

    // Root element attribute whose value selects one of the processing options below.
    private static final String OPTIONS_KEY = "options";
    private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing";
    private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing";
62
63    /**
64     * SAX handler for a unigram XML file.
65     */
66    static private class UnigramHandler extends DefaultHandler {
67        // Parser states
68        private static final int START = 1;
69        private static final int WORD = 2;
70        private static final int UNKNOWN = 3;
71
72        FusionDictionary mDictionary;
73        int mState; // the state of the parser
74        int mFreq; // the currently read freq
75        String mWord; // the current word
76        final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;
77
78        /**
79         * Create the handler.
80         *
81         * @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
82         */
83        public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
84            mDictionary = null;
85            mShortcutsMap = shortcuts;
86            mWord = "";
87            mState = START;
88            mFreq = 0;
89        }
90
91        public FusionDictionary getFinalDictionary() {
92            final FusionDictionary dict = mDictionary;
93            for (final String shortcutOnly : mShortcutsMap.keySet()) {
94                if (dict.hasWord(shortcutOnly)) continue;
95                dict.add(shortcutOnly, 1, mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
96            }
97            mDictionary = null;
98            mShortcutsMap.clear();
99            mWord = "";
100            mState = START;
101            mFreq = 0;
102            return dict;
103        }
104
105        @Override
106        public void startElement(String uri, String localName, String qName, Attributes attrs) {
107            if (WORD_TAG.equals(localName)) {
108                mState = WORD;
109                mWord = "";
110                for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
111                    final String attrName = attrs.getLocalName(attrIndex);
112                    if (FREQUENCY_ATTR.equals(attrName)) {
113                        mFreq = Integer.parseInt(attrs.getValue(attrIndex));
114                    }
115                }
116            } else if (ROOT_TAG.equals(localName)) {
117                final HashMap<String, String> attributes = new HashMap<String, String>();
118                for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
119                    final String attrName = attrs.getLocalName(attrIndex);
120                    attributes.put(attrName, attrs.getValue(attrIndex));
121                }
122                final String optionsString = attributes.get(OPTIONS_KEY);
123                final boolean processUmlauts =
124                        GERMAN_UMLAUT_PROCESSING_OPTION.equals(optionsString);
125                final boolean processLigatures =
126                        FRENCH_LIGATURE_PROCESSING_OPTION.equals(optionsString);
127                mDictionary = new FusionDictionary(new PtNodeArray(),
128                        new DictionaryOptions(attributes, processUmlauts, processLigatures));
129            } else {
130                mState = UNKNOWN;
131            }
132        }
133
134        @Override
135        public void characters(char[] ch, int start, int length) {
136            if (WORD == mState) {
137                // The XML parser is free to return text in arbitrary chunks one after the
138                // other. In particular, this happens in some implementations when it finds
139                // an escape code like "&amp;".
140                mWord += String.copyValueOf(ch, start, length);
141            }
142        }
143
144        @Override
145        public void endElement(String uri, String localName, String qName) {
146            if (WORD == mState) {
147                mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), false /* isNotAWord */);
148                mState = START;
149            }
150        }
151    }
152
153    static private class AssociativeListHandler extends DefaultHandler {
154        private final String SRC_TAG;
155        private final String SRC_ATTRIBUTE;
156        private final String DST_TAG;
157        private final String DST_ATTRIBUTE;
158        private final String DST_FREQ;
159
160        // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX
161        private final static int XML_MAX = 256;
162        // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX
163        private final static int MEMORY_MAX = 256;
164        private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX;
165
166        private String mSrc;
167        private final HashMap<String, ArrayList<WeightedString>> mAssocMap;
168
169        public AssociativeListHandler(final String srcTag, final String srcAttribute,
170                final String dstTag, final String dstAttribute, final String dstFreq) {
171            SRC_TAG = srcTag;
172            SRC_ATTRIBUTE = srcAttribute;
173            DST_TAG = dstTag;
174            DST_ATTRIBUTE = dstAttribute;
175            DST_FREQ = dstFreq;
176            mSrc = null;
177            mAssocMap = new HashMap<String, ArrayList<WeightedString>>();
178        }
179
180        @Override
181        public void startElement(String uri, String localName, String qName, Attributes attrs) {
182            if (SRC_TAG.equals(localName)) {
183                mSrc = attrs.getValue(uri, SRC_ATTRIBUTE);
184            } else if (DST_TAG.equals(localName)) {
185                String dst = attrs.getValue(uri, DST_ATTRIBUTE);
186                int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ));
187                WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO);
188                ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc);
189                if (null == bigramList) bigramList = new ArrayList<WeightedString>();
190                bigramList.add(bigram);
191                mAssocMap.put(mSrc, bigramList);
192            }
193        }
194
195        protected int getValueFromFreqString(final String freqString) {
196            return Integer.parseInt(freqString);
197        }
198
199        // This may return an empty map, but will never return null.
200        public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
201            return mAssocMap;
202        }
203    }
204
205    /**
206     * SAX handler for a bigram XML file.
207     */
208    static private class BigramHandler extends AssociativeListHandler {
209        private final static String BIGRAM_W1_TAG = "bi";
210        private final static String BIGRAM_W2_TAG = "w";
211        private final static String BIGRAM_W1_ATTRIBUTE = "w1";
212        private final static String BIGRAM_W2_ATTRIBUTE = "w2";
213        private final static String BIGRAM_FREQ_ATTRIBUTE = "p";
214
215        public BigramHandler() {
216            super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE,
217                    BIGRAM_FREQ_ATTRIBUTE);
218        }
219
220        // As per getAssocMap(), this never returns null.
221        public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
222            return getAssocMap();
223        }
224    }
225
226    /**
227     * SAX handler for a shortcut & whitelist XML file.
228     */
229    static private class ShortcutAndWhitelistHandler extends AssociativeListHandler {
230        private final static String ENTRY_TAG = "entry";
231        private final static String ENTRY_ATTRIBUTE = "shortcut";
232        private final static String TARGET_TAG = "target";
233        private final static String REPLACEMENT_ATTRIBUTE = "replacement";
234        private final static String TARGET_PRIORITY_ATTRIBUTE = "priority";
235        private final static String WHITELIST_MARKER = "whitelist";
236        private final static int WHITELIST_FREQ_VALUE = 15;
237        private final static int MIN_FREQ = 0;
238        private final static int MAX_FREQ = 14;
239
240        public ShortcutAndWhitelistHandler() {
241            super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE,
242                    TARGET_PRIORITY_ATTRIBUTE);
243        }
244
245        @Override
246        protected int getValueFromFreqString(final String freqString) {
247            if (WHITELIST_MARKER.equals(freqString)) {
248                return WHITELIST_FREQ_VALUE;
249            } else {
250                final int intValue = super.getValueFromFreqString(freqString);
251                if (intValue < MIN_FREQ || intValue > MAX_FREQ) {
252                    throw new RuntimeException("Shortcut freq out of range. Accepted range is "
253                            + MIN_FREQ + ".." + MAX_FREQ);
254                }
255                return intValue;
256            }
257        }
258
259        // As per getAssocMap(), this never returns null.
260        public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() {
261            return getAssocMap();
262        }
263    }
264
265    /**
266     * Basic test to find out whether the file is in the unigram XML format or not.
267     *
268     * Concretely this only tests the header line.
269     *
270     * @param filename The name of the file to test.
271     * @return true if the file is in the unigram XML format, false otherwise
272     */
273    public static boolean isXmlUnigramDictionary(final String filename) {
274        BufferedReader reader = null;
275        try {
276            reader = new BufferedReader(new FileReader(new File(filename)));
277            final String firstLine = reader.readLine();
278            return firstLine.matches("^\\s*<wordlist .*>\\s*$");
279        } catch (FileNotFoundException e) {
280            return false;
281        } catch (IOException e) {
282            return false;
283        } finally {
284            if (reader != null) {
285                try {
286                    reader.close();
287                } catch (IOException e) {
288                    // do nothing
289                }
290            }
291        }
292    }
293
294    /**
295     * Reads a dictionary from an XML file.
296     *
297     * This is the public method that will parse an XML file and return the corresponding memory
298     * representation.
299     *
300     * @param unigrams the file to read the data from.
301     * @param shortcuts the file to read the shortcuts & whitelist from, or null.
302     * @param bigrams the file to read the bigrams from, or null.
303     * @return the in-memory representation of the dictionary.
304     */
305    public static FusionDictionary readDictionaryXml(final InputStream unigrams,
306            final InputStream shortcuts, final InputStream bigrams)
307            throws SAXException, IOException, ParserConfigurationException {
308        final SAXParserFactory factory = SAXParserFactory.newInstance();
309        factory.setNamespaceAware(true);
310        final SAXParser parser = factory.newSAXParser();
311        final BigramHandler bigramHandler = new BigramHandler();
312        if (null != bigrams) parser.parse(bigrams, bigramHandler);
313
314        final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler =
315                new ShortcutAndWhitelistHandler();
316        if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler);
317
318        final UnigramHandler unigramHandler =
319                new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap());
320        parser.parse(unigrams, unigramHandler);
321        final FusionDictionary dict = unigramHandler.getFinalDictionary();
322        final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
323        for (final String firstWord : bigramMap.keySet()) {
324            if (!dict.hasWord(firstWord)) continue;
325            final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
326            for (final WeightedString bigram : bigramList) {
327                if (!dict.hasWord(bigram.mWord)) continue;
328                dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency);
329            }
330        }
331        return dict;
332    }
333
    /*
     * NOTE: this comment is not attached to any method. It describes a reader for the first,
     * legacy XML format, and no such method follows it in this class.
     *
     * Reads a dictionary in the first, legacy XML format.
     *
     * This method reads data from the parser and creates a new FusionDictionary with it.
     * The format parsed by this method is the format used before Ice Cream Sandwich,
     * which has no support for bigrams or shortcuts/whitelist.
     * It is important to note that this method expects the parser to have already eaten
     * the first, all-encompassing tag.
     *
     * @param xpp the parser to read the data from.
     * @return the parsed dictionary.
     */
346
347    /**
348     * Writes a dictionary to an XML file.
349     *
350     * The output format is the "second" format, which supports bigrams and shortcuts/whitelist.
351     *
352     * @param destination a destination stream to write to.
353     * @param dict the dictionary to write.
354     */
355    public static void writeDictionaryXml(Writer destination, FusionDictionary dict)
356            throws IOException {
357        final TreeSet<Word> set = new TreeSet<Word>();
358        for (Word word : dict) {
359            set.add(word);
360        }
361        // TODO: use an XMLSerializer if this gets big
362        destination.write("<wordlist format=\"2\"");
363        final HashMap<String, String> options = dict.mOptions.mAttributes;
364        if (dict.mOptions.mGermanUmlautProcessing) {
365            destination.write(" " + OPTIONS_KEY + "=\"" + GERMAN_UMLAUT_PROCESSING_OPTION + "\"");
366        } else if (dict.mOptions.mFrenchLigatureProcessing) {
367            destination.write(" " + OPTIONS_KEY + "=\"" + FRENCH_LIGATURE_PROCESSING_OPTION + "\"");
368        }
369        for (final String key : dict.mOptions.mAttributes.keySet()) {
370            final String value = dict.mOptions.mAttributes.get(key);
371            destination.write(" " + key + "=\"" + value + "\"");
372        }
373        destination.write(">\n");
374        destination.write("<!-- Warning: there is no code to read this format yet. -->\n");
375        for (Word word : set) {
376            destination.write("  <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" "
377                    + FREQUENCY_ATTR + "=\"" + word.mFrequency
378                    + (word.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "") + "\">");
379            if (null != word.mShortcutTargets) {
380                destination.write("\n");
381                for (WeightedString target : word.mShortcutTargets) {
382                    destination.write("    <" + SHORTCUT_TAG + " " + FREQUENCY_ATTR + "=\""
383                            + target.mFrequency + "\">" + target.mWord + "</" + SHORTCUT_TAG
384                            + ">\n");
385                }
386                destination.write("  ");
387            }
388            if (null != word.mBigrams) {
389                destination.write("\n");
390                for (WeightedString bigram : word.mBigrams) {
391                    destination.write("    <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\""
392                            + bigram.mFrequency + "\">" + bigram.mWord + "</" + BIGRAM_TAG + ">\n");
393                }
394                destination.write("  ");
395            }
396            destination.write("</" + WORD_TAG + ">\n");
397        }
398        destination.write("</wordlist>\n");
399        destination.close();
400    }
401}
402