1/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin;
18
19import com.android.inputmethod.latin.FusionDictionary.WeightedString;
20
21import java.io.IOException;
22import java.io.InputStream;
23import java.io.Writer;
24import java.util.ArrayList;
25import java.util.HashMap;
26import java.util.TreeSet;
27
28import javax.xml.parsers.ParserConfigurationException;
29import javax.xml.parsers.SAXParser;
30import javax.xml.parsers.SAXParserFactory;
31
32import org.xml.sax.Attributes;
33import org.xml.sax.SAXException;
34import org.xml.sax.helpers.DefaultHandler;
35
36/**
37 * Reads and writes XML files for a FusionDictionary.
38 *
39 * All functions in this class are static.
40 */
41public class XmlDictInputOutput {
42
43    private static final String WORD_TAG = "w";
44    private static final String BIGRAM_TAG = "bigram";
45    private static final String FREQUENCY_ATTR = "f";
46    private static final String WORD_ATTR = "word";
47
48    /**
49     * SAX handler for a unigram XML file.
50     */
51    static private class UnigramHandler extends DefaultHandler {
52        // Parser states
53        private static final int NONE = 0;
54        private static final int START = 1;
55        private static final int WORD = 2;
56        private static final int BIGRAM = 4;
57        private static final int END = 5;
58        private static final int UNKNOWN = 6;
59
60        final FusionDictionary mDictionary;
61        int mState; // the state of the parser
62        int mFreq; // the currently read freq
63        String mWord; // the current word
64        final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
65
66        /**
67         * Create the handler.
68         *
69         * @param dict the dictionary to construct.
70         * @param bigrams the bigrams as a map. This may be empty, but may not be null.
71         */
72        public UnigramHandler(FusionDictionary dict,
73                HashMap<String, ArrayList<WeightedString>> bigrams) {
74            mDictionary = dict;
75            mBigramsMap = bigrams;
76            mWord = "";
77            mState = START;
78            mFreq = 0;
79        }
80
81        @Override
82        public void startElement(String uri, String localName, String qName, Attributes attrs) {
83            if (WORD_TAG.equals(localName)) {
84                mState = WORD;
85                mWord = "";
86                for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
87                    final String attrName = attrs.getLocalName(attrIndex);
88                    if (FREQUENCY_ATTR.equals(attrName)) {
89                        mFreq = Integer.parseInt(attrs.getValue(attrIndex));
90                    }
91                }
92            } else {
93                mState = UNKNOWN;
94            }
95        }
96
97        @Override
98        public void characters(char[] ch, int start, int length) {
99            if (WORD == mState) {
100                // The XML parser is free to return text in arbitrary chunks one after the
101                // other. In particular, this happens in some implementations when it finds
102                // an escape code like "&amp;".
103                mWord += String.copyValueOf(ch, start, length);
104            }
105        }
106
107        @Override
108        public void endElement(String uri, String localName, String qName) {
109            if (WORD == mState) {
110                mDictionary.add(mWord, mFreq, mBigramsMap.get(mWord));
111                mState = START;
112            }
113        }
114    }
115
116    /**
117     * SAX handler for a bigram XML file.
118     */
119    static private class BigramHandler extends DefaultHandler {
120        private final static String BIGRAM_W1_TAG = "bi";
121        private final static String BIGRAM_W2_TAG = "w";
122        private final static String BIGRAM_W1_ATTRIBUTE = "w1";
123        private final static String BIGRAM_W2_ATTRIBUTE = "w2";
124        private final static String BIGRAM_FREQ_ATTRIBUTE = "p";
125
126        String mW1;
127        final HashMap<String, ArrayList<WeightedString>> mBigramsMap;
128
129        public BigramHandler() {
130            mW1 = null;
131            mBigramsMap = new HashMap<String, ArrayList<WeightedString>>();
132        }
133
134        @Override
135        public void startElement(String uri, String localName, String qName, Attributes attrs) {
136            if (BIGRAM_W1_TAG.equals(localName)) {
137                mW1 = attrs.getValue(uri, BIGRAM_W1_ATTRIBUTE);
138            } else if (BIGRAM_W2_TAG.equals(localName)) {
139                String w2 = attrs.getValue(uri, BIGRAM_W2_ATTRIBUTE);
140                int freq = Integer.parseInt(attrs.getValue(uri, BIGRAM_FREQ_ATTRIBUTE));
141                WeightedString bigram = new WeightedString(w2, freq / 8);
142                ArrayList<WeightedString> bigramList = mBigramsMap.get(mW1);
143                if (null == bigramList) bigramList = new ArrayList<WeightedString>();
144                bigramList.add(bigram);
145                mBigramsMap.put(mW1, bigramList);
146            }
147        }
148
149        public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
150            return mBigramsMap;
151        }
152    }
153
154    /**
155     * Reads a dictionary from an XML file.
156     *
157     * This is the public method that will parse an XML file and return the corresponding memory
158     * representation.
159     *
160     * @param unigrams the file to read the data from.
161     * @return the in-memory representation of the dictionary.
162     */
163    public static FusionDictionary readDictionaryXml(InputStream unigrams, InputStream bigrams)
164            throws SAXException, IOException, ParserConfigurationException {
165        final SAXParserFactory factory = SAXParserFactory.newInstance();
166        factory.setNamespaceAware(true);
167        final SAXParser parser = factory.newSAXParser();
168        final BigramHandler bigramHandler = new BigramHandler();
169        if (null != bigrams) parser.parse(bigrams, bigramHandler);
170
171        final FusionDictionary dict = new FusionDictionary();
172        final UnigramHandler unigramHandler =
173                new UnigramHandler(dict, bigramHandler.getBigramMap());
174        parser.parse(unigrams, unigramHandler);
175        return dict;
176    }
177
178    /**
179     * Reads a dictionary in the first, legacy XML format
180     *
181     * This method reads data from the parser and creates a new FusionDictionary with it.
182     * The format parsed by this method is the format used before Ice Cream Sandwich,
183     * which has no support for bigrams or shortcuts.
184     * It is important to note that this method expects the parser to have already eaten
185     * the first, all-encompassing tag.
186     *
187     * @param xpp the parser to read the data from.
188     * @return the parsed dictionary.
189     */
190
191    /**
192     * Writes a dictionary to an XML file.
193     *
194     * The output format is the "second" format, which supports bigrams and shortcuts.
195     *
196     * @param destination a destination stream to write to.
197     * @param dict the dictionary to write.
198     */
199    public static void writeDictionaryXml(Writer destination, FusionDictionary dict)
200            throws IOException {
201        final TreeSet<Word> set = new TreeSet<Word>();
202        for (Word word : dict) {
203            set.add(word);
204        }
205        // TODO: use an XMLSerializer if this gets big
206        destination.write("<wordlist format=\"2\">\n");
207        for (Word word : set) {
208            destination.write("  <" + WORD_TAG + " " + WORD_ATTR + "=\"" + word.mWord + "\" "
209                    + FREQUENCY_ATTR + "=\"" + word.mFrequency + "\">");
210            if (null != word.mBigrams) {
211                destination.write("\n");
212                for (WeightedString bigram : word.mBigrams) {
213                    destination.write("    <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\""
214                            + bigram.mFrequency + "\">" + bigram.mWord + "</" + BIGRAM_TAG + ">\n");
215                }
216                destination.write("  ");
217            }
218            destination.write("</" + WORD_TAG + ">\n");
219        }
220        destination.write("</wordlist>\n");
221        destination.close();
222    }
223}
224