1/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin.dicttool;
18
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils;
import com.android.inputmethod.latin.makedict.BinaryDictIOUtils;
import com.android.inputmethod.latin.makedict.DictDecoder;
import com.android.inputmethod.latin.makedict.DictEncoder;
import com.android.inputmethod.latin.makedict.FormatSpec;
import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.MakedictLog;
import com.android.inputmethod.latin.makedict.UnsupportedFormatException;
import com.android.inputmethod.latin.makedict.Ver2DictEncoder;
import com.android.inputmethod.latin.makedict.Ver4DictEncoder;

import org.xml.sax.SAXException;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.LinkedList;

import javax.xml.parsers.ParserConfigurationException;
46
47/**
48 * Main class/method for DictionaryMaker.
49 */
50public class DictionaryMaker {
51
52    static class Arguments {
53        private static final String OPTION_VERSION_2 = "-2";
54        private static final String OPTION_VERSION_4 = "-4";
55        private static final String OPTION_INPUT_SOURCE = "-s";
56        private static final String OPTION_INPUT_BIGRAM_XML = "-b";
57        private static final String OPTION_INPUT_SHORTCUT_XML = "-c";
58        private static final String OPTION_OUTPUT_BINARY = "-d";
59        private static final String OPTION_OUTPUT_XML = "-x";
60        private static final String OPTION_OUTPUT_COMBINED = "-o";
61        private static final String OPTION_HELP = "-h";
62        public final String mInputBinary;
63        public final String mInputCombined;
64        public final String mInputUnigramXml;
65        public final String mInputShortcutXml;
66        public final String mInputBigramXml;
67        public final String mOutputBinary;
68        public final String mOutputXml;
69        public final String mOutputCombined;
70        public final int mOutputBinaryFormatVersion;
71
72        private void checkIntegrity() throws IOException {
73            checkHasExactlyOneInput();
74            checkHasAtLeastOneOutput();
75            checkNotSameFile(mInputBinary, mOutputBinary);
76            checkNotSameFile(mInputBinary, mOutputXml);
77            checkNotSameFile(mInputCombined, mOutputBinary);
78            checkNotSameFile(mInputCombined, mOutputXml);
79            checkNotSameFile(mInputUnigramXml, mOutputBinary);
80            checkNotSameFile(mInputUnigramXml, mOutputXml);
81            checkNotSameFile(mInputUnigramXml, mOutputCombined);
82            checkNotSameFile(mInputShortcutXml, mOutputBinary);
83            checkNotSameFile(mInputShortcutXml, mOutputXml);
84            checkNotSameFile(mInputShortcutXml, mOutputCombined);
85            checkNotSameFile(mInputBigramXml, mOutputBinary);
86            checkNotSameFile(mInputBigramXml, mOutputXml);
87            checkNotSameFile(mInputBigramXml, mOutputCombined);
88            checkNotSameFile(mOutputBinary, mOutputXml);
89            checkNotSameFile(mOutputBinary, mOutputCombined);
90            checkNotSameFile(mOutputXml, mOutputCombined);
91        }
92
93        private void checkHasExactlyOneInput() {
94            if (null == mInputUnigramXml && null == mInputBinary && null == mInputCombined) {
95                throw new RuntimeException("No input file specified");
96            } else if ((null != mInputUnigramXml && null != mInputBinary)
97                    || (null != mInputUnigramXml && null != mInputCombined)
98                    || (null != mInputBinary && null != mInputCombined)) {
99                throw new RuntimeException("Several input files specified");
100            } else if ((null != mInputBinary || null != mInputCombined)
101                    && (null != mInputBigramXml || null != mInputShortcutXml)) {
102                throw new RuntimeException("Separate bigrams/shortcut files are only supported"
103                        + " with XML input (other formats include bigrams and shortcuts already)");
104            }
105        }
106
107        private void checkHasAtLeastOneOutput() {
108            if (null == mOutputBinary && null == mOutputXml && null == mOutputCombined) {
109                throw new RuntimeException("No output specified");
110            }
111        }
112
113        /**
114         * Utility method that throws an exception if path1 and path2 point to the same file.
115         */
116        private static void checkNotSameFile(final String path1, final String path2)
117                throws IOException {
118            if (null == path1 || null == path2) return;
119            if (new File(path1).getCanonicalPath().equals(new File(path2).getCanonicalPath())) {
120                throw new RuntimeException(path1 + " and " + path2 + " are the same file: "
121                        + " refusing to process.");
122            }
123        }
124
125        private void displayHelp() {
126            MakedictLog.i(getHelp());
127        }
128
129        public static String getHelp() {
130            return "Usage: makedict "
131                    + "[-s <unigrams.xml> [-b <bigrams.xml>] [-c <shortcuts_and_whitelist.xml>] "
132                    + "| [-s <combined format input]"
133                    + "| [-s <binary input>] [-d <binary output>] [-x <xml output>] "
134                    + " [-o <combined output>]"
135                    + "[-2] [-3] [-4]\n"
136                    + "\n"
137                    + "  Converts a source dictionary file to one or several outputs.\n"
138                    + "  Source can be an XML file, with an optional XML bigrams file, or a\n"
139                    + "  binary dictionary file.\n"
140                    + "  Binary version 2 (Jelly Bean), 3, 4, XML and\n"
141                    + "  combined format outputs are supported.";
142        }
143
144        public Arguments(String[] argsArray) throws IOException {
145            final LinkedList<String> args = new LinkedList<>(Arrays.asList(argsArray));
146            if (args.isEmpty()) {
147                displayHelp();
148            }
149            String inputBinary = null;
150            String inputCombined = null;
151            String inputUnigramXml = null;
152            String inputShortcutXml = null;
153            String inputBigramXml = null;
154            String outputBinary = null;
155            String outputXml = null;
156            String outputCombined = null;
157            int outputBinaryFormatVersion = 2; // the default version is 2.
158
159            while (!args.isEmpty()) {
160                final String arg = args.get(0);
161                args.remove(0);
162                if (arg.charAt(0) == '-') {
163                    if (OPTION_VERSION_2.equals(arg)) {
164                        // Do nothing, this is the default
165                    } else if (OPTION_VERSION_4.equals(arg)) {
166                        outputBinaryFormatVersion = FormatSpec.VERSION4;
167                    } else if (OPTION_HELP.equals(arg)) {
168                        displayHelp();
169                    } else {
170                        // All these options need an argument
171                        if (args.isEmpty()) {
172                            throw new IllegalArgumentException("Option " + arg + " is unknown or "
173                                    + "requires an argument");
174                        }
175                        String filename = args.get(0);
176                        args.remove(0);
177                        if (OPTION_INPUT_SOURCE.equals(arg)) {
178                            if (XmlDictInputOutput.isXmlUnigramDictionary(filename)) {
179                                inputUnigramXml = filename;
180                            } else if (CombinedInputOutput.isCombinedDictionary(filename)) {
181                                inputCombined = filename;
182                            } else if (BinaryDictDecoderUtils.isBinaryDictionary(filename)) {
183                                inputBinary = filename;
184                            } else {
185                                throw new IllegalArgumentException(
186                                        "Unknown format for file " + filename);
187                            }
188                        } else if (OPTION_INPUT_SHORTCUT_XML.equals(arg)) {
189                            inputShortcutXml = filename;
190                        } else if (OPTION_INPUT_BIGRAM_XML.equals(arg)) {
191                            inputBigramXml = filename;
192                        } else if (OPTION_OUTPUT_BINARY.equals(arg)) {
193                            outputBinary = filename;
194                        } else if (OPTION_OUTPUT_XML.equals(arg)) {
195                            outputXml = filename;
196                        } else if (OPTION_OUTPUT_COMBINED.equals(arg)) {
197                            outputCombined = filename;
198                        } else {
199                            throw new IllegalArgumentException("Unknown option : " + arg);
200                        }
201                    }
202                } else {
203                    if (null == inputBinary && null == inputUnigramXml) {
204                        if (BinaryDictDecoderUtils.isBinaryDictionary(arg)) {
205                            inputBinary = arg;
206                        } else if (CombinedInputOutput.isCombinedDictionary(arg)) {
207                            inputCombined = arg;
208                        } else {
209                            inputUnigramXml = arg;
210                        }
211                    } else if (null == outputBinary) {
212                        outputBinary = arg;
213                    } else {
214                        throw new IllegalArgumentException("Several output binary files specified");
215                    }
216                }
217            }
218
219            mInputBinary = inputBinary;
220            mInputCombined = inputCombined;
221            mInputUnigramXml = inputUnigramXml;
222            mInputShortcutXml = inputShortcutXml;
223            mInputBigramXml = inputBigramXml;
224            mOutputBinary = outputBinary;
225            mOutputXml = outputXml;
226            mOutputCombined = outputCombined;
227            mOutputBinaryFormatVersion = outputBinaryFormatVersion;
228            checkIntegrity();
229        }
230    }
231
232    public static void main(String[] args)
233            throws FileNotFoundException, ParserConfigurationException, SAXException, IOException,
234            UnsupportedFormatException {
235        final Arguments parsedArgs = new Arguments(args);
236        FusionDictionary dictionary = readInputFromParsedArgs(parsedArgs);
237        writeOutputToParsedArgs(parsedArgs, dictionary);
238    }
239
240    /**
241     * Invoke the right input method according to args.
242     *
243     * @param args the parsed command line arguments.
244     * @return the read dictionary.
245     */
246    private static FusionDictionary readInputFromParsedArgs(final Arguments args)
247            throws IOException, UnsupportedFormatException, ParserConfigurationException,
248            SAXException, FileNotFoundException {
249        if (null != args.mInputBinary) {
250            return readBinaryFile(args.mInputBinary);
251        } else if (null != args.mInputCombined) {
252            return readCombinedFile(args.mInputCombined);
253        } else if (null != args.mInputUnigramXml) {
254            return readXmlFile(args.mInputUnigramXml, args.mInputShortcutXml, args.mInputBigramXml);
255        } else {
256            throw new RuntimeException("No input file specified");
257        }
258    }
259
260    /**
261     * Read a dictionary from the name of a binary file.
262     *
263     * @param binaryFilename the name of the file in the binary dictionary format.
264     * @return the read dictionary.
265     * @throws FileNotFoundException if the file can't be found
266     * @throws IOException if the input file can't be read
267     * @throws UnsupportedFormatException if the binary file is not in the expected format
268     */
269    private static FusionDictionary readBinaryFile(final String binaryFilename)
270            throws FileNotFoundException, IOException, UnsupportedFormatException {
271        final File file = new File(binaryFilename);
272        final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length());
273        return dictDecoder.readDictionaryBinary(false /* deleteDictIfBroken */);
274    }
275
276    /**
277     * Read a dictionary from the name of a combined file.
278     *
279     * @param combinedFilename the name of the file in the combined format.
280     * @return the read dictionary.
281     * @throws FileNotFoundException if the file can't be found
282     * @throws IOException if the input file can't be read
283     */
284    private static FusionDictionary readCombinedFile(final String combinedFilename)
285        throws FileNotFoundException, IOException {
286        try (final BufferedReader reader = new BufferedReader(new InputStreamReader(
287                new FileInputStream(combinedFilename), "UTF-8"))
288        ) {
289            return CombinedInputOutput.readDictionaryCombined(reader);
290        }
291    }
292
293    private static BufferedInputStream getBufferedFileInputStream(final String filename)
294            throws FileNotFoundException {
295        if (filename == null) {
296            return null;
297        }
298        return new BufferedInputStream(new FileInputStream(filename));
299    }
300
301    /**
302     * Read a dictionary from a unigram XML file, and optionally a bigram XML file.
303     *
304     * @param unigramXmlFilename the name of the unigram XML file. May not be null.
305     * @param shortcutXmlFilename the name of the shortcut/whitelist XML file, or null if none.
306     * @param bigramXmlFilename the name of the bigram XML file. Pass null if there are no bigrams.
307     * @return the read dictionary.
308     * @throws FileNotFoundException if one of the files can't be found
309     * @throws SAXException if one or more of the XML files is not well-formed
310     * @throws IOException if one the input files can't be read
311     * @throws ParserConfigurationException if the system can't create a SAX parser
312     */
313    private static FusionDictionary readXmlFile(final String unigramXmlFilename,
314            final String shortcutXmlFilename, final String bigramXmlFilename)
315            throws FileNotFoundException, SAXException, IOException, ParserConfigurationException {
316        try (
317            final BufferedInputStream unigrams = getBufferedFileInputStream(unigramXmlFilename);
318            final BufferedInputStream shortcuts = getBufferedFileInputStream(shortcutXmlFilename);
319            final BufferedInputStream bigrams = getBufferedFileInputStream(bigramXmlFilename);
320        ) {
321            return XmlDictInputOutput.readDictionaryXml(unigrams, shortcuts, bigrams);
322        }
323    }
324
325    /**
326     * Invoke the right output method according to args.
327     *
328     * This will write the passed dictionary to the file(s) passed in the command line arguments.
329     * @param args the parsed arguments.
330     * @param dict the file to output.
331     * @throws FileNotFoundException if one of the output files can't be created.
332     * @throws IOException if one of the output files can't be written to.
333     */
334    private static void writeOutputToParsedArgs(final Arguments args, final FusionDictionary dict)
335            throws FileNotFoundException, IOException, UnsupportedFormatException,
336            IllegalArgumentException {
337        if (null != args.mOutputBinary) {
338            writeBinaryDictionary(args.mOutputBinary, dict, args.mOutputBinaryFormatVersion);
339        }
340        if (null != args.mOutputXml) {
341            writeXmlDictionary(args.mOutputXml, dict);
342        }
343        if (null != args.mOutputCombined) {
344            writeCombinedDictionary(args.mOutputCombined, dict);
345        }
346    }
347
348    /**
349     * Write the dictionary in binary format to the specified filename.
350     *
351     * @param outputFilename the name of the file to write to.
352     * @param dict the dictionary to write.
353     * @param version the binary format version to use.
354     * @throws FileNotFoundException if the output file can't be created.
355     * @throws IOException if the output file can't be written to.
356     */
357    private static void writeBinaryDictionary(final String outputFilename,
358            final FusionDictionary dict, final int version)
359            throws FileNotFoundException, IOException, UnsupportedFormatException {
360        final File outputFile = new File(outputFilename);
361        final FormatSpec.FormatOptions formatOptions = new FormatSpec.FormatOptions(version);
362        final DictEncoder dictEncoder;
363        if (version == FormatSpec.VERSION4) {
364            dictEncoder = new Ver4DictEncoder(outputFile);
365        } else {
366            dictEncoder = new Ver2DictEncoder(outputFile);
367        }
368        dictEncoder.writeDictionary(dict, formatOptions);
369    }
370
371    /**
372     * Write the dictionary in XML format to the specified filename.
373     *
374     * @param outputFilename the name of the file to write to.
375     * @param dict the dictionary to write.
376     * @throws FileNotFoundException if the output file can't be created.
377     * @throws IOException if the output file can't be written to.
378     */
379    private static void writeXmlDictionary(final String outputFilename,
380            final FusionDictionary dict) throws FileNotFoundException, IOException {
381        try (final BufferedWriter writer = new BufferedWriter(new FileWriter(outputFilename))) {
382            XmlDictInputOutput.writeDictionaryXml(writer, dict);
383        }
384    }
385
386    /**
387     * Write the dictionary in the combined format to the specified filename.
388     *
389     * @param outputFilename the name of the file to write to.
390     * @param dict the dictionary to write.
391     * @throws FileNotFoundException if the output file can't be created.
392     * @throws IOException if the output file can't be written to.
393     */
394    private static void writeCombinedDictionary(final String outputFilename,
395            final FusionDictionary dict) throws FileNotFoundException, IOException {
396        try (final BufferedWriter writer = new BufferedWriter(new FileWriter(outputFilename))) {
397            CombinedInputOutput.writeDictionaryCombined(writer, dict);
398        }
399    }
400}
401