/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.android.inputmethod.latin.dicttool;

import com.android.inputmethod.latin.makedict.FusionDictionary;
import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import com.android.inputmethod.latin.makedict.Word;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Reads and writes XML files for a FusionDictionary.
 *
 * All functions in this class are static.
 */
public class XmlDictInputOutput {

    private static final String ROOT_TAG = "wordlist";
    private static final String WORD_TAG = "w";
    private static final String BIGRAM_TAG = "bigram";
    private static final String SHORTCUT_TAG = "shortcut";
    private static final String FREQUENCY_ATTR = "f";
    private static final String WORD_ATTR = "word";
    private static final String NOT_A_WORD_ATTR = "not_a_word";

    private static final String OPTIONS_KEY = "options";
    private static final String GERMAN_UMLAUT_PROCESSING_OPTION = "german_umlaut_processing";
    private static final String FRENCH_LIGATURE_PROCESSING_OPTION = "french_ligature_processing";

    /**
     * SAX handler for a unigram XML file.
     *
     * The dictionary is created when the root tag is seen, and one word is added for each
     * word tag, using the text content of the tag as the word.
     */
    private static class UnigramHandler extends DefaultHandler {
        // Parser states
        private static final int START = 1;
        private static final int WORD = 2;
        private static final int UNKNOWN = 3;

        FusionDictionary mDictionary;
        int mState; // the state of the parser
        int mFreq; // the currently read freq
        String mWord; // the current word
        final HashMap<String, ArrayList<WeightedString>> mShortcutsMap;

        /**
         * Create the handler.
         *
         * @param shortcuts the shortcuts as a map. This may be empty, but may not be null.
         */
        public UnigramHandler(final HashMap<String, ArrayList<WeightedString>> shortcuts) {
            mDictionary = null;
            mShortcutsMap = shortcuts;
            mWord = "";
            mState = START;
            mFreq = 0;
        }

        /**
         * Finishes the dictionary and resets this handler.
         *
         * Every shortcut source that is not already a word of the dictionary is added as a
         * "not a word" entry with frequency 1, so that its shortcut targets are kept. The
         * handler state is then reset, and the shortcuts map passed to the constructor is
         * cleared.
         *
         * @return the dictionary built from the parsed unigram file.
         */
        public FusionDictionary getFinalDictionary() {
            final FusionDictionary dict = mDictionary;
            for (final String shortcutOnly : mShortcutsMap.keySet()) {
                if (dict.hasWord(shortcutOnly)) continue;
                dict.add(shortcutOnly, 1, mShortcutsMap.get(shortcutOnly), true /* isNotAWord */);
            }
            mDictionary = null;
            mShortcutsMap.clear();
            mWord = "";
            mState = START;
            mFreq = 0;
            return dict;
        }

        /**
         * Handles the start of a tag.
         *
         * A word tag switches to the WORD state and reads the frequency attribute. The root
         * tag creates the dictionary, passing all of its attributes as dictionary options.
         * Any other tag switches to the UNKNOWN state.
         */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs) {
            if (WORD_TAG.equals(localName)) {
                mState = WORD;
                mWord = "";
                // NOTE(review): mFreq is only assigned when the frequency attribute is
                // present, so a word tag without it keeps the previously read frequency.
                for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
                    final String attrName = attrs.getLocalName(attrIndex);
                    if (FREQUENCY_ATTR.equals(attrName)) {
                        mFreq = Integer.parseInt(attrs.getValue(attrIndex));
                    }
                }
            } else if (ROOT_TAG.equals(localName)) {
                final HashMap<String, String> attributes = new HashMap<String, String>();
                for (int attrIndex = 0; attrIndex < attrs.getLength(); ++attrIndex) {
                    final String attrName = attrs.getLocalName(attrIndex);
                    attributes.put(attrName, attrs.getValue(attrIndex));
                }
                final String optionsString = attributes.get(OPTIONS_KEY);
                final boolean processUmlauts =
                        GERMAN_UMLAUT_PROCESSING_OPTION.equals(optionsString);
                final boolean processLigatures =
                        FRENCH_LIGATURE_PROCESSING_OPTION.equals(optionsString);
                mDictionary = new FusionDictionary(new PtNodeArray(),
                        new DictionaryOptions(attributes, processUmlauts, processLigatures));
            } else {
                mState = UNKNOWN;
            }
        }

        /**
         * Accumulates the text content of the current word tag.
         */
        @Override
        public void characters(char[] ch, int start, int length) {
            if (WORD == mState) {
                // The XML parser is free to return text in arbitrary chunks one after the
                // other. In particular, this happens in some implementations when it finds
                // an escape code like "&amp;".
                mWord += String.copyValueOf(ch, start, length);
            }
        }

        /**
         * Adds the accumulated word to the dictionary when a tag ends in the WORD state.
         */
        @Override
        public void endElement(String uri, String localName, String qName) {
            if (WORD == mState) {
                mDictionary.add(mWord, mFreq, mShortcutsMap.get(mWord), false /* isNotAWord */);
                mState = START;
            }
        }
    }

    /**
     * SAX handler that builds a map from a source string to a list of weighted strings.
     *
     * The source is read from an attribute of the source tag. Each destination tag seen
     * afterwards adds one weighted string to the list of the most recently read source.
     */
    private static class AssociativeListHandler extends DefaultHandler {
        private final String SRC_TAG;
        private final String SRC_ATTRIBUTE;
        private final String DST_TAG;
        private final String DST_ATTRIBUTE;
        private final String DST_FREQ;

        // In this version of the XML file, the bigram frequency is given as an int 0..XML_MAX
        private final static int XML_MAX = 256;
        // In memory and in the binary dictionary the bigram frequency is 0..MEMORY_MAX
        private final static int MEMORY_MAX = 256;
        private final static int XML_TO_MEMORY_RATIO = XML_MAX / MEMORY_MAX;

        private String mSrc;
        private final HashMap<String, ArrayList<WeightedString>> mAssocMap;

        /**
         * Create the handler.
         *
         * @param srcTag the name of the tag holding the source string.
         * @param srcAttribute the name of the attribute of the source tag holding the source.
         * @param dstTag the name of the tag holding a destination string.
         * @param dstAttribute the name of the attribute holding the destination string.
         * @param dstFreq the name of the attribute holding the destination frequency.
         */
        public AssociativeListHandler(final String srcTag, final String srcAttribute,
                final String dstTag, final String dstAttribute, final String dstFreq) {
            SRC_TAG = srcTag;
            SRC_ATTRIBUTE = srcAttribute;
            DST_TAG = dstTag;
            DST_ATTRIBUTE = dstAttribute;
            DST_FREQ = dstFreq;
            mSrc = null;
            mAssocMap = new HashMap<String, ArrayList<WeightedString>>();
        }

        /**
         * Records the source on a source tag, and adds an association on a destination tag.
         */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs) {
            if (SRC_TAG.equals(localName)) {
                mSrc = attrs.getValue(uri, SRC_ATTRIBUTE);
            } else if (DST_TAG.equals(localName)) {
                String dst = attrs.getValue(uri, DST_ATTRIBUTE);
                int freq = getValueFromFreqString(attrs.getValue(uri, DST_FREQ));
                WeightedString bigram = new WeightedString(dst, freq / XML_TO_MEMORY_RATIO);
                ArrayList<WeightedString> bigramList = mAssocMap.get(mSrc);
                if (null == bigramList) bigramList = new ArrayList<WeightedString>();
                bigramList.add(bigram);
                mAssocMap.put(mSrc, bigramList);
            }
        }

        /**
         * Converts the frequency attribute value to an int.
         *
         * @param freqString the value of the frequency attribute.
         * @return the frequency as parsed by Integer.parseInt.
         */
        protected int getValueFromFreqString(final String freqString) {
            return Integer.parseInt(freqString);
        }

        // This may return an empty map, but will never return null.
        public HashMap<String, ArrayList<WeightedString>> getAssocMap() {
            return mAssocMap;
        }
    }

    /**
     * SAX handler for a bigram XML file.
     */
    private static class BigramHandler extends AssociativeListHandler {
        private final static String BIGRAM_W1_TAG = "bi";
        private final static String BIGRAM_W2_TAG = "w";
        private final static String BIGRAM_W1_ATTRIBUTE = "w1";
        private final static String BIGRAM_W2_ATTRIBUTE = "w2";
        private final static String BIGRAM_FREQ_ATTRIBUTE = "p";

        public BigramHandler() {
            super(BIGRAM_W1_TAG, BIGRAM_W1_ATTRIBUTE, BIGRAM_W2_TAG, BIGRAM_W2_ATTRIBUTE,
                    BIGRAM_FREQ_ATTRIBUTE);
        }

        // As per getAssocMap(), this never returns null.
        public HashMap<String, ArrayList<WeightedString>> getBigramMap() {
            return getAssocMap();
        }
    }

    /**
     * SAX handler for a shortcut &amp; whitelist XML file.
     */
    private static class ShortcutAndWhitelistHandler extends AssociativeListHandler {
        private final static String ENTRY_TAG = "entry";
        private final static String ENTRY_ATTRIBUTE = "shortcut";
        private final static String TARGET_TAG = "target";
        private final static String REPLACEMENT_ATTRIBUTE = "replacement";
        private final static String TARGET_PRIORITY_ATTRIBUTE = "priority";
        private final static String WHITELIST_MARKER = "whitelist";
        private final static int WHITELIST_FREQ_VALUE = 15;
        private final static int MIN_FREQ = 0;
        private final static int MAX_FREQ = 14;

        public ShortcutAndWhitelistHandler() {
            super(ENTRY_TAG, ENTRY_ATTRIBUTE, TARGET_TAG, REPLACEMENT_ATTRIBUTE,
                    TARGET_PRIORITY_ATTRIBUTE);
        }

        /**
         * Converts the priority attribute value to a frequency.
         *
         * @param freqString the whitelist marker, or an integer in MIN_FREQ..MAX_FREQ.
         * @return WHITELIST_FREQ_VALUE for the whitelist marker, the parsed integer otherwise.
         * @throws RuntimeException if the parsed integer is out of the accepted range.
         */
        @Override
        protected int getValueFromFreqString(final String freqString) {
            if (WHITELIST_MARKER.equals(freqString)) {
                return WHITELIST_FREQ_VALUE;
            } else {
                final int intValue = super.getValueFromFreqString(freqString);
                if (intValue < MIN_FREQ || intValue > MAX_FREQ) {
                    throw new RuntimeException("Shortcut freq out of range. Accepted range is "
                            + MIN_FREQ + ".." + MAX_FREQ);
                }
                return intValue;
            }
        }

        // As per getAssocMap(), this never returns null.
        public HashMap<String, ArrayList<WeightedString>> getShortcutAndWhitelistMap() {
            return getAssocMap();
        }
    }

    /**
     * Basic test to find out whether the file is in the unigram XML format or not.
     *
     * Concretely this only tests the header line.
     *
     * @param filename The name of the file to test.
     * @return true if the file is in the unigram XML format, false otherwise. This includes
     * files that are empty, that do not exist or that cannot be read.
     */
    public static boolean isXmlUnigramDictionary(final String filename) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(filename)));
            final String firstLine = reader.readLine();
            // readLine() returns null when the file is empty.
            return null != firstLine && firstLine.matches("^\\s*<wordlist .*>\\s*$");
        } catch (FileNotFoundException e) {
            return false;
        } catch (IOException e) {
            return false;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // do nothing
                }
            }
        }
    }

    /**
     * Reads a dictionary from an XML file.
     *
     * This is the public method that will parse an XML file and return the corresponding memory
     * representation.
     *
     * Bigrams are only set when both of their words are present in the dictionary.
     *
     * @param unigrams the file to read the data from.
     * @param shortcuts the file to read the shortcuts &amp; whitelist from, or null.
     * @param bigrams the file to read the bigrams from, or null.
     * @return the in-memory representation of the dictionary.
     */
    public static FusionDictionary readDictionaryXml(final InputStream unigrams,
            final InputStream shortcuts, final InputStream bigrams)
            throws SAXException, IOException, ParserConfigurationException {
        // NOTE(review): the parser is used with the factory defaults; DTDs and external
        // entities are not explicitly disabled. Confirm that inputs are always trusted.
        final SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        final SAXParser parser = factory.newSAXParser();
        final BigramHandler bigramHandler = new BigramHandler();
        if (null != bigrams) parser.parse(bigrams, bigramHandler);

        final ShortcutAndWhitelistHandler shortcutAndWhitelistHandler =
                new ShortcutAndWhitelistHandler();
        if (null != shortcuts) parser.parse(shortcuts, shortcutAndWhitelistHandler);

        final UnigramHandler unigramHandler =
                new UnigramHandler(shortcutAndWhitelistHandler.getShortcutAndWhitelistMap());
        parser.parse(unigrams, unigramHandler);
        final FusionDictionary dict = unigramHandler.getFinalDictionary();
        final HashMap<String, ArrayList<WeightedString>> bigramMap = bigramHandler.getBigramMap();
        for (final String firstWord : bigramMap.keySet()) {
            if (!dict.hasWord(firstWord)) continue;
            final ArrayList<WeightedString> bigramList = bigramMap.get(firstWord);
            for (final WeightedString bigram : bigramList) {
                if (!dict.hasWord(bigram.mWord)) continue;
                dict.setBigram(firstWord, bigram.mWord, bigram.mFrequency);
            }
        }
        return dict;
    }

    // The first, legacy XML format (the format used before Ice Cream Sandwich, which has no
    // support for bigrams or shortcuts/whitelist) has no reader in this class.

    /**
     * Replaces the characters that have a special meaning in XML with entity references.
     *
     * @param value the raw string. null is rendered as "null", as string concatenation does.
     * @return the string, safe to use as text or inside a double-quoted attribute value.
     */
    private static String escapeXml(final String value) {
        final String text = String.valueOf(value);
        final StringBuilder escaped = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); ++i) {
            final char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Writes a dictionary to an XML file.
     *
     * The output format is the "second" format, which supports bigrams and shortcuts/whitelist.
     * Words are written in the order of a TreeSet of Word. Words, shortcut targets, bigram
     * targets and option values are escaped so that the output stays well-formed.
     * The destination is closed once the dictionary has been written.
     *
     * @param destination a destination stream to write to.
     * @param dict the dictionary to write.
     * @throws IOException if writing to or closing the destination fails.
     */
    public static void writeDictionaryXml(Writer destination, FusionDictionary dict)
            throws IOException {
        final TreeSet<Word> set = new TreeSet<Word>();
        for (Word word : dict) {
            set.add(word);
        }
        // TODO: use an XMLSerializer if this gets big
        destination.write("<wordlist format=\"2\"");
        final HashMap<String, String> options = dict.mOptions.mAttributes;
        if (dict.mOptions.mGermanUmlautProcessing) {
            destination.write(" " + OPTIONS_KEY + "=\"" + GERMAN_UMLAUT_PROCESSING_OPTION + "\"");
        } else if (dict.mOptions.mFrenchLigatureProcessing) {
            destination.write(" " + OPTIONS_KEY + "=\"" + FRENCH_LIGATURE_PROCESSING_OPTION + "\"");
        }
        for (final String key : options.keySet()) {
            final String value = options.get(key);
            destination.write(" " + key + "=\"" + escapeXml(value) + "\"");
        }
        destination.write(">\n");
        destination.write("<!-- Warning: there is no code to read this format yet. -->\n");
        for (Word word : set) {
            destination.write(" <" + WORD_TAG + " " + WORD_ATTR + "=\"" + escapeXml(word.mWord)
                    + "\" " + FREQUENCY_ATTR + "=\"" + word.mFrequency
                    + (word.mIsNotAWord ? "\" " + NOT_A_WORD_ATTR + "=\"true" : "") + "\">");
            if (null != word.mShortcutTargets) {
                destination.write("\n");
                for (WeightedString target : word.mShortcutTargets) {
                    destination.write(" <" + SHORTCUT_TAG + " " + FREQUENCY_ATTR + "=\""
                            + target.mFrequency + "\">" + escapeXml(target.mWord) + "</"
                            + SHORTCUT_TAG + ">\n");
                }
                destination.write(" ");
            }
            if (null != word.mBigrams) {
                destination.write("\n");
                for (WeightedString bigram : word.mBigrams) {
                    destination.write(" <" + BIGRAM_TAG + " " + FREQUENCY_ATTR + "=\""
                            + bigram.mFrequency + "\">" + escapeXml(bigram.mWord) + "</"
                            + BIGRAM_TAG + ">\n");
                }
                destination.write(" ");
            }
            destination.write("</" + WORD_TAG + ">\n");
        }
        destination.write("</wordlist>\n");
        destination.close();
    }
}