BinaryDictIOUtils.java revision 7f438aa12f683ac15c8c8e60ca852412d00128db
1/* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17package com.android.inputmethod.latin.makedict; 18 19import com.android.inputmethod.latin.Constants; 20import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding; 21import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface; 22import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; 23import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; 24import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; 25import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; 26 27import java.io.IOException; 28import java.io.OutputStream; 29import java.util.ArrayList; 30import java.util.Iterator; 31import java.util.List; 32import java.util.Map; 33import java.util.Stack; 34 35public final class BinaryDictIOUtils { 36 private static final boolean DBG = false; 37 38 private static final class Position { 39 public static final int NOT_READ_GROUPCOUNT = -1; 40 41 public int mAddress; 42 public int mNumOfCharGroup; 43 public int mPosition; 44 public int mLength; 45 46 public Position(int address, int length) { 47 mAddress = address; 48 mLength = length; 49 mNumOfCharGroup = NOT_READ_GROUPCOUNT; 50 } 51 } 52 53 /** 54 * Tours all node without recursive call. 55 */ 56 private static void readUnigramsAndBigramsBinaryInner( 57 final FusionDictionaryBufferInterface buffer, final int headerSize, 58 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 59 final Map<Integer, ArrayList<PendingAttribute>> bigrams, 60 final FormatOptions formatOptions) { 61 int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1]; 62 63 Stack<Position> stack = new Stack<Position>(); 64 int index = 0; 65 66 Position initPos = new Position(headerSize, 0); 67 stack.push(initPos); 68 69 while (!stack.empty()) { 70 Position p = stack.peek(); 71 72 if (DBG) { 73 MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" + 74 p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength); 75 } 76 77 if (buffer.position() != p.mAddress) buffer.position(p.mAddress); 78 if (index != p.mLength) index = p.mLength; 79 80 if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) { 81 p.mNumOfCharGroup = BinaryDictInputOutput.readCharGroupCount(buffer); 82 p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup); 83 p.mPosition = 0; 84 } 85 if (p.mNumOfCharGroup == 0) { 86 stack.pop(); 87 continue; 88 } 89 CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer, 90 p.mAddress - headerSize, formatOptions); 91 for (int i = 0; i < info.mCharacters.length; ++i) { 92 pushedChars[index++] = info.mCharacters[i]; 93 } 94 p.mPosition++; 95 96 final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags, 97 formatOptions); 98 if (!isMovedGroup 99 && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word 100 words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); 101 frequencies.put(info.mOriginalAddress, info.mFrequency); 102 if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams); 103 } 104 105 if (p.mPosition == p.mNumOfCharGroup) { 106 if (formatOptions.mSupportsDynamicUpdate) { 107 final int forwardLinkAddress = buffer.readUnsignedInt24(); 108 if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) { 109 // the node has a forward link. 110 p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT; 111 p.mAddress = forwardLinkAddress; 112 } else { 113 stack.pop(); 114 } 115 } else { 116 stack.pop(); 117 } 118 } else { 119 // the node has more groups. 120 p.mAddress = buffer.position(); 121 } 122 123 if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) { 124 Position childrenPos = new Position(info.mChildrenAddress + headerSize, index); 125 stack.push(childrenPos); 126 } 127 } 128 } 129 130 /** 131 * Reads unigrams and bigrams from the binary file. 132 * Doesn't make the memory representation of the dictionary. 133 * 134 * @param buffer the buffer to read. 135 * @param words the map to store the address as a key and the word as a value. 136 * @param frequencies the map to store the address as a key and the frequency as a value. 137 * @param bigrams the map to store the address as a key and the list of address as a value. 138 * @throws IOException 139 * @throws UnsupportedFormatException 140 */ 141 public static void readUnigramsAndBigramsBinary(final FusionDictionaryBufferInterface buffer, 142 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 143 final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, 144 UnsupportedFormatException { 145 // Read header 146 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 147 readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams, 148 header.mFormatOptions); 149 } 150 151 /** 152 * Gets the address of the last CharGroup of the exact matching word in the dictionary. 153 * If no match is found, returns NOT_VALID_WORD. 154 * 155 * @param buffer the buffer to read. 156 * @param word the word we search for. 157 * @return the address of the terminal node. 158 * @throws IOException 159 * @throws UnsupportedFormatException 160 */ 161 public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer, 162 final String word) throws IOException, UnsupportedFormatException { 163 if (word == null) return FormatSpec.NOT_VALID_WORD; 164 if (buffer.position() != 0) buffer.position(0); 165 166 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 167 int wordPos = 0; 168 final int wordLen = word.codePointCount(0, word.length()); 169 for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) { 170 if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; 171 172 do { 173 int groupOffset = buffer.position() - header.mHeaderSize; 174 final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); 175 groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount); 176 177 boolean foundNextCharGroup = false; 178 for (int i = 0; i < charGroupCount; ++i) { 179 final int charGroupPos = buffer.position(); 180 final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, 181 buffer.position(), header.mFormatOptions); 182 if (BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, 183 header.mFormatOptions)) { 184 continue; 185 } 186 boolean same = true; 187 for (int p = 0, j = word.offsetByCodePoints(0, wordPos); 188 p < currentInfo.mCharacters.length; 189 ++p, j = word.offsetByCodePoints(j, 1)) { 190 if (wordPos + p >= wordLen 191 || word.codePointAt(j) != currentInfo.mCharacters[p]) { 192 same = false; 193 break; 194 } 195 } 196 197 if (same) { 198 // found the group matches the word. 199 if (wordPos + currentInfo.mCharacters.length == wordLen) { 200 if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) { 201 return FormatSpec.NOT_VALID_WORD; 202 } else { 203 return charGroupPos; 204 } 205 } 206 wordPos += currentInfo.mCharacters.length; 207 if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { 208 return FormatSpec.NOT_VALID_WORD; 209 } 210 foundNextCharGroup = true; 211 buffer.position(currentInfo.mChildrenAddress); 212 break; 213 } 214 groupOffset = currentInfo.mEndAddress; 215 } 216 217 // If we found the next char group, it is under the file pointer. 218 // But if not, we are at the end of this node so we expect to have 219 // a forward link address that we need to consult and possibly resume 220 // search on the next node in the linked list. 221 if (foundNextCharGroup) break; 222 if (!header.mFormatOptions.mSupportsDynamicUpdate) { 223 return FormatSpec.NOT_VALID_WORD; 224 } 225 226 final int forwardLinkAddress = buffer.readUnsignedInt24(); 227 if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { 228 return FormatSpec.NOT_VALID_WORD; 229 } 230 buffer.position(forwardLinkAddress); 231 } while(true); 232 } 233 return FormatSpec.NOT_VALID_WORD; 234 } 235 236 /** 237 * Delete the word from the binary file. 238 * 239 * @param buffer the buffer to write. 240 * @param word the word we delete 241 * @throws IOException 242 * @throws UnsupportedFormatException 243 */ 244 public static void deleteWord(final FusionDictionaryBufferInterface buffer, 245 final String word) throws IOException, UnsupportedFormatException { 246 buffer.position(0); 247 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 248 final int wordPosition = getTerminalPosition(buffer, word); 249 if (wordPosition == FormatSpec.NOT_VALID_WORD) return; 250 251 buffer.position(wordPosition); 252 final int flags = buffer.readUnsignedByte(); 253 final int newFlags = flags ^ FormatSpec.FLAG_IS_TERMINAL; 254 buffer.position(wordPosition); 255 buffer.put((byte)newFlags); 256 } 257 258 /** 259 * @return the size written, in bytes. Always 3 bytes. 260 */ 261 private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer, 262 final int value) { 263 final int absValue = Math.abs(value); 264 buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); 265 buffer.put((byte)((absValue >> 8) & 0xFF)); 266 buffer.put((byte)(absValue & 0xFF)); 267 return 3; 268 } 269 270 /** 271 * @return the size written, in bytes. Always 3 bytes. 272 */ 273 private static int writeSInt24ToStream(final OutputStream destination, final int value) 274 throws IOException { 275 final int absValue = Math.abs(value); 276 destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); 277 destination.write((byte)((absValue >> 8) & 0xFF)); 278 destination.write((byte)(absValue & 0xFF)); 279 return 3; 280 } 281 282 /** 283 * @return the size written, in bytes. 1, 2, or 3 bytes. 284 */ 285 private static int writeVariableAddress(final OutputStream destination, final int value) 286 throws IOException { 287 switch (BinaryDictInputOutput.getByteSize(value)) { 288 case 1: 289 destination.write((byte)value); 290 break; 291 case 2: 292 destination.write((byte)(0xFF & (value >> 8))); 293 destination.write((byte)(0xFF & value)); 294 break; 295 case 3: 296 destination.write((byte)(0xFF & (value >> 16))); 297 destination.write((byte)(0xFF & (value >> 8))); 298 destination.write((byte)(0xFF & value)); 299 break; 300 } 301 return BinaryDictInputOutput.getByteSize(value); 302 } 303 304 /** 305 * Update a parent address in a CharGroup that is addressed by groupOriginAddress. 306 * 307 * @param buffer the buffer to write. 308 * @param groupOriginAddress the address of the group. 309 * @param newParentAddress the absolute address of the parent. 310 * @param formatOptions file format options. 311 */ 312 public static void updateParentAddress(final FusionDictionaryBufferInterface buffer, 313 final int groupOriginAddress, final int newParentAddress, 314 final FormatOptions formatOptions) { 315 final int originalPosition = buffer.position(); 316 buffer.position(groupOriginAddress); 317 if (!formatOptions.mSupportsDynamicUpdate) { 318 throw new RuntimeException("this file format does not support parent addresses"); 319 } 320 final int flags = buffer.readUnsignedByte(); 321 final int parentOffset = newParentAddress - groupOriginAddress; 322 writeSInt24ToBuffer(buffer, parentOffset); 323 buffer.position(originalPosition); 324 } 325 326 private static void skipString(final FusionDictionaryBufferInterface buffer, 327 final boolean hasMultipleChars) { 328 if (hasMultipleChars) { 329 int character = CharEncoding.readChar(buffer); 330 while (character != FormatSpec.INVALID_CHARACTER) { 331 character = CharEncoding.readChar(buffer); 332 } 333 } else { 334 CharEncoding.readChar(buffer); 335 } 336 } 337 338 /** 339 * Write a string to a stream. 340 * 341 * @param destination the stream to write. 342 * @param word the string to be written. 343 * @return the size written, in bytes. 344 * @throws IOException 345 */ 346 private static int writeString(final OutputStream destination, final String word) 347 throws IOException { 348 int size = 0; 349 final int length = word.length(); 350 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 351 final int codePoint = word.codePointAt(i); 352 if (CharEncoding.getCharSize(codePoint) == 1) { 353 destination.write((byte)codePoint); 354 size++; 355 } else { 356 destination.write((byte)(0xFF & (codePoint >> 16))); 357 destination.write((byte)(0xFF & (codePoint >> 8))); 358 destination.write((byte)(0xFF & codePoint)); 359 size += 3; 360 } 361 } 362 destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR); 363 size++; 364 return size; 365 } 366 367 /** 368 * Update a children address in a CharGroup that is addressed by groupOriginAddress. 369 * 370 * @param buffer the buffer to write. 371 * @param groupOriginAddress the address of the group. 372 * @param newChildrenAddress the absolute address of the child. 373 * @param formatOptions file format options. 374 */ 375 public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer, 376 final int groupOriginAddress, final int newChildrenAddress, 377 final FormatOptions formatOptions) { 378 final int originalPosition = buffer.position(); 379 buffer.position(groupOriginAddress); 380 final int flags = buffer.readUnsignedByte(); 381 final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions); 382 skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0); 383 if ((FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte(); 384 final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS 385 ? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position(); 386 writeSInt24ToBuffer(buffer, childrenOffset); 387 buffer.position(originalPosition); 388 } 389 390 /** 391 * Write a char group to an output stream. 392 * A char group is an in-memory representation of a node in trie. 393 * A char group info is an on-disk representation of a node. 394 * 395 * @param destination the stream to write. 396 * @param info the char group info to be written. 397 * @return the size written, in bytes. 398 */ 399 public static int writeCharGroup(final OutputStream destination, final CharGroupInfo info) 400 throws IOException { 401 int size = 1; 402 destination.write((byte)info.mFlags); 403 final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ? 404 FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress; 405 size += writeSInt24ToStream(destination, parentOffset); 406 407 for (int i = 0; i < info.mCharacters.length; ++i) { 408 if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) { 409 destination.write((byte)info.mCharacters[i]); 410 size++; 411 } else { 412 size += writeSInt24ToStream(destination, info.mCharacters[i]); 413 } 414 } 415 if (info.mCharacters.length > 1) { 416 destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR); 417 size++; 418 } 419 420 if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { 421 destination.write((byte)info.mFrequency); 422 size++; 423 } 424 425 final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ? 426 0 : info.mChildrenAddress - info.mOriginalAddress; 427 writeSInt24ToStream(destination, childrenOffset); 428 429 if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) { 430 final int shortcutListSize = 431 BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets); 432 destination.write((byte)(shortcutListSize >> 8)); 433 destination.write((byte)(shortcutListSize & 0xFF)); 434 size += 2; 435 final Iterator<WeightedString> shortcutIterator = info.mShortcutTargets.iterator(); 436 while (shortcutIterator.hasNext()) { 437 final WeightedString target = shortcutIterator.next(); 438 destination.write((byte)BinaryDictInputOutput.makeShortcutFlags( 439 shortcutIterator.hasNext(), target.mFrequency)); 440 size++; 441 size += writeString(destination, target.mWord); 442 } 443 } 444 445 if (info.mBigrams != null) { 446 // TODO: Consolidate this code with the code that computes the size of the bigram list 447 // in BinaryDictionaryInputOutput#computeActualNodeSize 448 for (int i = 0; i < info.mBigrams.size(); ++i) { 449 final int bigramOffset = info.mBigrams.get(i).mAddress - info.mOriginalAddress; 450 final int bigramFrequency = info.mBigrams.get(i).mFrequency; 451 int bigramFlags = (i < info.mBigrams.size() - 1) 452 ? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0; 453 bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0; 454 switch (BinaryDictInputOutput.getByteSize(bigramOffset)) { 455 case 1: 456 bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; 457 break; 458 case 2: 459 bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; 460 break; 461 case 3: 462 bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; 463 break; 464 } 465 bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY; 466 destination.write((byte)bigramFlags); 467 size++; 468 size += writeVariableAddress(destination, Math.abs(bigramOffset)); 469 } 470 } 471 return size; 472 } 473} 474