BinaryDictInputOutput.java revision 8d031a63b4d59d4d8670b4310dd9e18a0e03435a
1/* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17package com.android.inputmethod.latin.makedict; 18 19import com.android.inputmethod.latin.Constants; 20import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; 21import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; 22import com.android.inputmethod.latin.makedict.FusionDictionary.Node; 23import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; 24 25import java.io.ByteArrayOutputStream; 26import java.io.File; 27import java.io.FileInputStream; 28import java.io.FileNotFoundException; 29import java.io.IOException; 30import java.io.OutputStream; 31import java.nio.ByteBuffer; 32import java.nio.channels.FileChannel; 33import java.util.ArrayList; 34import java.util.Arrays; 35import java.util.HashMap; 36import java.util.Iterator; 37import java.util.Map; 38import java.util.Stack; 39import java.util.TreeMap; 40 41/** 42 * Reads and writes XML files for a FusionDictionary. 43 * 44 * All the methods in this class are static. 45 */ 46public class BinaryDictInputOutput { 47 48 private static final boolean DBG = MakedictLog.DBG; 49 50 /* 51 * Array of Node(FusionDictionary.Node) layout is as follows: 52 * 53 * g | 54 * r | the number of groups, 1 or 2 bytes. 55 * o | 1 byte = bbbbbbbb match 56 * u | case 1xxxxxxx => xxxxxxx << 8 + next byte 57 * p | otherwise => bbbbbbbb 58 * c | 59 * ount 60 * 61 * g | 62 * r | sequence of groups, 63 * o | the layout of each group is described below. 64 * u | 65 * ps 66 * 67 */ 68 69 /* Node(CharGroup) layout is as follows: 70 * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE 71 * 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 72 * f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE 73 * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES 74 * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES 75 * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS 76 * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL 77 * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS 78 * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS 79 * | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD 80 * | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED 81 * 82 * p | 83 * a | IF HAS_PARENT_ADDRESS (defined in the file header) 84 * r | parent address, 3byte 85 * e | the address must be negative, so the absolute value of the address is stored. 86 * n | 87 * taddress 88 * 89 * c | IF FLAG_HAS_MULTIPLE_CHARS 90 * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers 91 * a | end 1 byte, = 0 92 * r | ELSE 93 * s | char 1 or 3 bytes 94 * | END 95 * 96 * f | 97 * r | IF FLAG_IS_TERMINAL 98 * e | frequency 1 byte 99 * q | 100 * 101 * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType 102 * h | // nothing 103 * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType 104 * l | children address, 1 byte 105 * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType 106 * r | children address, 2 bytes 107 * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType 108 * n | children address, 3 bytes 109 * A | END 110 * d 111 * dress 112 * 113 * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS 114 * | shortcut string list 115 * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS 116 * | bigrams address list 117 * 118 * Char format is: 119 * 1 byte = bbbbbbbb match 120 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 121 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 122 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 123 * 00011111 would be outside unicode. 124 * else: iso-latin-1 code 125 * This allows for the whole unicode range to be encoded, including chars outside of 126 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 127 * characters which should never happen anyway (and still work, but take 3 bytes). 128 * 129 * bigram address list is: 130 * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT 131 * | addressSign = 1 bit, : FLAG_ATTRIBUTE_OFFSET_NEGATIVE 132 * | 1 = must take -address, 0 = must take +address 133 * | xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE 134 * | addressFormat = 2 bits, 00 = unused : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE 135 * | 01 = 1 byte : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE 136 * | 10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES 137 * | 11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES 138 * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY 139 * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat) 140 * | read 1 byte, add top 4 bits 141 * | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat) 142 * | read 2 bytes, add top 4 bits 143 * | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat 144 * | read 3 bytes, add top 4 bits 145 * | END 146 * | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address 147 * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is 148 * 149 * shortcut string list is: 150 * <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes. 151 * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT 152 * | reserved = 3 bits, must be 0 153 * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY 154 * <shortcut> = | string of characters at the char format described above, with the terminator 155 * | used to signal the end of the string. 156 * if (FLAG_ATTRIBUTE_HAS_NEXT goto flags 157 */ 158 159 private static final int VERSION_1_MAGIC_NUMBER = 0x78B1; 160 public static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; 161 private static final int MINIMUM_SUPPORTED_VERSION = 1; 162 private static final int MAXIMUM_SUPPORTED_VERSION = 3; 163 private static final int NOT_A_VERSION_NUMBER = -1; 164 private static final int FIRST_VERSION_WITH_HEADER_SIZE = 2; 165 private static final int FIRST_VERSION_WITH_PARENT_ADDRESS = 3; 166 167 // These options need to be the same numeric values as the one in the native reading code. 168 private static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; 169 private static final int HAS_PARENT_ADDRESS = 0x2; 170 private static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; 171 private static final int CONTAINS_BIGRAMS_FLAG = 0x8; 172 173 // TODO: Make this value adaptative to content data, store it in the header, and 174 // use it in the reading code. 175 private static final int MAX_WORD_LENGTH = Constants.Dictionary.MAX_WORD_LENGTH; 176 177 private static final int PARENT_ADDRESS_SIZE = 3; 178 179 private static final int MASK_GROUP_ADDRESS_TYPE = 0xC0; 180 private static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; 181 private static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; 182 private static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; 183 private static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; 184 185 private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20; 186 187 private static final int FLAG_IS_TERMINAL = 0x10; 188 private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08; 189 private static final int FLAG_HAS_BIGRAMS = 0x04; 190 private static final int FLAG_IS_NOT_A_WORD = 0x02; 191 private static final int FLAG_IS_BLACKLISTED = 0x01; 192 193 private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; 194 private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; 195 private static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; 196 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; 197 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; 198 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; 199 private static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F; 200 201 private static final int GROUP_CHARACTERS_TERMINATOR = 0x1F; 202 203 private static final int GROUP_TERMINATOR_SIZE = 1; 204 private static final int GROUP_FLAGS_SIZE = 1; 205 private static final int GROUP_FREQUENCY_SIZE = 1; 206 private static final int GROUP_MAX_ADDRESS_SIZE = 3; 207 private static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1; 208 private static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; 209 private static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2; 210 211 private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; 212 private static final int NO_PARENT_ADDRESS = 0; 213 private static final int INVALID_CHARACTER = -1; 214 215 private static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127 216 private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767 217 218 private static final int MAX_TERMINAL_FREQUENCY = 255; 219 private static final int MAX_BIGRAM_FREQUENCY = 15; 220 221 // Arbitrary limit to how much passes we consider address size compression should 222 // terminate in. At the time of this writing, our largest dictionary completes 223 // compression in five passes. 224 // If the number of passes exceeds this number, makedict bails with an exception on 225 // suspicion that a bug might be causing an infinite loop. 226 private static final int MAX_PASSES = 24; 227 228 public interface FusionDictionaryBufferInterface { 229 public int readUnsignedByte(); 230 public int readUnsignedShort(); 231 public int readUnsignedInt24(); 232 public int readInt(); 233 public int position(); 234 public void position(int newPosition); 235 public void put(final byte b); 236 } 237 238 public static final class ByteBufferWrapper implements FusionDictionaryBufferInterface { 239 private ByteBuffer mBuffer; 240 241 public ByteBufferWrapper(final ByteBuffer buffer) { 242 mBuffer = buffer; 243 } 244 245 @Override 246 public int readUnsignedByte() { 247 return ((int)mBuffer.get()) & 0xFF; 248 } 249 250 @Override 251 public int readUnsignedShort() { 252 return ((int)mBuffer.getShort()) & 0xFFFF; 253 } 254 255 @Override 256 public int readUnsignedInt24() { 257 final int retval = readUnsignedByte(); 258 return (retval << 16) + readUnsignedShort(); 259 } 260 261 @Override 262 public int readInt() { 263 return mBuffer.getInt(); 264 } 265 266 @Override 267 public int position() { 268 return mBuffer.position(); 269 } 270 271 @Override 272 public void position(int newPos) { 273 mBuffer.position(newPos); 274 } 275 276 @Override 277 public void put(final byte b) { 278 mBuffer.put(b); 279 } 280 } 281 282 /** 283 * Options about file format. 284 */ 285 public static class FormatOptions { 286 public final int mVersion; 287 public final boolean mHasParentAddress; 288 public FormatOptions(final int version) { 289 this(version, false); 290 } 291 public FormatOptions(final int version, final boolean hasParentAddress) { 292 mVersion = version; 293 if (version < FIRST_VERSION_WITH_PARENT_ADDRESS && hasParentAddress) { 294 throw new RuntimeException("Parent addresses are only supported with versions " 295 + FIRST_VERSION_WITH_PARENT_ADDRESS + " and ulterior."); 296 } 297 mHasParentAddress = hasParentAddress; 298 } 299 } 300 301 /** 302 * Class representing file header. 303 */ 304 private static final class FileHeader { 305 public final int mHeaderSize; 306 public final DictionaryOptions mDictionaryOptions; 307 public final FormatOptions mFormatOptions; 308 public FileHeader(final int headerSize, final DictionaryOptions dictionaryOptions, 309 final FormatOptions formatOptions) { 310 mHeaderSize = headerSize; 311 mDictionaryOptions = dictionaryOptions; 312 mFormatOptions = formatOptions; 313 } 314 } 315 316 /** 317 * A class grouping utility function for our specific character encoding. 318 */ 319 private static class CharEncoding { 320 321 private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; 322 private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; 323 324 /** 325 * Helper method to find out whether this code fits on one byte 326 */ 327 private static boolean fitsOnOneByte(final int character) { 328 return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE 329 && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; 330 } 331 332 /** 333 * Compute the size of a character given its character code. 334 * 335 * Char format is: 336 * 1 byte = bbbbbbbb match 337 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 338 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 339 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 340 * 00011111 would be outside unicode. 341 * else: iso-latin-1 code 342 * This allows for the whole unicode range to be encoded, including chars outside of 343 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 344 * characters which should never happen anyway (and still work, but take 3 bytes). 345 * 346 * @param character the character code. 347 * @return the size in binary encoded-form, either 1 or 3 bytes. 348 */ 349 private static int getCharSize(final int character) { 350 // See char encoding in FusionDictionary.java 351 if (fitsOnOneByte(character)) return 1; 352 if (INVALID_CHARACTER == character) return 1; 353 return 3; 354 } 355 356 /** 357 * Compute the byte size of a character array. 358 */ 359 private static int getCharArraySize(final int[] chars) { 360 int size = 0; 361 for (int character : chars) size += getCharSize(character); 362 return size; 363 } 364 365 /** 366 * Writes a char array to a byte buffer. 367 * 368 * @param codePoints the code point array to write. 369 * @param buffer the byte buffer to write to. 370 * @param index the index in buffer to write the character array to. 371 * @return the index after the last character. 372 */ 373 private static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { 374 for (int codePoint : codePoints) { 375 if (1 == getCharSize(codePoint)) { 376 buffer[index++] = (byte)codePoint; 377 } else { 378 buffer[index++] = (byte)(0xFF & (codePoint >> 16)); 379 buffer[index++] = (byte)(0xFF & (codePoint >> 8)); 380 buffer[index++] = (byte)(0xFF & codePoint); 381 } 382 } 383 return index; 384 } 385 386 /** 387 * Writes a string with our character format to a byte buffer. 388 * 389 * This will also write the terminator byte. 390 * 391 * @param buffer the byte buffer to write to. 392 * @param origin the offset to write from. 393 * @param word the string to write. 394 * @return the size written, in bytes. 395 */ 396 private static int writeString(final byte[] buffer, final int origin, 397 final String word) { 398 final int length = word.length(); 399 int index = origin; 400 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 401 final int codePoint = word.codePointAt(i); 402 if (1 == getCharSize(codePoint)) { 403 buffer[index++] = (byte)codePoint; 404 } else { 405 buffer[index++] = (byte)(0xFF & (codePoint >> 16)); 406 buffer[index++] = (byte)(0xFF & (codePoint >> 8)); 407 buffer[index++] = (byte)(0xFF & codePoint); 408 } 409 } 410 buffer[index++] = GROUP_CHARACTERS_TERMINATOR; 411 return index - origin; 412 } 413 414 /** 415 * Writes a string with our character format to a ByteArrayOutputStream. 416 * 417 * This will also write the terminator byte. 418 * 419 * @param buffer the ByteArrayOutputStream to write to. 420 * @param word the string to write. 421 */ 422 private static void writeString(final ByteArrayOutputStream buffer, final String word) { 423 final int length = word.length(); 424 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 425 final int codePoint = word.codePointAt(i); 426 if (1 == getCharSize(codePoint)) { 427 buffer.write((byte) codePoint); 428 } else { 429 buffer.write((byte) (0xFF & (codePoint >> 16))); 430 buffer.write((byte) (0xFF & (codePoint >> 8))); 431 buffer.write((byte) (0xFF & codePoint)); 432 } 433 } 434 buffer.write(GROUP_CHARACTERS_TERMINATOR); 435 } 436 437 /** 438 * Reads a string from a buffer. This is the converse of the above method. 439 */ 440 private static String readString(final FusionDictionaryBufferInterface buffer) { 441 final StringBuilder s = new StringBuilder(); 442 int character = readChar(buffer); 443 while (character != INVALID_CHARACTER) { 444 s.appendCodePoint(character); 445 character = readChar(buffer); 446 } 447 return s.toString(); 448 } 449 450 /** 451 * Reads a character from the buffer. 452 * 453 * This follows the character format documented earlier in this source file. 454 * 455 * @param buffer the buffer, positioned over an encoded character. 456 * @return the character code. 457 */ 458 private static int readChar(final FusionDictionaryBufferInterface buffer) { 459 int character = buffer.readUnsignedByte(); 460 if (!fitsOnOneByte(character)) { 461 if (GROUP_CHARACTERS_TERMINATOR == character) return INVALID_CHARACTER; 462 character <<= 16; 463 character += buffer.readUnsignedShort(); 464 } 465 return character; 466 } 467 } 468 469 /** 470 * Compute the binary size of the character array in a group 471 * 472 * If only one character, this is the size of this character. If many, it's the sum of their 473 * sizes + 1 byte for the terminator. 474 * 475 * @param group the group 476 * @return the size of the char array, including the terminator if any 477 */ 478 private static int getGroupCharactersSize(final CharGroup group) { 479 int size = CharEncoding.getCharArraySize(group.mChars); 480 if (group.hasSeveralChars()) size += GROUP_TERMINATOR_SIZE; 481 return size; 482 } 483 484 /** 485 * Compute the binary size of the group count 486 * @param count the group count 487 * @return the size of the group count, either 1 or 2 bytes. 488 */ 489 private static int getGroupCountSize(final int count) { 490 if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) { 491 return 1; 492 } else if (MAX_CHARGROUPS_IN_A_NODE >= count) { 493 return 2; 494 } else { 495 throw new RuntimeException("Can't have more than " + MAX_CHARGROUPS_IN_A_NODE 496 + " groups in a node (found " + count + ")"); 497 } 498 } 499 500 /** 501 * Compute the binary size of the group count for a node 502 * @param node the node 503 * @return the size of the group count, either 1 or 2 bytes. 504 */ 505 private static int getGroupCountSize(final Node node) { 506 return getGroupCountSize(node.mData.size()); 507 } 508 509 /** 510 * Compute the size of a shortcut in bytes. 511 */ 512 private static int getShortcutSize(final WeightedString shortcut) { 513 int size = GROUP_ATTRIBUTE_FLAGS_SIZE; 514 final String word = shortcut.mWord; 515 final int length = word.length(); 516 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 517 final int codePoint = word.codePointAt(i); 518 size += CharEncoding.getCharSize(codePoint); 519 } 520 size += GROUP_TERMINATOR_SIZE; 521 return size; 522 } 523 524 /** 525 * Compute the size of a shortcut list in bytes. 526 * 527 * This is known in advance and does not change according to position in the file 528 * like address lists do. 529 */ 530 private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { 531 if (null == shortcutList) return 0; 532 int size = GROUP_SHORTCUT_LIST_SIZE_SIZE; 533 for (final WeightedString shortcut : shortcutList) { 534 size += getShortcutSize(shortcut); 535 } 536 return size; 537 } 538 539 /** 540 * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything. 541 * 542 * @param group the CharGroup to compute the size of. 543 * @param options file format options. 544 * @return the maximum size of the group. 545 */ 546 private static int getCharGroupMaximumSize(final CharGroup group, final FormatOptions options) { 547 int size = getGroupHeaderSize(group, options); 548 // If terminal, one byte for the frequency 549 if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE; 550 size += GROUP_MAX_ADDRESS_SIZE; // For children address 551 size += getShortcutListSize(group.mShortcutTargets); 552 if (null != group.mBigrams) { 553 size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) 554 * group.mBigrams.size(); 555 } 556 return size; 557 } 558 559 /** 560 * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches 561 * it in the 'actualSize' member of the node. 562 * 563 * @param node the node to compute the maximum size of. 564 * @param options file format options. 565 */ 566 private static void setNodeMaximumSize(final Node node, final FormatOptions options) { 567 int size = getGroupCountSize(node); 568 for (CharGroup g : node.mData) { 569 final int groupSize = getCharGroupMaximumSize(g, options); 570 g.mCachedSize = groupSize; 571 size += groupSize; 572 } 573 node.mCachedSize = size; 574 } 575 576 /** 577 * Helper method to hide the actual value of the no children address. 578 */ 579 private static boolean hasChildrenAddress(final int address) { 580 return NO_CHILDREN_ADDRESS != address; 581 } 582 583 /** 584 * Helper method to check whether the CharGroup has a parent address. 585 */ 586 private static boolean hasParentAddress(final FormatOptions options) { 587 return options.mVersion >= FIRST_VERSION_WITH_PARENT_ADDRESS 588 && options.mHasParentAddress; 589 } 590 591 /** 592 * Compute the size of the header (flag + [parent address] + characters size) of a CharGroup. 593 * 594 * @param group the group of which to compute the size of the header 595 * @param options file format options. 596 */ 597 private static int getGroupHeaderSize(final CharGroup group, final FormatOptions options) { 598 if (hasParentAddress(options)) { 599 return GROUP_FLAGS_SIZE + PARENT_ADDRESS_SIZE + getGroupCharactersSize(group); 600 } else { 601 return GROUP_FLAGS_SIZE + getGroupCharactersSize(group); 602 } 603 } 604 605 /** 606 * Compute the size, in bytes, that an address will occupy. 607 * 608 * This can be used either for children addresses (which are always positive) or for 609 * attribute, which may be positive or negative but 610 * store their sign bit separately. 611 * 612 * @param address the address 613 * @return the byte size. 614 */ 615 private static int getByteSize(final int address) { 616 assert(address < 0x1000000); 617 if (!hasChildrenAddress(address)) { 618 return 0; 619 } else if (Math.abs(address) < 0x100) { 620 return 1; 621 } else if (Math.abs(address) < 0x10000) { 622 return 2; 623 } else { 624 return 3; 625 } 626 } 627 // End utility methods. 628 629 // This method is responsible for finding a nice ordering of the nodes that favors run-time 630 // cache performance and dictionary size. 631 /* package for tests */ static ArrayList<Node> flattenTree(final Node root) { 632 final int treeSize = FusionDictionary.countCharGroups(root); 633 MakedictLog.i("Counted nodes : " + treeSize); 634 final ArrayList<Node> flatTree = new ArrayList<Node>(treeSize); 635 return flattenTreeInner(flatTree, root); 636 } 637 638 private static ArrayList<Node> flattenTreeInner(final ArrayList<Node> list, final Node node) { 639 // Removing the node is necessary if the tails are merged, because we would then 640 // add the same node several times when we only want it once. A number of places in 641 // the code also depends on any node being only once in the list. 642 // Merging tails can only be done if there are no attributes. Searching for attributes 643 // in LatinIME code depends on a total breadth-first ordering, which merging tails 644 // breaks. If there are no attributes, it should be fine (and reduce the file size) 645 // to merge tails, and removing the node from the list would be necessary. However, 646 // we don't merge tails because breaking the breadth-first ordering would result in 647 // extreme overhead at bigram lookup time (it would make the search function O(n) instead 648 // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty 649 // high). 650 // If no nodes are ever merged, we can't have the same node twice in the list, hence 651 // searching for duplicates in unnecessary. It is also very performance consuming, 652 // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making 653 // this simple list.remove operation O(n*n) overall. On Android this overhead is very 654 // high. 655 // For future reference, the code to remove duplicate is a simple : list.remove(node); 656 list.add(node); 657 final ArrayList<CharGroup> branches = node.mData; 658 final int nodeSize = branches.size(); 659 for (CharGroup group : branches) { 660 if (null != group.mChildren) flattenTreeInner(list, group.mChildren); 661 } 662 return list; 663 } 664 665 /** 666 * Finds the absolute address of a word in the dictionary. 667 * 668 * @param dict the dictionary in which to search. 669 * @param word the word we are searching for. 670 * @return the word address. If it is not found, an exception is thrown. 671 */ 672 private static int findAddressOfWord(final FusionDictionary dict, final String word) { 673 return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress; 674 } 675 676 /** 677 * Computes the actual node size, based on the cached addresses of the children nodes. 678 * 679 * Each node stores its tentative address. During dictionary address computing, these 680 * are not final, but they can be used to compute the node size (the node size depends 681 * on the address of the children because the number of bytes necessary to store an 682 * address depends on its numeric value. The return value indicates whether the node 683 * contents (as in, any of the addresses stored in the cache fields) have changed with 684 * respect to their previous value. 685 * 686 * @param node the node to compute the size of. 687 * @param dict the dictionary in which the word/attributes are to be found. 688 * @param formatOptions file format options. 689 * @return false if none of the cached addresses inside the node changed, true otherwise. 690 */ 691 private static boolean computeActualNodeSize(final Node node, final FusionDictionary dict, 692 final FormatOptions formatOptions) { 693 boolean changed = false; 694 int size = getGroupCountSize(node); 695 for (CharGroup group : node.mData) { 696 if (group.mCachedAddress != node.mCachedAddress + size) { 697 changed = true; 698 group.mCachedAddress = node.mCachedAddress + size; 699 } 700 int groupSize = getGroupHeaderSize(group, formatOptions); 701 if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE; 702 if (null != group.mChildren) { 703 final int offsetBasePoint = groupSize + node.mCachedAddress + size; 704 final int offset = group.mChildren.mCachedAddress - offsetBasePoint; 705 // assign my address to children's parent address 706 group.mChildren.mCachedParentAddress = group.mCachedAddress 707 - group.mChildren.mCachedAddress; 708 groupSize += getByteSize(offset); 709 } 710 groupSize += getShortcutListSize(group.mShortcutTargets); 711 if (null != group.mBigrams) { 712 for (WeightedString bigram : group.mBigrams) { 713 final int offsetBasePoint = groupSize + node.mCachedAddress + size 714 + GROUP_FLAGS_SIZE; 715 final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); 716 final int offset = addressOfBigram - offsetBasePoint; 717 groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE; 718 } 719 } 720 group.mCachedSize = groupSize; 721 size += groupSize; 722 } 723 if (node.mCachedSize != size) { 724 node.mCachedSize = size; 725 changed = true; 726 } 727 return changed; 728 } 729 730 /** 731 * Computes the byte size of a list of nodes and updates each node cached position. 732 * 733 * @param flatNodes the array of nodes. 734 * @return the byte size of the entire stack. 735 */ 736 private static int stackNodes(final ArrayList<Node> flatNodes) { 737 int nodeOffset = 0; 738 for (Node n : flatNodes) { 739 n.mCachedAddress = nodeOffset; 740 int groupCountSize = getGroupCountSize(n); 741 int groupOffset = 0; 742 for (CharGroup g : n.mData) { 743 g.mCachedAddress = groupCountSize + nodeOffset + groupOffset; 744 groupOffset += g.mCachedSize; 745 } 746 if (groupOffset + groupCountSize != n.mCachedSize) { 747 throw new RuntimeException("Bug : Stored and computed node size differ"); 748 } 749 nodeOffset += n.mCachedSize; 750 } 751 return nodeOffset; 752 } 753 754 /** 755 * Compute the addresses and sizes of an ordered node array. 756 * 757 * This method takes a node array and will update its cached address and size values 758 * so that they can be written into a file. It determines the smallest size each of the 759 * nodes can be given the addresses of its children and attributes, and store that into 760 * each node. 761 * The order of the node is given by the order of the array. This method makes no effort 762 * to find a good order; it only mechanically computes the size this order results in. 763 * 764 * @param dict the dictionary 765 * @param flatNodes the ordered array of nodes 766 * @param formatOptions file format options. 767 * @return the same array it was passed. The nodes have been updated for address and size. 768 */ 769 private static ArrayList<Node> computeAddresses(final FusionDictionary dict, 770 final ArrayList<Node> flatNodes, final FormatOptions formatOptions) { 771 // First get the worst sizes and offsets 772 for (Node n : flatNodes) setNodeMaximumSize(n, formatOptions); 773 final int offset = stackNodes(flatNodes); 774 775 MakedictLog.i("Compressing the array addresses. Original size : " + offset); 776 MakedictLog.i("(Recursively seen size : " + offset + ")"); 777 778 int passes = 0; 779 boolean changesDone = false; 780 do { 781 changesDone = false; 782 for (Node n : flatNodes) { 783 final int oldNodeSize = n.mCachedSize; 784 final boolean changed = computeActualNodeSize(n, dict, formatOptions); 785 final int newNodeSize = n.mCachedSize; 786 if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!"); 787 changesDone |= changed; 788 } 789 stackNodes(flatNodes); 790 ++passes; 791 if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug"); 792 } while (changesDone); 793 794 final Node lastNode = flatNodes.get(flatNodes.size() - 1); 795 MakedictLog.i("Compression complete in " + passes + " passes."); 796 MakedictLog.i("After address compression : " 797 + (lastNode.mCachedAddress + lastNode.mCachedSize)); 798 799 return flatNodes; 800 } 801 802 /** 803 * Sanity-checking method. 804 * 805 * This method checks an array of node for juxtaposition, that is, it will do 806 * nothing if each node's cached address is actually the previous node's address 807 * plus the previous node's size. 808 * If this is not the case, it will throw an exception. 809 * 810 * @param array the array node to check 811 */ 812 private static void checkFlatNodeArray(final ArrayList<Node> array) { 813 int offset = 0; 814 int index = 0; 815 for (Node n : array) { 816 if (n.mCachedAddress != offset) { 817 throw new RuntimeException("Wrong address for node " + index 818 + " : expected " + offset + ", got " + n.mCachedAddress); 819 } 820 ++index; 821 offset += n.mCachedSize; 822 } 823 } 824 825 /** 826 * Helper method to write a variable-size address to a file. 827 * 828 * @param buffer the buffer to write to. 829 * @param index the index in the buffer to write the address to. 830 * @param address the address to write. 831 * @return the size in bytes the address actually took. 832 */ 833 private static int writeVariableAddress(final byte[] buffer, int index, final int address) { 834 switch (getByteSize(address)) { 835 case 1: 836 buffer[index++] = (byte)address; 837 return 1; 838 case 2: 839 buffer[index++] = (byte)(0xFF & (address >> 8)); 840 buffer[index++] = (byte)(0xFF & address); 841 return 2; 842 case 3: 843 buffer[index++] = (byte)(0xFF & (address >> 16)); 844 buffer[index++] = (byte)(0xFF & (address >> 8)); 845 buffer[index++] = (byte)(0xFF & address); 846 return 3; 847 case 0: 848 return 0; 849 default: 850 throw new RuntimeException("Address " + address + " has a strange size"); 851 } 852 } 853 854 private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, 855 final int childrenOffset) { 856 byte flags = 0; 857 if (group.mChars.length > 1) flags |= FLAG_HAS_MULTIPLE_CHARS; 858 if (group.mFrequency >= 0) { 859 flags |= FLAG_IS_TERMINAL; 860 } 861 if (null != group.mChildren) { 862 switch (getByteSize(childrenOffset)) { 863 case 1: 864 flags |= FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; 865 break; 866 case 2: 867 flags |= FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; 868 break; 869 case 3: 870 flags |= FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; 871 break; 872 default: 873 throw new RuntimeException("Node with a strange address"); 874 } 875 } 876 if (null != group.mShortcutTargets) { 877 if (DBG && 0 == group.mShortcutTargets.size()) { 878 throw new RuntimeException("0-sized shortcut list must be null"); 879 } 880 flags |= FLAG_HAS_SHORTCUT_TARGETS; 881 } 882 if (null != group.mBigrams) { 883 if (DBG && 0 == group.mBigrams.size()) { 884 throw new RuntimeException("0-sized bigram list must be null"); 885 } 886 flags |= FLAG_HAS_BIGRAMS; 887 } 888 if (group.mIsNotAWord) { 889 flags |= FLAG_IS_NOT_A_WORD; 890 } 891 if (group.mIsBlacklistEntry) { 892 flags |= FLAG_IS_BLACKLISTED; 893 } 894 return flags; 895 } 896 897 /** 898 * Makes the flag value for a bigram. 899 * 900 * @param more whether there are more bigrams after this one. 901 * @param offset the offset of the bigram. 902 * @param bigramFrequency the frequency of the bigram, 0..255. 903 * @param unigramFrequency the unigram frequency of the same word, 0..255. 904 * @param word the second bigram, for debugging purposes 905 * @return the flags 906 */ 907 private static final int makeBigramFlags(final boolean more, final int offset, 908 int bigramFrequency, final int unigramFrequency, final String word) { 909 int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) 910 + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0); 911 switch (getByteSize(offset)) { 912 case 1: 913 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; 914 break; 915 case 2: 916 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; 917 break; 918 case 3: 919 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; 920 break; 921 default: 922 throw new RuntimeException("Strange offset size"); 923 } 924 if (unigramFrequency > bigramFrequency) { 925 MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word 926 + "\". Bigram freq is " + bigramFrequency + ", unigram freq for " 927 + word + " is " + unigramFrequency); 928 bigramFrequency = unigramFrequency; 929 } 930 // We compute the difference between 255 (which means probability = 1) and the 931 // unigram score. We split this into a number of discrete steps. 932 // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15 933 // represents an increase of 16 steps: a value of 15 will be interpreted as the median 934 // value of the 16th step. In all justice, if the bigram frequency is low enough to be 935 // rounded below the first step (which means it is less than half a step higher than the 936 // unigram frequency) then the unigram frequency itself is the best approximation of the 937 // bigram freq that we could possibly supply, hence we should *not* include this bigram 938 // in the file at all. 939 // until this is done, we'll write 0 and slightly overestimate this case. 940 // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step 941 // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to 942 // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the 943 // step size. Then we compute the start of the first step (the one where value 0 starts) 944 // by adding half-a-step to the unigramFrequency. From there, we compute the integer 945 // number of steps to the bigramFrequency. One last thing: we want our steps to include 946 // their lower bound and exclude their higher bound so we need to have the first step 947 // start at exactly 1 unit higher than floor(unigramFreq + half a step). 948 // Note : to reconstruct the score, the dictionary reader will need to divide 949 // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step, 950 // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best 951 // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the 952 // step pointed by the discretized frequency. 953 final float stepSize = 954 (MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5f + MAX_BIGRAM_FREQUENCY); 955 final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f); 956 final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); 957 // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 958 // here. The best approximation would be the unigram freq itself, so we should not 959 // include this bigram in the dictionary. For now, register as 0, and live with the 960 // small over-estimation that we get in this case. TODO: actually remove this bigram 961 // if discretizedFrequency < 0. 962 final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0; 963 bigramFlags += finalBigramFrequency & FLAG_ATTRIBUTE_FREQUENCY; 964 return bigramFlags; 965 } 966 967 /** 968 * Makes the 2-byte value for options flags. 969 */ 970 private static final int makeOptionsValue(final FusionDictionary dictionary, 971 final FormatOptions formatOptions) { 972 final DictionaryOptions options = dictionary.mOptions; 973 final boolean hasBigrams = dictionary.hasBigrams(); 974 return (options.mFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0) 975 + (options.mGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0) 976 + (hasBigrams ? CONTAINS_BIGRAMS_FLAG : 0) 977 + (formatOptions.mHasParentAddress ? HAS_PARENT_ADDRESS : 0); 978 } 979 980 /** 981 * Makes the flag value for a shortcut. 982 * 983 * @param more whether there are more attributes after this one. 984 * @param frequency the frequency of the attribute, 0..15 985 * @return the flags 986 */ 987 private static final int makeShortcutFlags(final boolean more, final int frequency) { 988 return (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (frequency & FLAG_ATTRIBUTE_FREQUENCY); 989 } 990 991 /** 992 * Write a node to memory. The node is expected to have its final position cached. 993 * 994 * This can be an empty map, but the more is inside the faster the lookups will be. It can 995 * be carried on as long as nodes do not move. 996 * 997 * @param dict the dictionary the node is a part of (for relative offsets). 998 * @param buffer the memory buffer to write to. 999 * @param node the node to write. 1000 * @param formatOptions file format options. 1001 * @return the address of the END of the node. 1002 */ 1003 private static int writePlacedNode(final FusionDictionary dict, byte[] buffer, 1004 final Node node, final FormatOptions formatOptions) { 1005 int index = node.mCachedAddress; 1006 1007 final int groupCount = node.mData.size(); 1008 final int countSize = getGroupCountSize(node); 1009 final int parentAddress = node.mCachedParentAddress; 1010 if (1 == countSize) { 1011 buffer[index++] = (byte)groupCount; 1012 } else if (2 == countSize) { 1013 // We need to signal 2-byte size by setting the top bit of the MSB to 1, so 1014 // we | 0x80 to do this. 1015 buffer[index++] = (byte)((groupCount >> 8) | 0x80); 1016 buffer[index++] = (byte)(groupCount & 0xFF); 1017 } else { 1018 throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); 1019 } 1020 int groupAddress = index; 1021 for (int i = 0; i < groupCount; ++i) { 1022 CharGroup group = node.mData.get(i); 1023 if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not " 1024 + "the same as the cached address of the group : " 1025 + index + " <> " + group.mCachedAddress); 1026 groupAddress += getGroupHeaderSize(group, formatOptions); 1027 // Sanity checks. 1028 if (DBG && group.mFrequency > MAX_TERMINAL_FREQUENCY) { 1029 throw new RuntimeException("A node has a frequency > " + MAX_TERMINAL_FREQUENCY 1030 + " : " + group.mFrequency); 1031 } 1032 if (group.mFrequency >= 0) groupAddress += GROUP_FREQUENCY_SIZE; 1033 final int childrenOffset = null == group.mChildren 1034 ? NO_CHILDREN_ADDRESS : group.mChildren.mCachedAddress - groupAddress; 1035 byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset); 1036 buffer[index++] = flags; 1037 1038 if (hasParentAddress(formatOptions)) { 1039 if (parentAddress == NO_PARENT_ADDRESS) { 1040 // this node is the root node. 1041 buffer[index] = buffer[index + 1] = buffer[index + 2] = 0; 1042 } else { 1043 // write parent address. (version 3) 1044 final int actualParentAddress = Math.abs(parentAddress 1045 + (node.mCachedAddress - group.mCachedAddress)); 1046 buffer[index] = (byte)((actualParentAddress >> 16) & 0xFF); 1047 buffer[index + 1] = (byte)((actualParentAddress >> 8) & 0xFF); 1048 buffer[index + 2] = (byte)(actualParentAddress & 0xFF); 1049 } 1050 index += 3; 1051 } 1052 1053 index = CharEncoding.writeCharArray(group.mChars, buffer, index); 1054 if (group.hasSeveralChars()) { 1055 buffer[index++] = GROUP_CHARACTERS_TERMINATOR; 1056 } 1057 if (group.mFrequency >= 0) { 1058 buffer[index++] = (byte) group.mFrequency; 1059 } 1060 final int shift = writeVariableAddress(buffer, index, childrenOffset); 1061 index += shift; 1062 groupAddress += shift; 1063 1064 // Write shortcuts 1065 if (null != group.mShortcutTargets) { 1066 final int indexOfShortcutByteSize = index; 1067 index += GROUP_SHORTCUT_LIST_SIZE_SIZE; 1068 groupAddress += GROUP_SHORTCUT_LIST_SIZE_SIZE; 1069 final Iterator<WeightedString> shortcutIterator = group.mShortcutTargets.iterator(); 1070 while (shortcutIterator.hasNext()) { 1071 final WeightedString target = shortcutIterator.next(); 1072 ++groupAddress; 1073 int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(), 1074 target.mFrequency); 1075 buffer[index++] = (byte)shortcutFlags; 1076 final int shortcutShift = CharEncoding.writeString(buffer, index, target.mWord); 1077 index += shortcutShift; 1078 groupAddress += shortcutShift; 1079 } 1080 final int shortcutByteSize = index - indexOfShortcutByteSize; 1081 if (shortcutByteSize > 0xFFFF) { 1082 throw new RuntimeException("Shortcut list too large"); 1083 } 1084 buffer[indexOfShortcutByteSize] = (byte)(shortcutByteSize >> 8); 1085 buffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF); 1086 } 1087 // Write bigrams 1088 if (null != group.mBigrams) { 1089 final Iterator<WeightedString> bigramIterator = group.mBigrams.iterator(); 1090 while (bigramIterator.hasNext()) { 1091 final WeightedString bigram = bigramIterator.next(); 1092 final CharGroup target = 1093 FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord); 1094 final int addressOfBigram = target.mCachedAddress; 1095 final int unigramFrequencyForThisWord = target.mFrequency; 1096 ++groupAddress; 1097 final int offset = addressOfBigram - groupAddress; 1098 int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset, 1099 bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); 1100 buffer[index++] = (byte)bigramFlags; 1101 final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); 1102 index += bigramShift; 1103 groupAddress += bigramShift; 1104 } 1105 } 1106 1107 } 1108 if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException( 1109 "Not the same size : written " 1110 + (index - node.mCachedAddress) + " bytes out of a node that should have " 1111 + node.mCachedSize + " bytes"); 1112 return index; 1113 } 1114 1115 /** 1116 * Dumps a collection of useful statistics about a node array. 1117 * 1118 * This prints purely informative stuff, like the total estimated file size, the 1119 * number of nodes, of character groups, the repartition of each address size, etc 1120 * 1121 * @param nodes the node array. 1122 */ 1123 private static void showStatistics(ArrayList<Node> nodes) { 1124 int firstTerminalAddress = Integer.MAX_VALUE; 1125 int lastTerminalAddress = Integer.MIN_VALUE; 1126 int size = 0; 1127 int charGroups = 0; 1128 int maxGroups = 0; 1129 int maxRuns = 0; 1130 for (Node n : nodes) { 1131 if (maxGroups < n.mData.size()) maxGroups = n.mData.size(); 1132 for (CharGroup cg : n.mData) { 1133 ++charGroups; 1134 if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length; 1135 if (cg.mFrequency >= 0) { 1136 if (n.mCachedAddress < firstTerminalAddress) 1137 firstTerminalAddress = n.mCachedAddress; 1138 if (n.mCachedAddress > lastTerminalAddress) 1139 lastTerminalAddress = n.mCachedAddress; 1140 } 1141 } 1142 if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize; 1143 } 1144 final int[] groupCounts = new int[maxGroups + 1]; 1145 final int[] runCounts = new int[maxRuns + 1]; 1146 for (Node n : nodes) { 1147 ++groupCounts[n.mData.size()]; 1148 for (CharGroup cg : n.mData) { 1149 ++runCounts[cg.mChars.length]; 1150 } 1151 } 1152 1153 MakedictLog.i("Statistics:\n" 1154 + " total file size " + size + "\n" 1155 + " " + nodes.size() + " nodes\n" 1156 + " " + charGroups + " groups (" + ((float)charGroups / nodes.size()) 1157 + " groups per node)\n" 1158 + " first terminal at " + firstTerminalAddress + "\n" 1159 + " last terminal at " + lastTerminalAddress + "\n" 1160 + " Group stats : max = " + maxGroups); 1161 for (int i = 0; i < groupCounts.length; ++i) { 1162 MakedictLog.i(" " + i + " : " + groupCounts[i]); 1163 } 1164 MakedictLog.i(" Character run stats : max = " + maxRuns); 1165 for (int i = 0; i < runCounts.length; ++i) { 1166 MakedictLog.i(" " + i + " : " + runCounts[i]); 1167 } 1168 } 1169 1170 /** 1171 * Dumps a FusionDictionary to a file. 1172 * 1173 * This is the public entry point to write a dictionary to a file. 1174 * 1175 * @param destination the stream to write the binary data to. 1176 * @param dict the dictionary to write. 1177 * @param formatOptions file format options. 1178 */ 1179 public static void writeDictionaryBinary(final OutputStream destination, 1180 final FusionDictionary dict, final FormatOptions formatOptions) 1181 throws IOException, UnsupportedFormatException { 1182 1183 // Addresses are limited to 3 bytes, but since addresses can be relative to each node, the 1184 // structure itself is not limited to 16MB. However, if it is over 16MB deciding the order 1185 // of the nodes becomes a quite complicated problem, because though the dictionary itself 1186 // does not have a size limit, each node must still be within 16MB of all its children and 1187 // parents. As long as this is ensured, the dictionary file may grow to any size. 1188 1189 final int version = formatOptions.mVersion; 1190 if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) { 1191 throw new UnsupportedFormatException("Requested file format version " + version 1192 + ", but this implementation only supports versions " 1193 + MINIMUM_SUPPORTED_VERSION + " through " + MAXIMUM_SUPPORTED_VERSION); 1194 } 1195 1196 ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256); 1197 1198 // The magic number in big-endian order. 1199 if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { 1200 // Magic number for version 2+. 1201 headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 24))); 1202 headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 16))); 1203 headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 8))); 1204 headerBuffer.write((byte) (0xFF & VERSION_2_MAGIC_NUMBER)); 1205 // Dictionary version. 1206 headerBuffer.write((byte) (0xFF & (version >> 8))); 1207 headerBuffer.write((byte) (0xFF & version)); 1208 } else { 1209 // Magic number for version 1. 1210 headerBuffer.write((byte) (0xFF & (VERSION_1_MAGIC_NUMBER >> 8))); 1211 headerBuffer.write((byte) (0xFF & VERSION_1_MAGIC_NUMBER)); 1212 // Dictionary version. 1213 headerBuffer.write((byte) (0xFF & version)); 1214 } 1215 // Options flags 1216 final int options = makeOptionsValue(dict, formatOptions); 1217 headerBuffer.write((byte) (0xFF & (options >> 8))); 1218 headerBuffer.write((byte) (0xFF & options)); 1219 if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { 1220 final int headerSizeOffset = headerBuffer.size(); 1221 // Placeholder to be written later with header size. 1222 for (int i = 0; i < 4; ++i) { 1223 headerBuffer.write(0); 1224 } 1225 // Write out the options. 1226 for (final String key : dict.mOptions.mAttributes.keySet()) { 1227 final String value = dict.mOptions.mAttributes.get(key); 1228 CharEncoding.writeString(headerBuffer, key); 1229 CharEncoding.writeString(headerBuffer, value); 1230 } 1231 final int size = headerBuffer.size(); 1232 final byte[] bytes = headerBuffer.toByteArray(); 1233 // Write out the header size. 1234 bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24)); 1235 bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16)); 1236 bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8)); 1237 bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0)); 1238 destination.write(bytes); 1239 } else { 1240 headerBuffer.writeTo(destination); 1241 } 1242 1243 headerBuffer.close(); 1244 1245 // Leave the choice of the optimal node order to the flattenTree function. 1246 MakedictLog.i("Flattening the tree..."); 1247 ArrayList<Node> flatNodes = flattenTree(dict.mRoot); 1248 1249 MakedictLog.i("Computing addresses..."); 1250 computeAddresses(dict, flatNodes, formatOptions); 1251 MakedictLog.i("Checking array..."); 1252 if (DBG) checkFlatNodeArray(flatNodes); 1253 1254 // Create a buffer that matches the final dictionary size. 1255 final Node lastNode = flatNodes.get(flatNodes.size() - 1); 1256 final int bufferSize = lastNode.mCachedAddress + lastNode.mCachedSize; 1257 final byte[] buffer = new byte[bufferSize]; 1258 int index = 0; 1259 1260 MakedictLog.i("Writing file..."); 1261 int dataEndOffset = 0; 1262 for (Node n : flatNodes) { 1263 dataEndOffset = writePlacedNode(dict, buffer, n, formatOptions); 1264 } 1265 1266 if (DBG) showStatistics(flatNodes); 1267 1268 destination.write(buffer, 0, dataEndOffset); 1269 1270 destination.close(); 1271 MakedictLog.i("Done"); 1272 } 1273 1274 1275 // Input methods: Read a binary dictionary to memory. 1276 // readDictionaryBinary is the public entry point for them. 1277 1278 private static final int[] CHARACTER_BUFFER = new int[MAX_WORD_LENGTH]; 1279 private static CharGroupInfo readCharGroup(final FusionDictionaryBufferInterface buffer, 1280 final int originalGroupAddress, final FormatOptions options) { 1281 int addressPointer = originalGroupAddress; 1282 final int flags = buffer.readUnsignedByte(); 1283 ++addressPointer; 1284 1285 final int parentAddress; 1286 if (hasParentAddress(options)) { 1287 // read the parent address. (version 3) 1288 parentAddress = -buffer.readUnsignedInt24(); 1289 addressPointer += 3; 1290 } else { 1291 parentAddress = NO_PARENT_ADDRESS; 1292 } 1293 1294 final int characters[]; 1295 if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) { 1296 int index = 0; 1297 int character = CharEncoding.readChar(buffer); 1298 addressPointer += CharEncoding.getCharSize(character); 1299 while (-1 != character) { 1300 // FusionDictionary is making sure that the length of the word is smaller than 1301 // MAX_WORD_LENGTH. 1302 // So we'll never write past the end of CHARACTER_BUFFER. 1303 CHARACTER_BUFFER[index++] = character; 1304 character = CharEncoding.readChar(buffer); 1305 addressPointer += CharEncoding.getCharSize(character); 1306 } 1307 characters = Arrays.copyOfRange(CHARACTER_BUFFER, 0, index); 1308 } else { 1309 final int character = CharEncoding.readChar(buffer); 1310 addressPointer += CharEncoding.getCharSize(character); 1311 characters = new int[] { character }; 1312 } 1313 final int frequency; 1314 if (0 != (FLAG_IS_TERMINAL & flags)) { 1315 ++addressPointer; 1316 frequency = buffer.readUnsignedByte(); 1317 } else { 1318 frequency = CharGroup.NOT_A_TERMINAL; 1319 } 1320 int childrenAddress = addressPointer; 1321 switch (flags & MASK_GROUP_ADDRESS_TYPE) { 1322 case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: 1323 childrenAddress += buffer.readUnsignedByte(); 1324 addressPointer += 1; 1325 break; 1326 case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: 1327 childrenAddress += buffer.readUnsignedShort(); 1328 addressPointer += 2; 1329 break; 1330 case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: 1331 childrenAddress += buffer.readUnsignedInt24(); 1332 addressPointer += 3; 1333 break; 1334 case FLAG_GROUP_ADDRESS_TYPE_NOADDRESS: 1335 default: 1336 childrenAddress = NO_CHILDREN_ADDRESS; 1337 break; 1338 } 1339 ArrayList<WeightedString> shortcutTargets = null; 1340 if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) { 1341 final int pointerBefore = buffer.position(); 1342 shortcutTargets = new ArrayList<WeightedString>(); 1343 buffer.readUnsignedShort(); // Skip the size 1344 while (true) { 1345 final int targetFlags = buffer.readUnsignedByte(); 1346 final String word = CharEncoding.readString(buffer); 1347 shortcutTargets.add(new WeightedString(word, 1348 targetFlags & FLAG_ATTRIBUTE_FREQUENCY)); 1349 if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; 1350 } 1351 addressPointer += buffer.position() - pointerBefore; 1352 } 1353 ArrayList<PendingAttribute> bigrams = null; 1354 if (0 != (flags & FLAG_HAS_BIGRAMS)) { 1355 bigrams = new ArrayList<PendingAttribute>(); 1356 while (true) { 1357 final int bigramFlags = buffer.readUnsignedByte(); 1358 ++addressPointer; 1359 final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1; 1360 int bigramAddress = addressPointer; 1361 switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { 1362 case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: 1363 bigramAddress += sign * buffer.readUnsignedByte(); 1364 addressPointer += 1; 1365 break; 1366 case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: 1367 bigramAddress += sign * buffer.readUnsignedShort(); 1368 addressPointer += 2; 1369 break; 1370 case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: 1371 final int offset = (buffer.readUnsignedByte() << 16) 1372 + buffer.readUnsignedShort(); 1373 bigramAddress += sign * offset; 1374 addressPointer += 3; 1375 break; 1376 default: 1377 throw new RuntimeException("Has bigrams with no address"); 1378 } 1379 bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY, 1380 bigramAddress)); 1381 if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; 1382 } 1383 } 1384 return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, 1385 parentAddress, childrenAddress, shortcutTargets, bigrams); 1386 } 1387 1388 /** 1389 * Reads and returns the char group count out of a buffer and forwards the pointer. 1390 */ 1391 private static int readCharGroupCount(final FusionDictionaryBufferInterface buffer) { 1392 final int msb = buffer.readUnsignedByte(); 1393 if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) { 1394 return msb; 1395 } else { 1396 return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8) 1397 + buffer.readUnsignedByte(); 1398 } 1399 } 1400 1401 // The word cache here is a stopgap bandaid to help the catastrophic performance 1402 // of this method. Since it performs direct, unbuffered random access to the file and 1403 // may be called hundreds of thousands of times, the resulting performance is not 1404 // reasonable without some kind of cache. Thus: 1405 private static TreeMap<Integer, String> wordCache = new TreeMap<Integer, String>(); 1406 /** 1407 * Finds, as a string, the word at the address passed as an argument. 1408 * 1409 * @param buffer the buffer to read from. 1410 * @param headerSize the size of the header. 1411 * @param address the address to seek. 1412 * @param formatOptions file format options. 1413 * @return the word, as a string. 1414 */ 1415 private static String getWordAtAddress(final FusionDictionaryBufferInterface buffer, 1416 final int headerSize, final int address, final FormatOptions formatOptions) { 1417 final String cachedString = wordCache.get(address); 1418 if (null != cachedString) return cachedString; 1419 1420 final String result; 1421 final int originalPointer = buffer.position(); 1422 1423 if (hasParentAddress(formatOptions)) { 1424 result = getWordAtAddressWithParentAddress(buffer, headerSize, address, formatOptions); 1425 } else { 1426 result = getWordAtAddressWithoutParentAddress(buffer, headerSize, address, 1427 formatOptions); 1428 } 1429 1430 wordCache.put(address, result); 1431 buffer.position(originalPointer); 1432 return result; 1433 } 1434 1435 private static int[] sGetWordBuffer = new int[MAX_WORD_LENGTH]; 1436 private static String getWordAtAddressWithParentAddress( 1437 final FusionDictionaryBufferInterface buffer, final int headerSize, final int address, 1438 final FormatOptions options) { 1439 final StringBuilder builder = new StringBuilder(); 1440 1441 int currentAddress = address; 1442 int index = MAX_WORD_LENGTH - 1; 1443 // the length of the path from the root to the leaf is limited by MAX_WORD_LENGTH 1444 for (int count = 0; count < MAX_WORD_LENGTH; ++count) { 1445 buffer.position(currentAddress + headerSize); 1446 final CharGroupInfo currentInfo = readCharGroup(buffer, currentAddress, options); 1447 for (int i = 0; i < currentInfo.mCharacters.length; ++i) { 1448 sGetWordBuffer[index--] = 1449 currentInfo.mCharacters[currentInfo.mCharacters.length - i - 1]; 1450 } 1451 1452 if (currentInfo.mParentAddress == NO_PARENT_ADDRESS) break; 1453 currentAddress = currentInfo.mParentAddress + currentInfo.mOriginalAddress; 1454 } 1455 1456 return new String(sGetWordBuffer, index + 1, MAX_WORD_LENGTH - index - 1); 1457 } 1458 1459 private static String getWordAtAddressWithoutParentAddress( 1460 final FusionDictionaryBufferInterface buffer, final int headerSize, final int address, 1461 final FormatOptions options) { 1462 buffer.position(headerSize); 1463 final int count = readCharGroupCount(buffer); 1464 int groupOffset = getGroupCountSize(count); 1465 final StringBuilder builder = new StringBuilder(); 1466 String result = null; 1467 1468 CharGroupInfo last = null; 1469 for (int i = count - 1; i >= 0; --i) { 1470 CharGroupInfo info = readCharGroup(buffer, groupOffset, options); 1471 groupOffset = info.mEndAddress; 1472 if (info.mOriginalAddress == address) { 1473 builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); 1474 result = builder.toString(); 1475 break; // and return 1476 } 1477 if (hasChildrenAddress(info.mChildrenAddress)) { 1478 if (info.mChildrenAddress > address) { 1479 if (null == last) continue; 1480 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 1481 buffer.position(last.mChildrenAddress + headerSize); 1482 groupOffset = last.mChildrenAddress + 1; 1483 i = buffer.readUnsignedByte(); 1484 last = null; 1485 continue; 1486 } 1487 last = info; 1488 } 1489 if (0 == i && hasChildrenAddress(last.mChildrenAddress)) { 1490 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 1491 buffer.position(last.mChildrenAddress + headerSize); 1492 groupOffset = last.mChildrenAddress + 1; 1493 i = buffer.readUnsignedByte(); 1494 last = null; 1495 continue; 1496 } 1497 } 1498 return result; 1499 } 1500 1501 /** 1502 * Reads a single node from a buffer. 1503 * 1504 * This methods reads the file at the current position. A node is fully expected to start at 1505 * the current position. 1506 * This will recursively read other nodes into the structure, populating the reverse 1507 * maps on the fly and using them to keep track of already read nodes. 1508 * 1509 * @param buffer the buffer, correctly positioned at the start of a node. 1510 * @param headerSize the size, in bytes, of the file header. 1511 * @param reverseNodeMap a mapping from addresses to already read nodes. 1512 * @param reverseGroupMap a mapping from addresses to already read character groups. 1513 * @param options file format options. 1514 * @return the read node with all his children already read. 1515 */ 1516 private static Node readNode(final FusionDictionaryBufferInterface buffer, final int headerSize, 1517 final Map<Integer, Node> reverseNodeMap, final Map<Integer, CharGroup> reverseGroupMap, 1518 final FormatOptions options) 1519 throws IOException { 1520 final int nodeOrigin = buffer.position() - headerSize; 1521 final int count = readCharGroupCount(buffer); 1522 final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>(); 1523 int groupOffset = nodeOrigin + getGroupCountSize(count); 1524 for (int i = count; i > 0; --i) { 1525 CharGroupInfo info = readCharGroup(buffer, groupOffset, options); 1526 ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets; 1527 ArrayList<WeightedString> bigrams = null; 1528 if (null != info.mBigrams) { 1529 bigrams = new ArrayList<WeightedString>(); 1530 for (PendingAttribute bigram : info.mBigrams) { 1531 final String word = getWordAtAddress( 1532 buffer, headerSize, bigram.mAddress, options); 1533 bigrams.add(new WeightedString(word, bigram.mFrequency)); 1534 } 1535 } 1536 if (hasChildrenAddress(info.mChildrenAddress)) { 1537 Node children = reverseNodeMap.get(info.mChildrenAddress); 1538 if (null == children) { 1539 final int currentPosition = buffer.position(); 1540 buffer.position(info.mChildrenAddress + headerSize); 1541 children = readNode( 1542 buffer, headerSize, reverseNodeMap, reverseGroupMap, options); 1543 buffer.position(currentPosition); 1544 } 1545 nodeContents.add( 1546 new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, 1547 0 != (info.mFlags & FLAG_IS_NOT_A_WORD), 1548 0 != (info.mFlags & FLAG_IS_BLACKLISTED), children)); 1549 } else { 1550 nodeContents.add( 1551 new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, 1552 0 != (info.mFlags & FLAG_IS_NOT_A_WORD), 1553 0 != (info.mFlags & FLAG_IS_BLACKLISTED))); 1554 } 1555 groupOffset = info.mEndAddress; 1556 } 1557 final Node node = new Node(nodeContents); 1558 node.mCachedAddress = nodeOrigin; 1559 reverseNodeMap.put(node.mCachedAddress, node); 1560 return node; 1561 } 1562 1563 // TODO: move these methods (readUnigramsAndBigramsBinary(|Inner)) and an inner class (Position) 1564 // out of this class. 1565 private static class Position { 1566 public static final int NOT_READ_GROUPCOUNT = -1; 1567 1568 public int mAddress; 1569 public int mNumOfCharGroup; 1570 public int mPosition; 1571 public int mLength; 1572 1573 public Position(int address, int length) { 1574 mAddress = address; 1575 mLength = length; 1576 mNumOfCharGroup = NOT_READ_GROUPCOUNT; 1577 } 1578 } 1579 1580 /** 1581 * Tours all node without recursive call. 1582 */ 1583 private static void readUnigramsAndBigramsBinaryInner( 1584 final FusionDictionaryBufferInterface buffer, final int headerSize, 1585 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 1586 final Map<Integer, ArrayList<PendingAttribute>> bigrams, 1587 final FormatOptions formatOptions) { 1588 int[] pushedChars = new int[MAX_WORD_LENGTH + 1]; 1589 1590 Stack<Position> stack = new Stack<Position>(); 1591 int index = 0; 1592 1593 Position initPos = new Position(headerSize, 0); 1594 stack.push(initPos); 1595 1596 while (!stack.empty()) { 1597 Position p = stack.peek(); 1598 1599 if (DBG) { 1600 MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" + 1601 p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength); 1602 } 1603 1604 if (buffer.position() != p.mAddress) buffer.position(p.mAddress); 1605 if (index != p.mLength) index = p.mLength; 1606 1607 if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) { 1608 p.mNumOfCharGroup = readCharGroupCount(buffer); 1609 p.mAddress += getGroupCountSize(p.mNumOfCharGroup); 1610 p.mPosition = 0; 1611 } 1612 1613 CharGroupInfo info = readCharGroup(buffer, p.mAddress - headerSize, formatOptions); 1614 for (int i = 0; i < info.mCharacters.length; ++i) { 1615 pushedChars[index++] = info.mCharacters[i]; 1616 } 1617 p.mPosition++; 1618 1619 if (info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) { // found word 1620 words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); 1621 frequencies.put(info.mOriginalAddress, info.mFrequency); 1622 if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams); 1623 } 1624 1625 if (p.mPosition == p.mNumOfCharGroup) { 1626 stack.pop(); 1627 } else { 1628 // the node has more groups. 1629 p.mAddress = buffer.position(); 1630 } 1631 1632 if (hasChildrenAddress(info.mChildrenAddress)) { 1633 Position childrenPos = new Position(info.mChildrenAddress + headerSize, index); 1634 stack.push(childrenPos); 1635 } 1636 } 1637 } 1638 1639 /** 1640 * Reads unigrams and bigrams from the binary file. 1641 * Doesn't make the memory representation of the dictionary. 1642 * 1643 * @param buffer the buffer to read. 1644 * @param words the map to store the address as a key and the word as a value. 1645 * @param frequencies the map to store the address as a key and the frequency as a value. 1646 * @param bigrams the map to store the address as a key and the list of address as a value. 1647 * @throws IOException 1648 * @throws UnsupportedFormatException 1649 */ 1650 public static void readUnigramsAndBigramsBinary(final FusionDictionaryBufferInterface buffer, 1651 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 1652 final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, 1653 UnsupportedFormatException { 1654 // Read header 1655 final FileHeader header = readHeader(buffer); 1656 readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams, 1657 header.mFormatOptions); 1658 } 1659 1660 /** 1661 * Helper function to get the binary format version from the header. 1662 * @throws IOException 1663 */ 1664 private static int getFormatVersion(final FusionDictionaryBufferInterface buffer) 1665 throws IOException { 1666 final int magic_v1 = buffer.readUnsignedShort(); 1667 if (VERSION_1_MAGIC_NUMBER == magic_v1) return buffer.readUnsignedByte(); 1668 final int magic_v2 = (magic_v1 << 16) + buffer.readUnsignedShort(); 1669 if (VERSION_2_MAGIC_NUMBER == magic_v2) return buffer.readUnsignedShort(); 1670 return NOT_A_VERSION_NUMBER; 1671 } 1672 1673 /** 1674 * Helper function to get and validate the binary format version. 1675 * @throws UnsupportedFormatException 1676 * @throws IOException 1677 */ 1678 private static int checkFormatVersion(final FusionDictionaryBufferInterface buffer) 1679 throws IOException, UnsupportedFormatException { 1680 final int version = getFormatVersion(buffer); 1681 if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) { 1682 throw new UnsupportedFormatException("This file has version " + version 1683 + ", but this implementation does not support versions above " 1684 + MAXIMUM_SUPPORTED_VERSION); 1685 } 1686 return version; 1687 } 1688 1689 /** 1690 * Reads a header from a buffer. 1691 * @param buffer the buffer to read. 1692 * @throws IOException 1693 * @throws UnsupportedFormatException 1694 */ 1695 private static FileHeader readHeader(final FusionDictionaryBufferInterface buffer) 1696 throws IOException, UnsupportedFormatException { 1697 final int version = checkFormatVersion(buffer); 1698 final int optionsFlags = buffer.readUnsignedShort(); 1699 1700 final HashMap<String, String> attributes = new HashMap<String, String>(); 1701 final int headerSize; 1702 if (version < FIRST_VERSION_WITH_HEADER_SIZE) { 1703 headerSize = buffer.position(); 1704 } else { 1705 headerSize = buffer.readInt(); 1706 populateOptions(buffer, headerSize, attributes); 1707 buffer.position(headerSize); 1708 } 1709 1710 if (headerSize < 0) { 1711 throw new UnsupportedFormatException("header size can't be negative."); 1712 } 1713 1714 final FileHeader header = new FileHeader(headerSize, 1715 new FusionDictionary.DictionaryOptions(attributes, 1716 0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG), 1717 0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)), 1718 new FormatOptions(version, 1719 0 != (optionsFlags & HAS_PARENT_ADDRESS))); 1720 return header; 1721 } 1722 1723 /** 1724 * Reads options from a buffer and populate a map with their contents. 1725 * 1726 * The buffer is read at the current position, so the caller must take care the pointer 1727 * is in the right place before calling this. 1728 */ 1729 public static void populateOptions(final FusionDictionaryBufferInterface buffer, 1730 final int headerSize, final HashMap<String, String> options) { 1731 while (buffer.position() < headerSize) { 1732 final String key = CharEncoding.readString(buffer); 1733 final String value = CharEncoding.readString(buffer); 1734 options.put(key, value); 1735 } 1736 } 1737 // TODO: remove this method. 1738 public static void populateOptions(final ByteBuffer buffer, final int headerSize, 1739 final HashMap<String, String> options) { 1740 populateOptions(new ByteBufferWrapper(buffer), headerSize, options); 1741 } 1742 1743 /** 1744 * Reads a buffer and returns the memory representation of the dictionary. 1745 * 1746 * This high-level method takes a buffer and reads its contents, populating a 1747 * FusionDictionary structure. The optional dict argument is an existing dictionary to 1748 * which words from the buffer should be added. If it is null, a new dictionary is created. 1749 * 1750 * @param buffer the buffer to read. 1751 * @param dict an optional dictionary to add words to, or null. 1752 * @return the created (or merged) dictionary. 1753 */ 1754 public static FusionDictionary readDictionaryBinary( 1755 final FusionDictionaryBufferInterface buffer, final FusionDictionary dict) 1756 throws IOException, UnsupportedFormatException { 1757 // clear cache 1758 wordCache.clear(); 1759 1760 // Read header 1761 final FileHeader header = readHeader(buffer); 1762 1763 Map<Integer, Node> reverseNodeMapping = new TreeMap<Integer, Node>(); 1764 Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>(); 1765 final Node root = readNode(buffer, header.mHeaderSize, reverseNodeMapping, 1766 reverseGroupMapping, header.mFormatOptions); 1767 1768 FusionDictionary newDict = new FusionDictionary(root, header.mDictionaryOptions); 1769 if (null != dict) { 1770 for (final Word w : dict) { 1771 if (w.mIsBlacklistEntry) { 1772 newDict.addBlacklistEntry(w.mWord, w.mShortcutTargets, w.mIsNotAWord); 1773 } else { 1774 newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mIsNotAWord); 1775 } 1776 } 1777 for (final Word w : dict) { 1778 // By construction a binary dictionary may not have bigrams pointing to 1779 // words that are not also registered as unigrams so we don't have to avoid 1780 // them explicitly here. 1781 for (final WeightedString bigram : w.mBigrams) { 1782 newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency); 1783 } 1784 } 1785 } 1786 1787 return newDict; 1788 } 1789 1790 // TODO: remove this method. 1791 public static FusionDictionary readDictionaryBinary(final ByteBuffer buffer, 1792 final FusionDictionary dict) throws IOException, UnsupportedFormatException { 1793 return readDictionaryBinary(new ByteBufferWrapper(buffer), dict); 1794 } 1795 1796 /** 1797 * Basic test to find out whether the file is a binary dictionary or not. 1798 * 1799 * Concretely this only tests the magic number. 1800 * 1801 * @param filename The name of the file to test. 1802 * @return true if it's a binary dictionary, false otherwise 1803 */ 1804 public static boolean isBinaryDictionary(final String filename) { 1805 FileInputStream inStream = null; 1806 try { 1807 final File file = new File(filename); 1808 inStream = new FileInputStream(file); 1809 final ByteBuffer buffer = inStream.getChannel().map( 1810 FileChannel.MapMode.READ_ONLY, 0, file.length()); 1811 final int version = getFormatVersion(new ByteBufferWrapper(buffer)); 1812 return (version >= MINIMUM_SUPPORTED_VERSION && version <= MAXIMUM_SUPPORTED_VERSION); 1813 } catch (FileNotFoundException e) { 1814 return false; 1815 } catch (IOException e) { 1816 return false; 1817 } finally { 1818 if (inStream != null) { 1819 try { 1820 inStream.close(); 1821 } catch (IOException e) { 1822 // do nothing 1823 } 1824 } 1825 } 1826 } 1827 1828 /** 1829 * Calculate bigram frequency from compressed value 1830 * 1831 * @see #makeBigramFlags 1832 * 1833 * @param unigramFrequency 1834 * @param bigramFrequency compressed frequency 1835 * @return approximate bigram frequency 1836 */ 1837 public static int reconstructBigramFrequency(final int unigramFrequency, 1838 final int bigramFrequency) { 1839 final float stepSize = (MAX_TERMINAL_FREQUENCY - unigramFrequency) 1840 / (1.5f + MAX_BIGRAM_FREQUENCY); 1841 final float resultFreqFloat = (float)unigramFrequency 1842 + stepSize * (bigramFrequency + 1.0f); 1843 return (int)resultFreqFloat; 1844 } 1845} 1846