BinaryDictInputOutput.java revision eae7b293e4a854819aa0de663066cd0b6cdd52e7
1/* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17package com.android.inputmethod.latin.makedict; 18 19import com.android.inputmethod.latin.Constants; 20import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; 21import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions; 22import com.android.inputmethod.latin.makedict.FusionDictionary.Node; 23import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; 24 25import java.io.ByteArrayOutputStream; 26import java.io.File; 27import java.io.FileInputStream; 28import java.io.FileNotFoundException; 29import java.io.IOException; 30import java.io.OutputStream; 31import java.nio.ByteBuffer; 32import java.nio.channels.FileChannel; 33import java.util.ArrayList; 34import java.util.Arrays; 35import java.util.HashMap; 36import java.util.Iterator; 37import java.util.Map; 38import java.util.Stack; 39import java.util.TreeMap; 40 41/** 42 * Reads and writes XML files for a FusionDictionary. 43 * 44 * All the methods in this class are static. 45 */ 46public class BinaryDictInputOutput { 47 48 final static boolean DBG = MakedictLog.DBG; 49 50 /* Node layout is as follows: 51 * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE 52 * 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 53 * f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE 54 * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES 55 * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES 56 * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS 57 * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL 58 * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS 59 * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS 60 * | is not a word ? 1 bit, 1 = yes, 0 = no : FLAG_IS_NOT_A_WORD 61 * | is blacklisted ? 1 bit, 1 = yes, 0 = no : FLAG_IS_BLACKLISTED 62 * 63 * c | IF FLAG_HAS_MULTIPLE_CHARS 64 * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers 65 * a | end 1 byte, = 0 66 * r | ELSE 67 * s | char 1 or 3 bytes 68 * | END 69 * 70 * f | 71 * r | IF FLAG_IS_TERMINAL 72 * e | frequency 1 byte 73 * q | 74 * 75 * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType 76 * h | // nothing 77 * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType 78 * l | children address, 1 byte 79 * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType 80 * r | children address, 2 bytes 81 * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType 82 * n | children address, 3 bytes 83 * A | END 84 * d 85 * dress 86 * 87 * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS 88 * | shortcut string list 89 * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS 90 * | bigrams address list 91 * 92 * Char format is: 93 * 1 byte = bbbbbbbb match 94 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 95 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 96 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 97 * 00011111 would be outside unicode. 98 * else: iso-latin-1 code 99 * This allows for the whole unicode range to be encoded, including chars outside of 100 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 101 * characters which should never happen anyway (and still work, but take 3 bytes). 102 * 103 * bigram address list is: 104 * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT 105 * | addressSign = 1 bit, : FLAG_ATTRIBUTE_OFFSET_NEGATIVE 106 * | 1 = must take -address, 0 = must take +address 107 * | xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE 108 * | addressFormat = 2 bits, 00 = unused : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE 109 * | 01 = 1 byte : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE 110 * | 10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES 111 * | 11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES 112 * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY 113 * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat) 114 * | read 1 byte, add top 4 bits 115 * | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat) 116 * | read 2 bytes, add top 4 bits 117 * | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat 118 * | read 3 bytes, add top 4 bits 119 * | END 120 * | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address 121 * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is 122 * 123 * shortcut string list is: 124 * <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes. 125 * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT 126 * | reserved = 3 bits, must be 0 127 * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY 128 * <shortcut> = | string of characters at the char format described above, with the terminator 129 * | used to signal the end of the string. 130 * if (FLAG_ATTRIBUTE_HAS_NEXT goto flags 131 */ 132 133 private static final int VERSION_1_MAGIC_NUMBER = 0x78B1; 134 public static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; 135 private static final int MINIMUM_SUPPORTED_VERSION = 1; 136 private static final int MAXIMUM_SUPPORTED_VERSION = 2; 137 private static final int NOT_A_VERSION_NUMBER = -1; 138 private static final int FIRST_VERSION_WITH_HEADER_SIZE = 2; 139 140 // These options need to be the same numeric values as the one in the native reading code. 141 private static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; 142 private static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; 143 private static final int CONTAINS_BIGRAMS_FLAG = 0x8; 144 145 // TODO: Make this value adaptative to content data, store it in the header, and 146 // use it in the reading code. 147 private static final int MAX_WORD_LENGTH = Constants.Dictionary.MAX_WORD_LENGTH; 148 149 private static final int MASK_GROUP_ADDRESS_TYPE = 0xC0; 150 private static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; 151 private static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; 152 private static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; 153 private static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; 154 155 private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20; 156 157 private static final int FLAG_IS_TERMINAL = 0x10; 158 private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08; 159 private static final int FLAG_HAS_BIGRAMS = 0x04; 160 private static final int FLAG_IS_NOT_A_WORD = 0x02; 161 private static final int FLAG_IS_BLACKLISTED = 0x01; 162 163 private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; 164 private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; 165 private static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; 166 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; 167 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; 168 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; 169 private static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F; 170 171 private static final int GROUP_CHARACTERS_TERMINATOR = 0x1F; 172 173 private static final int GROUP_TERMINATOR_SIZE = 1; 174 private static final int GROUP_FLAGS_SIZE = 1; 175 private static final int GROUP_FREQUENCY_SIZE = 1; 176 private static final int GROUP_MAX_ADDRESS_SIZE = 3; 177 private static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1; 178 private static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; 179 private static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2; 180 181 private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; 182 private static final int INVALID_CHARACTER = -1; 183 184 private static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127 185 private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767 186 187 private static final int MAX_TERMINAL_FREQUENCY = 255; 188 private static final int MAX_BIGRAM_FREQUENCY = 15; 189 190 // Arbitrary limit to how much passes we consider address size compression should 191 // terminate in. At the time of this writing, our largest dictionary completes 192 // compression in five passes. 193 // If the number of passes exceeds this number, makedict bails with an exception on 194 // suspicion that a bug might be causing an infinite loop. 195 private static final int MAX_PASSES = 24; 196 197 public interface FusionDictionaryBufferInterface { 198 public int readUnsignedByte(); 199 public int readUnsignedShort(); 200 public int readUnsignedInt24(); 201 public int readInt(); 202 public int position(); 203 public void position(int newPosition); 204 } 205 206 public static final class ByteBufferWrapper implements FusionDictionaryBufferInterface { 207 private ByteBuffer mBuffer; 208 209 public ByteBufferWrapper(final ByteBuffer buffer) { 210 mBuffer = buffer; 211 } 212 213 @Override 214 public int readUnsignedByte() { 215 return ((int)mBuffer.get()) & 0xFF; 216 } 217 218 @Override 219 public int readUnsignedShort() { 220 return ((int)mBuffer.getShort()) & 0xFFFF; 221 } 222 223 @Override 224 public int readUnsignedInt24() { 225 final int retval = readUnsignedByte(); 226 return (retval << 16) + readUnsignedShort(); 227 } 228 229 @Override 230 public int readInt() { 231 return mBuffer.getInt(); 232 } 233 234 @Override 235 public int position() { 236 return mBuffer.position(); 237 } 238 239 @Override 240 public void position(int newPos) { 241 mBuffer.position(newPos); 242 } 243 } 244 245 /** 246 * Options about file format. 247 */ 248 public static class FormatOptions { 249 public final int mVersion; 250 public FormatOptions(final int version) { 251 mVersion = version; 252 } 253 } 254 255 /** 256 * Class representing file header. 257 */ 258 private static final class FileHeader { 259 public final int mHeaderSize; 260 public final DictionaryOptions mDictionaryOptions; 261 public final FormatOptions mFormatOptions; 262 public FileHeader(final int headerSize, final DictionaryOptions dictionaryOptions, 263 final FormatOptions formatOptions) { 264 mHeaderSize = headerSize; 265 mDictionaryOptions = dictionaryOptions; 266 mFormatOptions = formatOptions; 267 } 268 } 269 270 /** 271 * A class grouping utility function for our specific character encoding. 272 */ 273 private static class CharEncoding { 274 275 private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; 276 private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; 277 278 /** 279 * Helper method to find out whether this code fits on one byte 280 */ 281 private static boolean fitsOnOneByte(int character) { 282 return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE 283 && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; 284 } 285 286 /** 287 * Compute the size of a character given its character code. 288 * 289 * Char format is: 290 * 1 byte = bbbbbbbb match 291 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 292 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 293 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 294 * 00011111 would be outside unicode. 295 * else: iso-latin-1 code 296 * This allows for the whole unicode range to be encoded, including chars outside of 297 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 298 * characters which should never happen anyway (and still work, but take 3 bytes). 299 * 300 * @param character the character code. 301 * @return the size in binary encoded-form, either 1 or 3 bytes. 302 */ 303 private static int getCharSize(int character) { 304 // See char encoding in FusionDictionary.java 305 if (fitsOnOneByte(character)) return 1; 306 if (INVALID_CHARACTER == character) return 1; 307 return 3; 308 } 309 310 /** 311 * Compute the byte size of a character array. 312 */ 313 private static int getCharArraySize(final int[] chars) { 314 int size = 0; 315 for (int character : chars) size += getCharSize(character); 316 return size; 317 } 318 319 /** 320 * Writes a char array to a byte buffer. 321 * 322 * @param codePoints the code point array to write. 323 * @param buffer the byte buffer to write to. 324 * @param index the index in buffer to write the character array to. 325 * @return the index after the last character. 326 */ 327 private static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { 328 for (int codePoint : codePoints) { 329 if (1 == getCharSize(codePoint)) { 330 buffer[index++] = (byte)codePoint; 331 } else { 332 buffer[index++] = (byte)(0xFF & (codePoint >> 16)); 333 buffer[index++] = (byte)(0xFF & (codePoint >> 8)); 334 buffer[index++] = (byte)(0xFF & codePoint); 335 } 336 } 337 return index; 338 } 339 340 /** 341 * Writes a string with our character format to a byte buffer. 342 * 343 * This will also write the terminator byte. 344 * 345 * @param buffer the byte buffer to write to. 346 * @param origin the offset to write from. 347 * @param word the string to write. 348 * @return the size written, in bytes. 349 */ 350 private static int writeString(final byte[] buffer, final int origin, 351 final String word) { 352 final int length = word.length(); 353 int index = origin; 354 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 355 final int codePoint = word.codePointAt(i); 356 if (1 == getCharSize(codePoint)) { 357 buffer[index++] = (byte)codePoint; 358 } else { 359 buffer[index++] = (byte)(0xFF & (codePoint >> 16)); 360 buffer[index++] = (byte)(0xFF & (codePoint >> 8)); 361 buffer[index++] = (byte)(0xFF & codePoint); 362 } 363 } 364 buffer[index++] = GROUP_CHARACTERS_TERMINATOR; 365 return index - origin; 366 } 367 368 /** 369 * Writes a string with our character format to a ByteArrayOutputStream. 370 * 371 * This will also write the terminator byte. 372 * 373 * @param buffer the ByteArrayOutputStream to write to. 374 * @param word the string to write. 375 */ 376 private static void writeString(ByteArrayOutputStream buffer, final String word) { 377 final int length = word.length(); 378 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 379 final int codePoint = word.codePointAt(i); 380 if (1 == getCharSize(codePoint)) { 381 buffer.write((byte) codePoint); 382 } else { 383 buffer.write((byte) (0xFF & (codePoint >> 16))); 384 buffer.write((byte) (0xFF & (codePoint >> 8))); 385 buffer.write((byte) (0xFF & codePoint)); 386 } 387 } 388 buffer.write(GROUP_CHARACTERS_TERMINATOR); 389 } 390 391 /** 392 * Reads a string from a buffer. This is the converse of the above method. 393 */ 394 private static String readString(final FusionDictionaryBufferInterface buffer) { 395 final StringBuilder s = new StringBuilder(); 396 int character = readChar(buffer); 397 while (character != INVALID_CHARACTER) { 398 s.appendCodePoint(character); 399 character = readChar(buffer); 400 } 401 return s.toString(); 402 } 403 404 /** 405 * Reads a character from the buffer. 406 * 407 * This follows the character format documented earlier in this source file. 408 * 409 * @param buffer the buffer, positioned over an encoded character. 410 * @return the character code. 411 */ 412 private static int readChar(final FusionDictionaryBufferInterface buffer) { 413 int character = buffer.readUnsignedByte(); 414 if (!fitsOnOneByte(character)) { 415 if (GROUP_CHARACTERS_TERMINATOR == character) return INVALID_CHARACTER; 416 character <<= 16; 417 character += buffer.readUnsignedShort(); 418 } 419 return character; 420 } 421 } 422 423 /** 424 * Compute the binary size of the character array in a group 425 * 426 * If only one character, this is the size of this character. If many, it's the sum of their 427 * sizes + 1 byte for the terminator. 428 * 429 * @param group the group 430 * @return the size of the char array, including the terminator if any 431 */ 432 private static int getGroupCharactersSize(CharGroup group) { 433 int size = CharEncoding.getCharArraySize(group.mChars); 434 if (group.hasSeveralChars()) size += GROUP_TERMINATOR_SIZE; 435 return size; 436 } 437 438 /** 439 * Compute the binary size of the group count 440 * @param count the group count 441 * @return the size of the group count, either 1 or 2 bytes. 442 */ 443 private static int getGroupCountSize(final int count) { 444 if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) { 445 return 1; 446 } else if (MAX_CHARGROUPS_IN_A_NODE >= count) { 447 return 2; 448 } else { 449 throw new RuntimeException("Can't have more than " + MAX_CHARGROUPS_IN_A_NODE 450 + " groups in a node (found " + count +")"); 451 } 452 } 453 454 /** 455 * Compute the binary size of the group count for a node 456 * @param node the node 457 * @return the size of the group count, either 1 or 2 bytes. 458 */ 459 private static int getGroupCountSize(final Node node) { 460 return getGroupCountSize(node.mData.size()); 461 } 462 463 /** 464 * Compute the size of a shortcut in bytes. 465 */ 466 private static int getShortcutSize(final WeightedString shortcut) { 467 int size = GROUP_ATTRIBUTE_FLAGS_SIZE; 468 final String word = shortcut.mWord; 469 final int length = word.length(); 470 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 471 final int codePoint = word.codePointAt(i); 472 size += CharEncoding.getCharSize(codePoint); 473 } 474 size += GROUP_TERMINATOR_SIZE; 475 return size; 476 } 477 478 /** 479 * Compute the size of a shortcut list in bytes. 480 * 481 * This is known in advance and does not change according to position in the file 482 * like address lists do. 483 */ 484 private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { 485 if (null == shortcutList) return 0; 486 int size = GROUP_SHORTCUT_LIST_SIZE_SIZE; 487 for (final WeightedString shortcut : shortcutList) { 488 size += getShortcutSize(shortcut); 489 } 490 return size; 491 } 492 493 /** 494 * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything. 495 * 496 * @param group the CharGroup to compute the size of. 497 * @return the maximum size of the group. 498 */ 499 private static int getCharGroupMaximumSize(CharGroup group) { 500 int size = getGroupCharactersSize(group) + GROUP_FLAGS_SIZE; 501 // If terminal, one byte for the frequency 502 if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE; 503 size += GROUP_MAX_ADDRESS_SIZE; // For children address 504 size += getShortcutListSize(group.mShortcutTargets); 505 if (null != group.mBigrams) { 506 size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) 507 * group.mBigrams.size(); 508 } 509 return size; 510 } 511 512 /** 513 * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches 514 * it in the 'actualSize' member of the node. 515 * 516 * @param node the node to compute the maximum size of. 517 */ 518 private static void setNodeMaximumSize(Node node) { 519 int size = getGroupCountSize(node); 520 for (CharGroup g : node.mData) { 521 final int groupSize = getCharGroupMaximumSize(g); 522 g.mCachedSize = groupSize; 523 size += groupSize; 524 } 525 node.mCachedSize = size; 526 } 527 528 /** 529 * Helper method to hide the actual value of the no children address. 530 */ 531 private static boolean hasChildrenAddress(int address) { 532 return NO_CHILDREN_ADDRESS != address; 533 } 534 535 /** 536 * Compute the size, in bytes, that an address will occupy. 537 * 538 * This can be used either for children addresses (which are always positive) or for 539 * attribute, which may be positive or negative but 540 * store their sign bit separately. 541 * 542 * @param address the address 543 * @return the byte size. 544 */ 545 private static int getByteSize(int address) { 546 assert(address < 0x1000000); 547 if (!hasChildrenAddress(address)) { 548 return 0; 549 } else if (Math.abs(address) < 0x100) { 550 return 1; 551 } else if (Math.abs(address) < 0x10000) { 552 return 2; 553 } else { 554 return 3; 555 } 556 } 557 // End utility methods. 558 559 // This method is responsible for finding a nice ordering of the nodes that favors run-time 560 // cache performance and dictionary size. 561 /* package for tests */ static ArrayList<Node> flattenTree(Node root) { 562 final int treeSize = FusionDictionary.countCharGroups(root); 563 MakedictLog.i("Counted nodes : " + treeSize); 564 final ArrayList<Node> flatTree = new ArrayList<Node>(treeSize); 565 return flattenTreeInner(flatTree, root); 566 } 567 568 private static ArrayList<Node> flattenTreeInner(ArrayList<Node> list, Node node) { 569 // Removing the node is necessary if the tails are merged, because we would then 570 // add the same node several times when we only want it once. A number of places in 571 // the code also depends on any node being only once in the list. 572 // Merging tails can only be done if there are no attributes. Searching for attributes 573 // in LatinIME code depends on a total breadth-first ordering, which merging tails 574 // breaks. If there are no attributes, it should be fine (and reduce the file size) 575 // to merge tails, and removing the node from the list would be necessary. However, 576 // we don't merge tails because breaking the breadth-first ordering would result in 577 // extreme overhead at bigram lookup time (it would make the search function O(n) instead 578 // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty 579 // high). 580 // If no nodes are ever merged, we can't have the same node twice in the list, hence 581 // searching for duplicates in unnecessary. It is also very performance consuming, 582 // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making 583 // this simple list.remove operation O(n*n) overall. On Android this overhead is very 584 // high. 585 // For future reference, the code to remove duplicate is a simple : list.remove(node); 586 list.add(node); 587 final ArrayList<CharGroup> branches = node.mData; 588 final int nodeSize = branches.size(); 589 for (CharGroup group : branches) { 590 if (null != group.mChildren) flattenTreeInner(list, group.mChildren); 591 } 592 return list; 593 } 594 595 /** 596 * Finds the absolute address of a word in the dictionary. 597 * 598 * @param dict the dictionary in which to search. 599 * @param word the word we are searching for. 600 * @return the word address. If it is not found, an exception is thrown. 601 */ 602 private static int findAddressOfWord(final FusionDictionary dict, final String word) { 603 return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress; 604 } 605 606 /** 607 * Computes the actual node size, based on the cached addresses of the children nodes. 608 * 609 * Each node stores its tentative address. During dictionary address computing, these 610 * are not final, but they can be used to compute the node size (the node size depends 611 * on the address of the children because the number of bytes necessary to store an 612 * address depends on its numeric value. The return value indicates whether the node 613 * contents (as in, any of the addresses stored in the cache fields) have changed with 614 * respect to their previous value. 615 * 616 * @param node the node to compute the size of. 617 * @param dict the dictionary in which the word/attributes are to be found. 618 * @return false if none of the cached addresses inside the node changed, true otherwise. 619 */ 620 private static boolean computeActualNodeSize(Node node, FusionDictionary dict) { 621 boolean changed = false; 622 int size = getGroupCountSize(node); 623 for (CharGroup group : node.mData) { 624 if (group.mCachedAddress != node.mCachedAddress + size) { 625 changed = true; 626 group.mCachedAddress = node.mCachedAddress + size; 627 } 628 int groupSize = GROUP_FLAGS_SIZE + getGroupCharactersSize(group); 629 if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE; 630 if (null != group.mChildren) { 631 final int offsetBasePoint= groupSize + node.mCachedAddress + size; 632 final int offset = group.mChildren.mCachedAddress - offsetBasePoint; 633 groupSize += getByteSize(offset); 634 } 635 groupSize += getShortcutListSize(group.mShortcutTargets); 636 if (null != group.mBigrams) { 637 for (WeightedString bigram : group.mBigrams) { 638 final int offsetBasePoint = groupSize + node.mCachedAddress + size 639 + GROUP_FLAGS_SIZE; 640 final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); 641 final int offset = addressOfBigram - offsetBasePoint; 642 groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE; 643 } 644 } 645 group.mCachedSize = groupSize; 646 size += groupSize; 647 } 648 if (node.mCachedSize != size) { 649 node.mCachedSize = size; 650 changed = true; 651 } 652 return changed; 653 } 654 655 /** 656 * Computes the byte size of a list of nodes and updates each node cached position. 657 * 658 * @param flatNodes the array of nodes. 659 * @return the byte size of the entire stack. 660 */ 661 private static int stackNodes(ArrayList<Node> flatNodes) { 662 int nodeOffset = 0; 663 for (Node n : flatNodes) { 664 n.mCachedAddress = nodeOffset; 665 int groupCountSize = getGroupCountSize(n); 666 int groupOffset = 0; 667 for (CharGroup g : n.mData) { 668 g.mCachedAddress = groupCountSize + nodeOffset + groupOffset; 669 groupOffset += g.mCachedSize; 670 } 671 if (groupOffset + groupCountSize != n.mCachedSize) { 672 throw new RuntimeException("Bug : Stored and computed node size differ"); 673 } 674 nodeOffset += n.mCachedSize; 675 } 676 return nodeOffset; 677 } 678 679 /** 680 * Compute the addresses and sizes of an ordered node array. 681 * 682 * This method takes a node array and will update its cached address and size values 683 * so that they can be written into a file. It determines the smallest size each of the 684 * nodes can be given the addresses of its children and attributes, and store that into 685 * each node. 686 * The order of the node is given by the order of the array. This method makes no effort 687 * to find a good order; it only mechanically computes the size this order results in. 688 * 689 * @param dict the dictionary 690 * @param flatNodes the ordered array of nodes 691 * @return the same array it was passed. The nodes have been updated for address and size. 692 */ 693 private static ArrayList<Node> computeAddresses(FusionDictionary dict, 694 ArrayList<Node> flatNodes) { 695 // First get the worst sizes and offsets 696 for (Node n : flatNodes) setNodeMaximumSize(n); 697 final int offset = stackNodes(flatNodes); 698 699 MakedictLog.i("Compressing the array addresses. Original size : " + offset); 700 MakedictLog.i("(Recursively seen size : " + offset + ")"); 701 702 int passes = 0; 703 boolean changesDone = false; 704 do { 705 changesDone = false; 706 for (Node n : flatNodes) { 707 final int oldNodeSize = n.mCachedSize; 708 final boolean changed = computeActualNodeSize(n, dict); 709 final int newNodeSize = n.mCachedSize; 710 if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!"); 711 changesDone |= changed; 712 } 713 stackNodes(flatNodes); 714 ++passes; 715 if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug"); 716 } while (changesDone); 717 718 final Node lastNode = flatNodes.get(flatNodes.size() - 1); 719 MakedictLog.i("Compression complete in " + passes + " passes."); 720 MakedictLog.i("After address compression : " 721 + (lastNode.mCachedAddress + lastNode.mCachedSize)); 722 723 return flatNodes; 724 } 725 726 /** 727 * Sanity-checking method. 728 * 729 * This method checks an array of node for juxtaposition, that is, it will do 730 * nothing if each node's cached address is actually the previous node's address 731 * plus the previous node's size. 732 * If this is not the case, it will throw an exception. 733 * 734 * @param array the array node to check 735 */ 736 private static void checkFlatNodeArray(ArrayList<Node> array) { 737 int offset = 0; 738 int index = 0; 739 for (Node n : array) { 740 if (n.mCachedAddress != offset) { 741 throw new RuntimeException("Wrong address for node " + index 742 + " : expected " + offset + ", got " + n.mCachedAddress); 743 } 744 ++index; 745 offset += n.mCachedSize; 746 } 747 } 748 749 /** 750 * Helper method to write a variable-size address to a file. 751 * 752 * @param buffer the buffer to write to. 753 * @param index the index in the buffer to write the address to. 754 * @param address the address to write. 755 * @return the size in bytes the address actually took. 756 */ 757 private static int writeVariableAddress(final byte[] buffer, int index, final int address) { 758 switch (getByteSize(address)) { 759 case 1: 760 buffer[index++] = (byte)address; 761 return 1; 762 case 2: 763 buffer[index++] = (byte)(0xFF & (address >> 8)); 764 buffer[index++] = (byte)(0xFF & address); 765 return 2; 766 case 3: 767 buffer[index++] = (byte)(0xFF & (address >> 16)); 768 buffer[index++] = (byte)(0xFF & (address >> 8)); 769 buffer[index++] = (byte)(0xFF & address); 770 return 3; 771 case 0: 772 return 0; 773 default: 774 throw new RuntimeException("Address " + address + " has a strange size"); 775 } 776 } 777 778 private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, 779 final int childrenOffset) { 780 byte flags = 0; 781 if (group.mChars.length > 1) flags |= FLAG_HAS_MULTIPLE_CHARS; 782 if (group.mFrequency >= 0) { 783 flags |= FLAG_IS_TERMINAL; 784 } 785 if (null != group.mChildren) { 786 switch (getByteSize(childrenOffset)) { 787 case 1: 788 flags |= FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; 789 break; 790 case 2: 791 flags |= FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; 792 break; 793 case 3: 794 flags |= FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; 795 break; 796 default: 797 throw new RuntimeException("Node with a strange address"); 798 } 799 } 800 if (null != group.mShortcutTargets) { 801 if (DBG && 0 == group.mShortcutTargets.size()) { 802 throw new RuntimeException("0-sized shortcut list must be null"); 803 } 804 flags |= FLAG_HAS_SHORTCUT_TARGETS; 805 } 806 if (null != group.mBigrams) { 807 if (DBG && 0 == group.mBigrams.size()) { 808 throw new RuntimeException("0-sized bigram list must be null"); 809 } 810 flags |= FLAG_HAS_BIGRAMS; 811 } 812 if (group.mIsNotAWord) { 813 flags |= FLAG_IS_NOT_A_WORD; 814 } 815 if (group.mIsBlacklistEntry) { 816 flags |= FLAG_IS_BLACKLISTED; 817 } 818 return flags; 819 } 820 821 /** 822 * Makes the flag value for a bigram. 823 * 824 * @param more whether there are more bigrams after this one. 825 * @param offset the offset of the bigram. 826 * @param bigramFrequency the frequency of the bigram, 0..255. 827 * @param unigramFrequency the unigram frequency of the same word, 0..255. 828 * @param word the second bigram, for debugging purposes 829 * @return the flags 830 */ 831 private static final int makeBigramFlags(final boolean more, final int offset, 832 int bigramFrequency, final int unigramFrequency, final String word) { 833 int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) 834 + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0); 835 switch (getByteSize(offset)) { 836 case 1: 837 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; 838 break; 839 case 2: 840 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; 841 break; 842 case 3: 843 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; 844 break; 845 default: 846 throw new RuntimeException("Strange offset size"); 847 } 848 if (unigramFrequency > bigramFrequency) { 849 MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word 850 + "\". Bigram freq is " + bigramFrequency + ", unigram freq for " 851 + word + " is " + unigramFrequency); 852 bigramFrequency = unigramFrequency; 853 } 854 // We compute the difference between 255 (which means probability = 1) and the 855 // unigram score. We split this into a number of discrete steps. 856 // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15 857 // represents an increase of 16 steps: a value of 15 will be interpreted as the median 858 // value of the 16th step. In all justice, if the bigram frequency is low enough to be 859 // rounded below the first step (which means it is less than half a step higher than the 860 // unigram frequency) then the unigram frequency itself is the best approximation of the 861 // bigram freq that we could possibly supply, hence we should *not* include this bigram 862 // in the file at all. 863 // until this is done, we'll write 0 and slightly overestimate this case. 864 // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step 865 // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to 866 // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the 867 // step size. Then we compute the start of the first step (the one where value 0 starts) 868 // by adding half-a-step to the unigramFrequency. From there, we compute the integer 869 // number of steps to the bigramFrequency. One last thing: we want our steps to include 870 // their lower bound and exclude their higher bound so we need to have the first step 871 // start at exactly 1 unit higher than floor(unigramFreq + half a step). 872 // Note : to reconstruct the score, the dictionary reader will need to divide 873 // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step, 874 // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best 875 // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the 876 // step pointed by the discretized frequency. 877 final float stepSize = 878 (MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5f + MAX_BIGRAM_FREQUENCY); 879 final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f); 880 final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); 881 // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 882 // here. The best approximation would be the unigram freq itself, so we should not 883 // include this bigram in the dictionary. For now, register as 0, and live with the 884 // small over-estimation that we get in this case. TODO: actually remove this bigram 885 // if discretizedFrequency < 0. 886 final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0; 887 bigramFlags += finalBigramFrequency & FLAG_ATTRIBUTE_FREQUENCY; 888 return bigramFlags; 889 } 890 891 /** 892 * Makes the 2-byte value for options flags. 893 */ 894 private static final int makeOptionsValue(final FusionDictionary dictionary) { 895 final DictionaryOptions options = dictionary.mOptions; 896 final boolean hasBigrams = dictionary.hasBigrams(); 897 return (options.mFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0) 898 + (options.mGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0) 899 + (hasBigrams ? CONTAINS_BIGRAMS_FLAG : 0); 900 } 901 902 /** 903 * Makes the flag value for a shortcut. 904 * 905 * @param more whether there are more attributes after this one. 906 * @param frequency the frequency of the attribute, 0..15 907 * @return the flags 908 */ 909 private static final int makeShortcutFlags(final boolean more, final int frequency) { 910 return (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (frequency & FLAG_ATTRIBUTE_FREQUENCY); 911 } 912 913 /** 914 * Write a node to memory. The node is expected to have its final position cached. 915 * 916 * This can be an empty map, but the more is inside the faster the lookups will be. It can 917 * be carried on as long as nodes do not move. 918 * 919 * @param dict the dictionary the node is a part of (for relative offsets). 920 * @param buffer the memory buffer to write to. 921 * @param node the node to write. 922 * @return the address of the END of the node. 923 */ 924 private static int writePlacedNode(FusionDictionary dict, byte[] buffer, Node node) { 925 int index = node.mCachedAddress; 926 927 final int groupCount = node.mData.size(); 928 final int countSize = getGroupCountSize(node); 929 if (1 == countSize) { 930 buffer[index++] = (byte)groupCount; 931 } else if (2 == countSize) { 932 // We need to signal 2-byte size by setting the top bit of the MSB to 1, so 933 // we | 0x80 to do this. 934 buffer[index++] = (byte)((groupCount >> 8) | 0x80); 935 buffer[index++] = (byte)(groupCount & 0xFF); 936 } else { 937 throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); 938 } 939 int groupAddress = index; 940 for (int i = 0; i < groupCount; ++i) { 941 CharGroup group = node.mData.get(i); 942 if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not " 943 + "the same as the cached address of the group : " 944 + index + " <> " + group.mCachedAddress); 945 groupAddress += GROUP_FLAGS_SIZE + getGroupCharactersSize(group); 946 // Sanity checks. 947 if (DBG && group.mFrequency > MAX_TERMINAL_FREQUENCY) { 948 throw new RuntimeException("A node has a frequency > " + MAX_TERMINAL_FREQUENCY 949 + " : " + group.mFrequency); 950 } 951 if (group.mFrequency >= 0) groupAddress += GROUP_FREQUENCY_SIZE; 952 final int childrenOffset = null == group.mChildren 953 ? NO_CHILDREN_ADDRESS : group.mChildren.mCachedAddress - groupAddress; 954 byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset); 955 buffer[index++] = flags; 956 index = CharEncoding.writeCharArray(group.mChars, buffer, index); 957 if (group.hasSeveralChars()) { 958 buffer[index++] = GROUP_CHARACTERS_TERMINATOR; 959 } 960 if (group.mFrequency >= 0) { 961 buffer[index++] = (byte) group.mFrequency; 962 } 963 final int shift = writeVariableAddress(buffer, index, childrenOffset); 964 index += shift; 965 groupAddress += shift; 966 967 // Write shortcuts 968 if (null != group.mShortcutTargets) { 969 final int indexOfShortcutByteSize = index; 970 index += GROUP_SHORTCUT_LIST_SIZE_SIZE; 971 groupAddress += GROUP_SHORTCUT_LIST_SIZE_SIZE; 972 final Iterator<WeightedString> shortcutIterator = group.mShortcutTargets.iterator(); 973 while (shortcutIterator.hasNext()) { 974 final WeightedString target = shortcutIterator.next(); 975 ++groupAddress; 976 int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(), 977 target.mFrequency); 978 buffer[index++] = (byte)shortcutFlags; 979 final int shortcutShift = CharEncoding.writeString(buffer, index, target.mWord); 980 index += shortcutShift; 981 groupAddress += shortcutShift; 982 } 983 final int shortcutByteSize = index - indexOfShortcutByteSize; 984 if (shortcutByteSize > 0xFFFF) { 985 throw new RuntimeException("Shortcut list too large"); 986 } 987 buffer[indexOfShortcutByteSize] = (byte)(shortcutByteSize >> 8); 988 buffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF); 989 } 990 // Write bigrams 991 if (null != group.mBigrams) { 992 final Iterator<WeightedString> bigramIterator = group.mBigrams.iterator(); 993 while (bigramIterator.hasNext()) { 994 final WeightedString bigram = bigramIterator.next(); 995 final CharGroup target = 996 FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord); 997 final int addressOfBigram = target.mCachedAddress; 998 final int unigramFrequencyForThisWord = target.mFrequency; 999 ++groupAddress; 1000 final int offset = addressOfBigram - groupAddress; 1001 int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset, 1002 bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); 1003 buffer[index++] = (byte)bigramFlags; 1004 final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); 1005 index += bigramShift; 1006 groupAddress += bigramShift; 1007 } 1008 } 1009 1010 } 1011 if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException( 1012 "Not the same size : written " 1013 + (index - node.mCachedAddress) + " bytes out of a node that should have " 1014 + node.mCachedSize + " bytes"); 1015 return index; 1016 } 1017 1018 /** 1019 * Dumps a collection of useful statistics about a node array. 1020 * 1021 * This prints purely informative stuff, like the total estimated file size, the 1022 * number of nodes, of character groups, the repartition of each address size, etc 1023 * 1024 * @param nodes the node array. 1025 */ 1026 private static void showStatistics(ArrayList<Node> nodes) { 1027 int firstTerminalAddress = Integer.MAX_VALUE; 1028 int lastTerminalAddress = Integer.MIN_VALUE; 1029 int size = 0; 1030 int charGroups = 0; 1031 int maxGroups = 0; 1032 int maxRuns = 0; 1033 for (Node n : nodes) { 1034 if (maxGroups < n.mData.size()) maxGroups = n.mData.size(); 1035 for (CharGroup cg : n.mData) { 1036 ++charGroups; 1037 if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length; 1038 if (cg.mFrequency >= 0) { 1039 if (n.mCachedAddress < firstTerminalAddress) 1040 firstTerminalAddress = n.mCachedAddress; 1041 if (n.mCachedAddress > lastTerminalAddress) 1042 lastTerminalAddress = n.mCachedAddress; 1043 } 1044 } 1045 if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize; 1046 } 1047 final int[] groupCounts = new int[maxGroups + 1]; 1048 final int[] runCounts = new int[maxRuns + 1]; 1049 for (Node n : nodes) { 1050 ++groupCounts[n.mData.size()]; 1051 for (CharGroup cg : n.mData) { 1052 ++runCounts[cg.mChars.length]; 1053 } 1054 } 1055 1056 MakedictLog.i("Statistics:\n" 1057 + " total file size " + size + "\n" 1058 + " " + nodes.size() + " nodes\n" 1059 + " " + charGroups + " groups (" + ((float)charGroups / nodes.size()) 1060 + " groups per node)\n" 1061 + " first terminal at " + firstTerminalAddress + "\n" 1062 + " last terminal at " + lastTerminalAddress + "\n" 1063 + " Group stats : max = " + maxGroups); 1064 for (int i = 0; i < groupCounts.length; ++i) { 1065 MakedictLog.i(" " + i + " : " + groupCounts[i]); 1066 } 1067 MakedictLog.i(" Character run stats : max = " + maxRuns); 1068 for (int i = 0; i < runCounts.length; ++i) { 1069 MakedictLog.i(" " + i + " : " + runCounts[i]); 1070 } 1071 } 1072 1073 /** 1074 * Dumps a FusionDictionary to a file. 1075 * 1076 * This is the public entry point to write a dictionary to a file. 1077 * 1078 * @param destination the stream to write the binary data to. 1079 * @param dict the dictionary to write. 1080 * @param formatOptions the options of file format. 1081 */ 1082 public static void writeDictionaryBinary(final OutputStream destination, 1083 final FusionDictionary dict, final FormatOptions formatOptions) 1084 throws IOException, UnsupportedFormatException { 1085 1086 // Addresses are limited to 3 bytes, but since addresses can be relative to each node, the 1087 // structure itself is not limited to 16MB. However, if it is over 16MB deciding the order 1088 // of the nodes becomes a quite complicated problem, because though the dictionary itself 1089 // does not have a size limit, each node must still be within 16MB of all its children and 1090 // parents. As long as this is ensured, the dictionary file may grow to any size. 1091 1092 final int version = formatOptions.mVersion; 1093 if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) { 1094 throw new UnsupportedFormatException("Requested file format version " + version 1095 + ", but this implementation only supports versions " 1096 + MINIMUM_SUPPORTED_VERSION + " through " + MAXIMUM_SUPPORTED_VERSION); 1097 } 1098 1099 ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256); 1100 1101 // The magic number in big-endian order. 1102 if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { 1103 // Magic number for version 2+. 1104 headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 24))); 1105 headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 16))); 1106 headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 8))); 1107 headerBuffer.write((byte) (0xFF & VERSION_2_MAGIC_NUMBER)); 1108 // Dictionary version. 1109 headerBuffer.write((byte) (0xFF & (version >> 8))); 1110 headerBuffer.write((byte) (0xFF & version)); 1111 } else { 1112 // Magic number for version 1. 1113 headerBuffer.write((byte) (0xFF & (VERSION_1_MAGIC_NUMBER >> 8))); 1114 headerBuffer.write((byte) (0xFF & VERSION_1_MAGIC_NUMBER)); 1115 // Dictionary version. 1116 headerBuffer.write((byte) (0xFF & version)); 1117 } 1118 // Options flags 1119 final int options = makeOptionsValue(dict); 1120 headerBuffer.write((byte) (0xFF & (options >> 8))); 1121 headerBuffer.write((byte) (0xFF & options)); 1122 if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { 1123 final int headerSizeOffset = headerBuffer.size(); 1124 // Placeholder to be written later with header size. 1125 for (int i = 0; i < 4; ++i) { 1126 headerBuffer.write(0); 1127 } 1128 // Write out the options. 1129 for (final String key : dict.mOptions.mAttributes.keySet()) { 1130 final String value = dict.mOptions.mAttributes.get(key); 1131 CharEncoding.writeString(headerBuffer, key); 1132 CharEncoding.writeString(headerBuffer, value); 1133 } 1134 final int size = headerBuffer.size(); 1135 final byte[] bytes = headerBuffer.toByteArray(); 1136 // Write out the header size. 1137 bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24)); 1138 bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16)); 1139 bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8)); 1140 bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0)); 1141 destination.write(bytes); 1142 } else { 1143 headerBuffer.writeTo(destination); 1144 } 1145 1146 headerBuffer.close(); 1147 1148 // Leave the choice of the optimal node order to the flattenTree function. 1149 MakedictLog.i("Flattening the tree..."); 1150 ArrayList<Node> flatNodes = flattenTree(dict.mRoot); 1151 1152 MakedictLog.i("Computing addresses..."); 1153 computeAddresses(dict, flatNodes); 1154 MakedictLog.i("Checking array..."); 1155 if (DBG) checkFlatNodeArray(flatNodes); 1156 1157 // Create a buffer that matches the final dictionary size. 1158 final Node lastNode = flatNodes.get(flatNodes.size() - 1); 1159 final int bufferSize =(lastNode.mCachedAddress + lastNode.mCachedSize); 1160 final byte[] buffer = new byte[bufferSize]; 1161 int index = 0; 1162 1163 MakedictLog.i("Writing file..."); 1164 int dataEndOffset = 0; 1165 for (Node n : flatNodes) { 1166 dataEndOffset = writePlacedNode(dict, buffer, n); 1167 } 1168 1169 if (DBG) showStatistics(flatNodes); 1170 1171 destination.write(buffer, 0, dataEndOffset); 1172 1173 destination.close(); 1174 MakedictLog.i("Done"); 1175 } 1176 1177 1178 // Input methods: Read a binary dictionary to memory. 1179 // readDictionaryBinary is the public entry point for them. 1180 1181 static final int[] characterBuffer = new int[MAX_WORD_LENGTH]; 1182 private static CharGroupInfo readCharGroup(final FusionDictionaryBufferInterface buffer, 1183 final int originalGroupAddress) { 1184 int addressPointer = originalGroupAddress; 1185 final int flags = buffer.readUnsignedByte(); 1186 ++addressPointer; 1187 final int characters[]; 1188 if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) { 1189 int index = 0; 1190 int character = CharEncoding.readChar(buffer); 1191 addressPointer += CharEncoding.getCharSize(character); 1192 while (-1 != character) { 1193 characterBuffer[index++] = character; 1194 character = CharEncoding.readChar(buffer); 1195 addressPointer += CharEncoding.getCharSize(character); 1196 } 1197 characters = Arrays.copyOfRange(characterBuffer, 0, index); 1198 } else { 1199 final int character = CharEncoding.readChar(buffer); 1200 addressPointer += CharEncoding.getCharSize(character); 1201 characters = new int[] { character }; 1202 } 1203 final int frequency; 1204 if (0 != (FLAG_IS_TERMINAL & flags)) { 1205 ++addressPointer; 1206 frequency = buffer.readUnsignedByte(); 1207 } else { 1208 frequency = CharGroup.NOT_A_TERMINAL; 1209 } 1210 int childrenAddress = addressPointer; 1211 switch (flags & MASK_GROUP_ADDRESS_TYPE) { 1212 case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: 1213 childrenAddress += buffer.readUnsignedByte(); 1214 addressPointer += 1; 1215 break; 1216 case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: 1217 childrenAddress += buffer.readUnsignedShort(); 1218 addressPointer += 2; 1219 break; 1220 case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: 1221 childrenAddress += buffer.readUnsignedInt24(); 1222 addressPointer += 3; 1223 break; 1224 case FLAG_GROUP_ADDRESS_TYPE_NOADDRESS: 1225 default: 1226 childrenAddress = NO_CHILDREN_ADDRESS; 1227 break; 1228 } 1229 ArrayList<WeightedString> shortcutTargets = null; 1230 if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) { 1231 final int pointerBefore = buffer.position(); 1232 shortcutTargets = new ArrayList<WeightedString>(); 1233 buffer.readUnsignedShort(); // Skip the size 1234 while (true) { 1235 final int targetFlags = buffer.readUnsignedByte(); 1236 final String word = CharEncoding.readString(buffer); 1237 shortcutTargets.add(new WeightedString(word, 1238 targetFlags & FLAG_ATTRIBUTE_FREQUENCY)); 1239 if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; 1240 } 1241 addressPointer += buffer.position() - pointerBefore; 1242 } 1243 ArrayList<PendingAttribute> bigrams = null; 1244 if (0 != (flags & FLAG_HAS_BIGRAMS)) { 1245 bigrams = new ArrayList<PendingAttribute>(); 1246 while (true) { 1247 final int bigramFlags = buffer.readUnsignedByte(); 1248 ++addressPointer; 1249 final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1; 1250 int bigramAddress = addressPointer; 1251 switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { 1252 case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: 1253 bigramAddress += sign * buffer.readUnsignedByte(); 1254 addressPointer += 1; 1255 break; 1256 case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: 1257 bigramAddress += sign * buffer.readUnsignedShort(); 1258 addressPointer += 2; 1259 break; 1260 case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: 1261 final int offset = (buffer.readUnsignedByte() << 16) 1262 + buffer.readUnsignedShort(); 1263 bigramAddress += sign * offset; 1264 addressPointer += 3; 1265 break; 1266 default: 1267 throw new RuntimeException("Has bigrams with no address"); 1268 } 1269 bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY, 1270 bigramAddress)); 1271 if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; 1272 } 1273 } 1274 return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, 1275 childrenAddress, shortcutTargets, bigrams); 1276 } 1277 1278 /** 1279 * Reads and returns the char group count out of a buffer and forwards the pointer. 1280 */ 1281 private static int readCharGroupCount(final FusionDictionaryBufferInterface buffer) { 1282 final int msb = buffer.readUnsignedByte(); 1283 if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) { 1284 return msb; 1285 } else { 1286 return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8) 1287 + buffer.readUnsignedByte(); 1288 } 1289 } 1290 1291 // The word cache here is a stopgap bandaid to help the catastrophic performance 1292 // of this method. Since it performs direct, unbuffered random access to the file and 1293 // may be called hundreds of thousands of times, the resulting performance is not 1294 // reasonable without some kind of cache. Thus: 1295 private static TreeMap<Integer, String> wordCache = new TreeMap<Integer, String>(); 1296 /** 1297 * Finds, as a string, the word at the address passed as an argument. 1298 * 1299 * @param buffer the buffer to read from. 1300 * @param headerSize the size of the header. 1301 * @param address the address to seek. 1302 * @return the word, as a string. 1303 */ 1304 private static String getWordAtAddress(final FusionDictionaryBufferInterface buffer, 1305 final int headerSize, final int address) { 1306 final String cachedString = wordCache.get(address); 1307 if (null != cachedString) return cachedString; 1308 final int originalPointer = buffer.position(); 1309 buffer.position(headerSize); 1310 final int count = readCharGroupCount(buffer); 1311 int groupOffset = getGroupCountSize(count); 1312 final StringBuilder builder = new StringBuilder(); 1313 String result = null; 1314 1315 CharGroupInfo last = null; 1316 for (int i = count - 1; i >= 0; --i) { 1317 CharGroupInfo info = readCharGroup(buffer, groupOffset); 1318 groupOffset = info.mEndAddress; 1319 if (info.mOriginalAddress == address) { 1320 builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); 1321 result = builder.toString(); 1322 break; // and return 1323 } 1324 if (hasChildrenAddress(info.mChildrenAddress)) { 1325 if (info.mChildrenAddress > address) { 1326 if (null == last) continue; 1327 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 1328 buffer.position(last.mChildrenAddress + headerSize); 1329 groupOffset = last.mChildrenAddress + 1; 1330 i = buffer.readUnsignedByte(); 1331 last = null; 1332 continue; 1333 } 1334 last = info; 1335 } 1336 if (0 == i && hasChildrenAddress(last.mChildrenAddress)) { 1337 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 1338 buffer.position(last.mChildrenAddress + headerSize); 1339 groupOffset = last.mChildrenAddress + 1; 1340 i = buffer.readUnsignedByte(); 1341 last = null; 1342 continue; 1343 } 1344 } 1345 buffer.position(originalPointer); 1346 wordCache.put(address, result); 1347 return result; 1348 } 1349 1350 /** 1351 * Reads a single node from a buffer. 1352 * 1353 * This methods reads the file at the current position. A node is fully expected to start at 1354 * the current position. 1355 * This will recursively read other nodes into the structure, populating the reverse 1356 * maps on the fly and using them to keep track of already read nodes. 1357 * 1358 * @param buffer the buffer, correctly positioned at the start of a node. 1359 * @param headerSize the size, in bytes, of the file header. 1360 * @param reverseNodeMap a mapping from addresses to already read nodes. 1361 * @param reverseGroupMap a mapping from addresses to already read character groups. 1362 * @return the read node with all his children already read. 1363 */ 1364 private static Node readNode(final FusionDictionaryBufferInterface buffer, final int headerSize, 1365 final Map<Integer, Node> reverseNodeMap, final Map<Integer, CharGroup> reverseGroupMap) 1366 throws IOException { 1367 final int nodeOrigin = buffer.position() - headerSize; 1368 final int count = readCharGroupCount(buffer); 1369 final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>(); 1370 int groupOffset = nodeOrigin + getGroupCountSize(count); 1371 for (int i = count; i > 0; --i) { 1372 CharGroupInfo info = readCharGroup(buffer, groupOffset); 1373 ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets; 1374 ArrayList<WeightedString> bigrams = null; 1375 if (null != info.mBigrams) { 1376 bigrams = new ArrayList<WeightedString>(); 1377 for (PendingAttribute bigram : info.mBigrams) { 1378 final String word = getWordAtAddress( 1379 buffer, headerSize, bigram.mAddress); 1380 bigrams.add(new WeightedString(word, bigram.mFrequency)); 1381 } 1382 } 1383 if (hasChildrenAddress(info.mChildrenAddress)) { 1384 Node children = reverseNodeMap.get(info.mChildrenAddress); 1385 if (null == children) { 1386 final int currentPosition = buffer.position(); 1387 buffer.position(info.mChildrenAddress + headerSize); 1388 children = readNode( 1389 buffer, headerSize, reverseNodeMap, reverseGroupMap); 1390 buffer.position(currentPosition); 1391 } 1392 nodeContents.add( 1393 new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, 1394 0 != (info.mFlags & FLAG_IS_NOT_A_WORD), 1395 0 != (info.mFlags & FLAG_IS_BLACKLISTED), children)); 1396 } else { 1397 nodeContents.add( 1398 new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, 1399 0 != (info.mFlags & FLAG_IS_NOT_A_WORD), 1400 0 != (info.mFlags & FLAG_IS_BLACKLISTED))); 1401 } 1402 groupOffset = info.mEndAddress; 1403 } 1404 final Node node = new Node(nodeContents); 1405 node.mCachedAddress = nodeOrigin; 1406 reverseNodeMap.put(node.mCachedAddress, node); 1407 return node; 1408 } 1409 1410 // TODO: move these methods (readUnigramsAndBigramsBinary(|Inner)) and an inner class (Position) 1411 // out of this class. 1412 private static class Position { 1413 public static final int NOT_READ_GROUPCOUNT = -1; 1414 1415 public int mAddress; 1416 public int mNumOfCharGroup; 1417 public int mPosition; 1418 public int mLength; 1419 1420 public Position(int address, int length) { 1421 mAddress = address; 1422 mLength = length; 1423 mNumOfCharGroup = NOT_READ_GROUPCOUNT; 1424 } 1425 } 1426 1427 /** 1428 * Tours all node without recursive call. 1429 */ 1430 private static void readUnigramsAndBigramsBinaryInner( 1431 final FusionDictionaryBufferInterface buffer, final int headerSize, 1432 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 1433 final Map<Integer, ArrayList<PendingAttribute>> bigrams) { 1434 int[] pushedChars = new int[MAX_WORD_LENGTH + 1]; 1435 1436 Stack<Position> stack = new Stack<Position>(); 1437 int index = 0; 1438 1439 Position initPos = new Position(headerSize, 0); 1440 stack.push(initPos); 1441 1442 while (!stack.empty()) { 1443 Position p = stack.peek(); 1444 1445 if (DBG) { 1446 MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" + 1447 p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength); 1448 } 1449 1450 if (buffer.position() != p.mAddress) buffer.position(p.mAddress); 1451 if (index != p.mLength) index = p.mLength; 1452 1453 if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) { 1454 p.mNumOfCharGroup = readCharGroupCount(buffer); 1455 p.mAddress += getGroupCountSize(p.mNumOfCharGroup); 1456 p.mPosition = 0; 1457 } 1458 1459 CharGroupInfo info = readCharGroup(buffer, p.mAddress - headerSize); 1460 for (int i = 0; i < info.mCharacters.length; ++i) { 1461 pushedChars[index++] = info.mCharacters[i]; 1462 } 1463 p.mPosition++; 1464 1465 if (info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) { // found word 1466 words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); 1467 frequencies.put(info.mOriginalAddress, info.mFrequency); 1468 if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams); 1469 } 1470 1471 if (p.mPosition == p.mNumOfCharGroup) { 1472 stack.pop(); 1473 } else { 1474 // the node has more groups. 1475 p.mAddress = buffer.position(); 1476 } 1477 1478 if (hasChildrenAddress(info.mChildrenAddress)) { 1479 Position childrenPos = new Position(info.mChildrenAddress + headerSize, index); 1480 stack.push(childrenPos); 1481 } 1482 } 1483 } 1484 1485 /** 1486 * Reads unigrams and bigrams from the binary file. 1487 * Doesn't make the memory representation of the dictionary. 1488 * 1489 * @param buffer the buffer to read. 1490 * @param words the map to store the address as a key and the word as a value. 1491 * @param frequencies the map to store the address as a key and the frequency as a value. 1492 * @param bigrams the map to store the address as a key and the list of address as a value. 1493 * @throws IOException 1494 * @throws UnsupportedFormatException 1495 */ 1496 public static void readUnigramsAndBigramsBinary(final FusionDictionaryBufferInterface buffer, 1497 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 1498 final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, 1499 UnsupportedFormatException { 1500 // Read header 1501 FormatOptions formatOptions = null; 1502 DictionaryOptions dictionaryOptions = null; 1503 final FileHeader header = readHeader(buffer); 1504 1505 readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams); 1506 } 1507 1508 /** 1509 * Helper function to get the binary format version from the header. 1510 * @throws IOException 1511 */ 1512 private static int getFormatVersion(final FusionDictionaryBufferInterface buffer) 1513 throws IOException { 1514 final int magic_v1 = buffer.readUnsignedShort(); 1515 if (VERSION_1_MAGIC_NUMBER == magic_v1) return buffer.readUnsignedByte(); 1516 final int magic_v2 = (magic_v1 << 16) + buffer.readUnsignedShort(); 1517 if (VERSION_2_MAGIC_NUMBER == magic_v2) return buffer.readUnsignedShort(); 1518 return NOT_A_VERSION_NUMBER; 1519 } 1520 1521 /** 1522 * Helper function to get and validate the binary format version. 1523 * @throws UnsupportedFormatException 1524 * @throws IOException 1525 */ 1526 private static int checkFormatVersion(final FusionDictionaryBufferInterface buffer) 1527 throws IOException, UnsupportedFormatException { 1528 final int version = getFormatVersion(buffer); 1529 if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) { 1530 throw new UnsupportedFormatException("This file has version " + version 1531 + ", but this implementation does not support versions above " 1532 + MAXIMUM_SUPPORTED_VERSION); 1533 } 1534 return version; 1535 } 1536 1537 /** 1538 * Reads a header from a buffer. 1539 * @param buffer the buffer to read. 1540 * @throws IOException 1541 * @throws UnsupportedFormatException 1542 */ 1543 private static FileHeader readHeader(final FusionDictionaryBufferInterface buffer) 1544 throws IOException, UnsupportedFormatException { 1545 final int version = checkFormatVersion(buffer); 1546 final int optionsFlags = buffer.readUnsignedShort(); 1547 1548 final HashMap<String, String> attributes = new HashMap<String, String>(); 1549 final int headerSize; 1550 if (version < FIRST_VERSION_WITH_HEADER_SIZE) { 1551 headerSize = buffer.position(); 1552 } else { 1553 headerSize = buffer.readInt(); 1554 populateOptions(buffer, headerSize, attributes); 1555 buffer.position(headerSize); 1556 } 1557 1558 if (headerSize < 0) { 1559 throw new UnsupportedFormatException("header size can't be negative."); 1560 } 1561 1562 final FileHeader header = new FileHeader(headerSize, 1563 new FusionDictionary.DictionaryOptions(attributes, 1564 0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG), 1565 0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)), 1566 new FormatOptions(version)); 1567 return header; 1568 } 1569 1570 /** 1571 * Reads options from a buffer and populate a map with their contents. 1572 * 1573 * The buffer is read at the current position, so the caller must take care the pointer 1574 * is in the right place before calling this. 1575 */ 1576 public static void populateOptions(final FusionDictionaryBufferInterface buffer, 1577 final int headerSize, final HashMap<String, String> options) { 1578 while (buffer.position() < headerSize) { 1579 final String key = CharEncoding.readString(buffer); 1580 final String value = CharEncoding.readString(buffer); 1581 options.put(key, value); 1582 } 1583 } 1584 // TODO: remove this method. 1585 public static void populateOptions(final ByteBuffer buffer, final int headerSize, 1586 final HashMap<String, String> options) { 1587 populateOptions(new ByteBufferWrapper(buffer), headerSize, options); 1588 } 1589 1590 /** 1591 * Reads a buffer and returns the memory representation of the dictionary. 1592 * 1593 * This high-level method takes a buffer and reads its contents, populating a 1594 * FusionDictionary structure. The optional dict argument is an existing dictionary to 1595 * which words from the buffer should be added. If it is null, a new dictionary is created. 1596 * 1597 * @param buffer the buffer to read. 1598 * @param dict an optional dictionary to add words to, or null. 1599 * @return the created (or merged) dictionary. 1600 */ 1601 public static FusionDictionary readDictionaryBinary( 1602 final FusionDictionaryBufferInterface buffer, final FusionDictionary dict) 1603 throws IOException, UnsupportedFormatException { 1604 // clear cache 1605 wordCache.clear(); 1606 1607 // Read header 1608 final FileHeader header = readHeader(buffer); 1609 1610 Map<Integer, Node> reverseNodeMapping = new TreeMap<Integer, Node>(); 1611 Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>(); 1612 final Node root = readNode(buffer, header.mHeaderSize, reverseNodeMapping, 1613 reverseGroupMapping); 1614 1615 FusionDictionary newDict = new FusionDictionary(root, header.mDictionaryOptions); 1616 if (null != dict) { 1617 for (final Word w : dict) { 1618 if (w.mIsBlacklistEntry) { 1619 newDict.addBlacklistEntry(w.mWord, w.mShortcutTargets, w.mIsNotAWord); 1620 } else { 1621 newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mIsNotAWord); 1622 } 1623 } 1624 for (final Word w : dict) { 1625 // By construction a binary dictionary may not have bigrams pointing to 1626 // words that are not also registered as unigrams so we don't have to avoid 1627 // them explicitly here. 1628 for (final WeightedString bigram : w.mBigrams) { 1629 newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency); 1630 } 1631 } 1632 } 1633 1634 return newDict; 1635 } 1636 1637 // TODO: remove this method. 1638 public static FusionDictionary readDictionaryBinary(final ByteBuffer buffer, 1639 final FusionDictionary dict) throws IOException, UnsupportedFormatException { 1640 return readDictionaryBinary(new ByteBufferWrapper(buffer), dict); 1641 } 1642 1643 /** 1644 * Basic test to find out whether the file is a binary dictionary or not. 1645 * 1646 * Concretely this only tests the magic number. 1647 * 1648 * @param filename The name of the file to test. 1649 * @return true if it's a binary dictionary, false otherwise 1650 */ 1651 public static boolean isBinaryDictionary(final String filename) { 1652 FileInputStream inStream = null; 1653 try { 1654 final File file = new File(filename); 1655 inStream = new FileInputStream(file); 1656 final ByteBuffer buffer = inStream.getChannel().map( 1657 FileChannel.MapMode.READ_ONLY, 0, file.length()); 1658 final int version = getFormatVersion(new ByteBufferWrapper(buffer)); 1659 return (version >= MINIMUM_SUPPORTED_VERSION && version <= MAXIMUM_SUPPORTED_VERSION); 1660 } catch (FileNotFoundException e) { 1661 return false; 1662 } catch (IOException e) { 1663 return false; 1664 } finally { 1665 if (inStream != null) { 1666 try { 1667 inStream.close(); 1668 } catch (IOException e) { 1669 // do nothing 1670 } 1671 } 1672 } 1673 } 1674 1675 /** 1676 * Calculate bigram frequency from compressed value 1677 * 1678 * @see #makeBigramFlags 1679 * 1680 * @param unigramFrequency 1681 * @param bigramFrequency compressed frequency 1682 * @return approximate bigram frequency 1683 */ 1684 public static int reconstructBigramFrequency(final int unigramFrequency, 1685 final int bigramFrequency) { 1686 final float stepSize = (MAX_TERMINAL_FREQUENCY - unigramFrequency) 1687 / (1.5f + MAX_BIGRAM_FREQUENCY); 1688 final float resultFreqFloat = (float)unigramFrequency 1689 + stepSize * (bigramFrequency + 1.0f); 1690 return (int)resultFreqFloat; 1691 } 1692} 1693