BinaryDictInputOutput.java revision 752996540ff3a6dd5b48819849c06355c4270e03
1/* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17package com.android.inputmethod.latin.makedict; 18 19import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; 20import com.android.inputmethod.latin.makedict.FusionDictionary.Node; 21import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; 22 23import java.io.FileNotFoundException; 24import java.io.IOException; 25import java.io.OutputStream; 26import java.io.RandomAccessFile; 27import java.util.ArrayList; 28import java.util.Arrays; 29import java.util.Iterator; 30import java.util.Map; 31import java.util.TreeMap; 32 33/** 34 * Reads and writes XML files for a FusionDictionary. 35 * 36 * All the methods in this class are static. 37 */ 38public class BinaryDictInputOutput { 39 40 /* Node layout is as follows: 41 * | addressType xx : mask with MASK_GROUP_ADDRESS_TYPE 42 * 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS 43 * f | 01 = 1 byte : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE 44 * l | 10 = 2 bytes : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES 45 * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES 46 * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS 47 * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL 48 * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS 49 * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS 50 * 51 * c | IF FLAG_HAS_MULTIPLE_CHARS 52 * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers 53 * a | end 1 byte, = 0 54 * r | ELSE 55 * s | char 1 or 3 bytes 56 * | END 57 * 58 * f | 59 * r | IF FLAG_IS_TERMINAL 60 * e | frequency 1 byte 61 * q | 62 * 63 * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType 64 * h | // nothing 65 * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType 66 * l | children address, 1 byte 67 * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType 68 * r | children address, 2 bytes 69 * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType 70 * n | children address, 3 bytes 71 * A | END 72 * d 73 * dress 74 * 75 * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS 76 * | shortcut string list 77 * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS 78 * | bigrams address list 79 * 80 * Char format is: 81 * 1 byte = bbbbbbbb match 82 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 83 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 84 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 85 * 00011111 would be outside unicode. 86 * else: iso-latin-1 code 87 * This allows for the whole unicode range to be encoded, including chars outside of 88 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 89 * characters which should never happen anyway (and still work, but take 3 bytes). 90 * 91 * bigram address list is: 92 * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT 93 * | addressSign = 1 bit, : FLAG_ATTRIBUTE_OFFSET_NEGATIVE 94 * | 1 = must take -address, 0 = must take +address 95 * | xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE 96 * | addressFormat = 2 bits, 00 = unused : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE 97 * | 01 = 1 byte : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE 98 * | 10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES 99 * | 11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES 100 * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY 101 * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat) 102 * | read 1 byte, add top 4 bits 103 * | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat) 104 * | read 2 bytes, add top 4 bits 105 * | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat 106 * | read 3 bytes, add top 4 bits 107 * | END 108 * | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address 109 * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is 110 * 111 * shortcut string list is: 112 * <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes. 113 * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT 114 * | reserved = 3 bits, must be 0 115 * | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY 116 * <shortcut> = | string of characters at the char format described above, with the terminator 117 * | used to signal the end of the string. 118 * if (FLAG_ATTRIBUTE_HAS_NEXT goto flags 119 */ 120 121 private static final int VERSION_1_MAGIC_NUMBER = 0x78B1; 122 private static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; 123 private static final int MINIMUM_SUPPORTED_VERSION = 1; 124 private static final int MAXIMUM_SUPPORTED_VERSION = 2; 125 private static final int NOT_A_VERSION_NUMBER = -1; 126 private static final int FIRST_VERSION_WITH_HEADER_SIZE = 2; 127 128 // No options yet, reserved for future use. 129 private static final int OPTIONS = 0; 130 131 // TODO: Make this value adaptative to content data, store it in the header, and 132 // use it in the reading code. 133 private static final int MAX_WORD_LENGTH = 48; 134 135 private static final int MASK_GROUP_ADDRESS_TYPE = 0xC0; 136 private static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; 137 private static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; 138 private static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; 139 private static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; 140 141 private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20; 142 143 private static final int FLAG_IS_TERMINAL = 0x10; 144 private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08; 145 private static final int FLAG_HAS_BIGRAMS = 0x04; 146 147 private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; 148 private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; 149 private static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; 150 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; 151 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; 152 private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; 153 private static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F; 154 155 private static final int GROUP_CHARACTERS_TERMINATOR = 0x1F; 156 157 private static final int GROUP_TERMINATOR_SIZE = 1; 158 private static final int GROUP_FLAGS_SIZE = 1; 159 private static final int GROUP_FREQUENCY_SIZE = 1; 160 private static final int GROUP_MAX_ADDRESS_SIZE = 3; 161 private static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1; 162 private static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3; 163 private static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2; 164 165 private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE; 166 private static final int INVALID_CHARACTER = -1; 167 168 private static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127 169 private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767 170 171 private static final int MAX_TERMINAL_FREQUENCY = 255; 172 173 /** 174 * A class grouping utility function for our specific character encoding. 175 */ 176 private static class CharEncoding { 177 178 private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; 179 private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF; 180 181 /** 182 * Helper method to find out whether this code fits on one byte 183 */ 184 private static boolean fitsOnOneByte(int character) { 185 return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE 186 && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE; 187 } 188 189 /** 190 * Compute the size of a character given its character code. 191 * 192 * Char format is: 193 * 1 byte = bbbbbbbb match 194 * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte 195 * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because 196 * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with 197 * 00011111 would be outside unicode. 198 * else: iso-latin-1 code 199 * This allows for the whole unicode range to be encoded, including chars outside of 200 * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control 201 * characters which should never happen anyway (and still work, but take 3 bytes). 202 * 203 * @param character the character code. 204 * @return the size in binary encoded-form, either 1 or 3 bytes. 205 */ 206 private static int getCharSize(int character) { 207 // See char encoding in FusionDictionary.java 208 if (fitsOnOneByte(character)) return 1; 209 if (INVALID_CHARACTER == character) return 1; 210 return 3; 211 } 212 213 /** 214 * Compute the byte size of a character array. 215 */ 216 private static int getCharArraySize(final int[] chars) { 217 int size = 0; 218 for (int character : chars) size += getCharSize(character); 219 return size; 220 } 221 222 /** 223 * Writes a char array to a byte buffer. 224 * 225 * @param codePoints the code point array to write. 226 * @param buffer the byte buffer to write to. 227 * @param index the index in buffer to write the character array to. 228 * @return the index after the last character. 229 */ 230 private static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) { 231 for (int codePoint : codePoints) { 232 if (1 == getCharSize(codePoint)) { 233 buffer[index++] = (byte)codePoint; 234 } else { 235 buffer[index++] = (byte)(0xFF & (codePoint >> 16)); 236 buffer[index++] = (byte)(0xFF & (codePoint >> 8)); 237 buffer[index++] = (byte)(0xFF & codePoint); 238 } 239 } 240 return index; 241 } 242 243 /** 244 * Writes a string with our character format to a byte buffer. 245 * 246 * This will also write the terminator byte. 247 * 248 * @param buffer the byte buffer to write to. 249 * @param origin the offset to write from. 250 * @param word the string to write. 251 * @return the size written, in bytes. 252 */ 253 private static int writeString(final byte[] buffer, final int origin, 254 final String word) { 255 final int length = word.length(); 256 int index = origin; 257 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 258 final int codePoint = word.codePointAt(i); 259 if (1 == getCharSize(codePoint)) { 260 buffer[index++] = (byte)codePoint; 261 } else { 262 buffer[index++] = (byte)(0xFF & (codePoint >> 16)); 263 buffer[index++] = (byte)(0xFF & (codePoint >> 8)); 264 buffer[index++] = (byte)(0xFF & codePoint); 265 } 266 } 267 buffer[index++] = GROUP_CHARACTERS_TERMINATOR; 268 return index - origin; 269 } 270 271 /** 272 * Reads a string from a RandomAccessFile. This is the converse of the above method. 273 */ 274 private static String readString(final RandomAccessFile source) throws IOException { 275 final StringBuilder s = new StringBuilder(); 276 int character = readChar(source); 277 while (character != INVALID_CHARACTER) { 278 s.appendCodePoint(character); 279 character = readChar(source); 280 } 281 return s.toString(); 282 } 283 284 /** 285 * Reads a character from the file. 286 * 287 * This follows the character format documented earlier in this source file. 288 * 289 * @param source the file, positioned over an encoded character. 290 * @return the character code. 291 */ 292 private static int readChar(RandomAccessFile source) throws IOException { 293 int character = source.readUnsignedByte(); 294 if (!fitsOnOneByte(character)) { 295 if (GROUP_CHARACTERS_TERMINATOR == character) 296 return INVALID_CHARACTER; 297 character <<= 16; 298 character += source.readUnsignedShort(); 299 } 300 return character; 301 } 302 } 303 304 /** 305 * Compute the binary size of the character array in a group 306 * 307 * If only one character, this is the size of this character. If many, it's the sum of their 308 * sizes + 1 byte for the terminator. 309 * 310 * @param group the group 311 * @return the size of the char array, including the terminator if any 312 */ 313 private static int getGroupCharactersSize(CharGroup group) { 314 int size = CharEncoding.getCharArraySize(group.mChars); 315 if (group.hasSeveralChars()) size += GROUP_TERMINATOR_SIZE; 316 return size; 317 } 318 319 /** 320 * Compute the binary size of the group count 321 * @param count the group count 322 * @return the size of the group count, either 1 or 2 bytes. 323 */ 324 private static int getGroupCountSize(final int count) { 325 if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) { 326 return 1; 327 } else if (MAX_CHARGROUPS_IN_A_NODE >= count) { 328 return 2; 329 } else { 330 throw new RuntimeException("Can't have more than " + MAX_CHARGROUPS_IN_A_NODE 331 + " groups in a node (found " + count +")"); 332 } 333 } 334 335 /** 336 * Compute the binary size of the group count for a node 337 * @param node the node 338 * @return the size of the group count, either 1 or 2 bytes. 339 */ 340 private static int getGroupCountSize(final Node node) { 341 return getGroupCountSize(node.mData.size()); 342 } 343 344 /** 345 * Compute the size of a shortcut in bytes. 346 */ 347 private static int getShortcutSize(final WeightedString shortcut) { 348 int size = GROUP_ATTRIBUTE_FLAGS_SIZE; 349 final String word = shortcut.mWord; 350 final int length = word.length(); 351 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 352 final int codePoint = word.codePointAt(i); 353 size += CharEncoding.getCharSize(codePoint); 354 } 355 size += GROUP_TERMINATOR_SIZE; 356 return size; 357 } 358 359 /** 360 * Compute the size of a shortcut list in bytes. 361 * 362 * This is known in advance and does not change according to position in the file 363 * like address lists do. 364 */ 365 private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { 366 if (null == shortcutList) return 0; 367 int size = GROUP_SHORTCUT_LIST_SIZE_SIZE; 368 for (final WeightedString shortcut : shortcutList) { 369 size += getShortcutSize(shortcut); 370 } 371 return size; 372 } 373 374 /** 375 * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything. 376 * 377 * @param group the CharGroup to compute the size of. 378 * @return the maximum size of the group. 379 */ 380 private static int getCharGroupMaximumSize(CharGroup group) { 381 int size = getGroupCharactersSize(group) + GROUP_FLAGS_SIZE; 382 // If terminal, one byte for the frequency 383 if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE; 384 size += GROUP_MAX_ADDRESS_SIZE; // For children address 385 size += getShortcutListSize(group.mShortcutTargets); 386 if (null != group.mBigrams) { 387 size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) 388 * group.mBigrams.size(); 389 } 390 return size; 391 } 392 393 /** 394 * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches 395 * it in the 'actualSize' member of the node. 396 * 397 * @param node the node to compute the maximum size of. 398 */ 399 private static void setNodeMaximumSize(Node node) { 400 int size = getGroupCountSize(node); 401 for (CharGroup g : node.mData) { 402 final int groupSize = getCharGroupMaximumSize(g); 403 g.mCachedSize = groupSize; 404 size += groupSize; 405 } 406 node.mCachedSize = size; 407 } 408 409 /** 410 * Helper method to hide the actual value of the no children address. 411 */ 412 private static boolean hasChildrenAddress(int address) { 413 return NO_CHILDREN_ADDRESS != address; 414 } 415 416 /** 417 * Compute the size, in bytes, that an address will occupy. 418 * 419 * This can be used either for children addresses (which are always positive) or for 420 * attribute, which may be positive or negative but 421 * store their sign bit separately. 422 * 423 * @param address the address 424 * @return the byte size. 425 */ 426 private static int getByteSize(int address) { 427 assert(address < 0x1000000); 428 if (!hasChildrenAddress(address)) { 429 return 0; 430 } else if (Math.abs(address) < 0x100) { 431 return 1; 432 } else if (Math.abs(address) < 0x10000) { 433 return 2; 434 } else { 435 return 3; 436 } 437 } 438 // End utility methods. 439 440 // This method is responsible for finding a nice ordering of the nodes that favors run-time 441 // cache performance and dictionary size. 442 /* package for tests */ static ArrayList<Node> flattenTree(Node root) { 443 final int treeSize = FusionDictionary.countCharGroups(root); 444 MakedictLog.i("Counted nodes : " + treeSize); 445 final ArrayList<Node> flatTree = new ArrayList<Node>(treeSize); 446 return flattenTreeInner(flatTree, root); 447 } 448 449 private static ArrayList<Node> flattenTreeInner(ArrayList<Node> list, Node node) { 450 // Removing the node is necessary if the tails are merged, because we would then 451 // add the same node several times when we only want it once. A number of places in 452 // the code also depends on any node being only once in the list. 453 // Merging tails can only be done if there are no attributes. Searching for attributes 454 // in LatinIME code depends on a total breadth-first ordering, which merging tails 455 // breaks. If there are no attributes, it should be fine (and reduce the file size) 456 // to merge tails, and the following step would be necessary. 457 // If eventually the code runs on Android, searching through the whole array each time 458 // may be a performance concern. 459 list.remove(node); 460 list.add(node); 461 final ArrayList<CharGroup> branches = node.mData; 462 final int nodeSize = branches.size(); 463 for (CharGroup group : branches) { 464 if (null != group.mChildren) flattenTreeInner(list, group.mChildren); 465 } 466 return list; 467 } 468 469 /** 470 * Finds the absolute address of a word in the dictionary. 471 * 472 * @param dict the dictionary in which to search. 473 * @param word the word we are searching for. 474 * @return the word address. If it is not found, an exception is thrown. 475 */ 476 private static int findAddressOfWord(final FusionDictionary dict, final String word) { 477 return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress; 478 } 479 480 /** 481 * Computes the actual node size, based on the cached addresses of the children nodes. 482 * 483 * Each node stores its tentative address. During dictionary address computing, these 484 * are not final, but they can be used to compute the node size (the node size depends 485 * on the address of the children because the number of bytes necessary to store an 486 * address depends on its numeric value. 487 * 488 * @param node the node to compute the size of. 489 * @param dict the dictionary in which the word/attributes are to be found. 490 */ 491 private static void computeActualNodeSize(Node node, FusionDictionary dict) { 492 int size = getGroupCountSize(node); 493 for (CharGroup group : node.mData) { 494 int groupSize = GROUP_FLAGS_SIZE + getGroupCharactersSize(group); 495 if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE; 496 if (null != group.mChildren) { 497 final int offsetBasePoint= groupSize + node.mCachedAddress + size; 498 final int offset = group.mChildren.mCachedAddress - offsetBasePoint; 499 groupSize += getByteSize(offset); 500 } 501 groupSize += getShortcutListSize(group.mShortcutTargets); 502 if (null != group.mBigrams) { 503 for (WeightedString bigram : group.mBigrams) { 504 final int offsetBasePoint = groupSize + node.mCachedAddress + size 505 + GROUP_FLAGS_SIZE; 506 final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); 507 final int offset = addressOfBigram - offsetBasePoint; 508 groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE; 509 } 510 } 511 group.mCachedSize = groupSize; 512 size += groupSize; 513 } 514 node.mCachedSize = size; 515 } 516 517 /** 518 * Computes the byte size of a list of nodes and updates each node cached position. 519 * 520 * @param flatNodes the array of nodes. 521 * @return the byte size of the entire stack. 522 */ 523 private static int stackNodes(ArrayList<Node> flatNodes) { 524 int nodeOffset = 0; 525 for (Node n : flatNodes) { 526 n.mCachedAddress = nodeOffset; 527 int groupCountSize = getGroupCountSize(n); 528 int groupOffset = 0; 529 for (CharGroup g : n.mData) { 530 g.mCachedAddress = groupCountSize + nodeOffset + groupOffset; 531 groupOffset += g.mCachedSize; 532 } 533 if (groupOffset + groupCountSize != n.mCachedSize) { 534 throw new RuntimeException("Bug : Stored and computed node size differ"); 535 } 536 nodeOffset += n.mCachedSize; 537 } 538 return nodeOffset; 539 } 540 541 /** 542 * Compute the addresses and sizes of an ordered node array. 543 * 544 * This method takes a node array and will update its cached address and size values 545 * so that they can be written into a file. It determines the smallest size each of the 546 * nodes can be given the addresses of its children and attributes, and store that into 547 * each node. 548 * The order of the node is given by the order of the array. This method makes no effort 549 * to find a good order; it only mechanically computes the size this order results in. 550 * 551 * @param dict the dictionary 552 * @param flatNodes the ordered array of nodes 553 * @return the same array it was passed. The nodes have been updated for address and size. 554 */ 555 private static ArrayList<Node> computeAddresses(FusionDictionary dict, 556 ArrayList<Node> flatNodes) { 557 // First get the worst sizes and offsets 558 for (Node n : flatNodes) setNodeMaximumSize(n); 559 final int offset = stackNodes(flatNodes); 560 561 MakedictLog.i("Compressing the array addresses. Original size : " + offset); 562 MakedictLog.i("(Recursively seen size : " + offset + ")"); 563 564 int passes = 0; 565 boolean changesDone = false; 566 do { 567 changesDone = false; 568 for (Node n : flatNodes) { 569 final int oldNodeSize = n.mCachedSize; 570 computeActualNodeSize(n, dict); 571 final int newNodeSize = n.mCachedSize; 572 if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!"); 573 if (oldNodeSize != newNodeSize) changesDone = true; 574 } 575 stackNodes(flatNodes); 576 ++passes; 577 } while (changesDone); 578 579 final Node lastNode = flatNodes.get(flatNodes.size() - 1); 580 MakedictLog.i("Compression complete in " + passes + " passes."); 581 MakedictLog.i("After address compression : " 582 + (lastNode.mCachedAddress + lastNode.mCachedSize)); 583 584 return flatNodes; 585 } 586 587 /** 588 * Sanity-checking method. 589 * 590 * This method checks an array of node for juxtaposition, that is, it will do 591 * nothing if each node's cached address is actually the previous node's address 592 * plus the previous node's size. 593 * If this is not the case, it will throw an exception. 594 * 595 * @param array the array node to check 596 */ 597 private static void checkFlatNodeArray(ArrayList<Node> array) { 598 int offset = 0; 599 int index = 0; 600 for (Node n : array) { 601 if (n.mCachedAddress != offset) { 602 throw new RuntimeException("Wrong address for node " + index 603 + " : expected " + offset + ", got " + n.mCachedAddress); 604 } 605 ++index; 606 offset += n.mCachedSize; 607 } 608 } 609 610 /** 611 * Helper method to write a variable-size address to a file. 612 * 613 * @param buffer the buffer to write to. 614 * @param index the index in the buffer to write the address to. 615 * @param address the address to write. 616 * @return the size in bytes the address actually took. 617 */ 618 private static int writeVariableAddress(final byte[] buffer, int index, final int address) { 619 switch (getByteSize(address)) { 620 case 1: 621 buffer[index++] = (byte)address; 622 return 1; 623 case 2: 624 buffer[index++] = (byte)(0xFF & (address >> 8)); 625 buffer[index++] = (byte)(0xFF & address); 626 return 2; 627 case 3: 628 buffer[index++] = (byte)(0xFF & (address >> 16)); 629 buffer[index++] = (byte)(0xFF & (address >> 8)); 630 buffer[index++] = (byte)(0xFF & address); 631 return 3; 632 case 0: 633 return 0; 634 default: 635 throw new RuntimeException("Address " + address + " has a strange size"); 636 } 637 } 638 639 private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress, 640 final int childrenOffset) { 641 byte flags = 0; 642 if (group.mChars.length > 1) flags |= FLAG_HAS_MULTIPLE_CHARS; 643 if (group.mFrequency >= 0) { 644 flags |= FLAG_IS_TERMINAL; 645 } 646 if (null != group.mChildren) { 647 switch (getByteSize(childrenOffset)) { 648 case 1: 649 flags |= FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; 650 break; 651 case 2: 652 flags |= FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; 653 break; 654 case 3: 655 flags |= FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; 656 break; 657 default: 658 throw new RuntimeException("Node with a strange address"); 659 } 660 } 661 if (null != group.mShortcutTargets) { 662 if (0 == group.mShortcutTargets.size()) { 663 throw new RuntimeException("0-sized shortcut list must be null"); 664 } 665 flags |= FLAG_HAS_SHORTCUT_TARGETS; 666 } 667 if (null != group.mBigrams) { 668 if (0 == group.mBigrams.size()) { 669 throw new RuntimeException("0-sized bigram list must be null"); 670 } 671 flags |= FLAG_HAS_BIGRAMS; 672 } 673 return flags; 674 } 675 676 /** 677 * Makes the flag value for an attribute. 678 * 679 * @param more whether there are more attributes after this one. 680 * @param offset the offset of the attribute. 681 * @param frequency the frequency of the attribute, 0..15 682 * @return the flags 683 */ 684 private static final int makeAttributeFlags(final boolean more, final int offset, 685 final int frequency) { 686 int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) 687 + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0); 688 switch (getByteSize(offset)) { 689 case 1: 690 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; 691 break; 692 case 2: 693 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; 694 break; 695 case 3: 696 bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; 697 break; 698 default: 699 throw new RuntimeException("Strange offset size"); 700 } 701 bigramFlags += frequency & FLAG_ATTRIBUTE_FREQUENCY; 702 return bigramFlags; 703 } 704 705 /** 706 * Makes the flag value for a shortcut. 707 * 708 * @param more whether there are more attributes after this one. 709 * @param frequency the frequency of the attribute, 0..15 710 * @return the flags 711 */ 712 private static final int makeShortcutFlags(final boolean more, final int frequency) { 713 return (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (frequency & FLAG_ATTRIBUTE_FREQUENCY); 714 } 715 716 /** 717 * Write a node to memory. The node is expected to have its final position cached. 718 * 719 * This can be an empty map, but the more is inside the faster the lookups will be. It can 720 * be carried on as long as nodes do not move. 721 * 722 * @param dict the dictionary the node is a part of (for relative offsets). 723 * @param buffer the memory buffer to write to. 724 * @param node the node to write. 725 * @return the address of the END of the node. 726 */ 727 private static int writePlacedNode(FusionDictionary dict, byte[] buffer, Node node) { 728 int index = node.mCachedAddress; 729 730 final int groupCount = node.mData.size(); 731 final int countSize = getGroupCountSize(node); 732 if (1 == countSize) { 733 buffer[index++] = (byte)groupCount; 734 } else if (2 == countSize) { 735 // We need to signal 2-byte size by setting the top bit of the MSB to 1, so 736 // we | 0x80 to do this. 737 buffer[index++] = (byte)((groupCount >> 8) | 0x80); 738 buffer[index++] = (byte)(groupCount & 0xFF); 739 } else { 740 throw new RuntimeException("Strange size from getGroupCountSize : " + countSize); 741 } 742 int groupAddress = index; 743 for (int i = 0; i < groupCount; ++i) { 744 CharGroup group = node.mData.get(i); 745 if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not " 746 + "the same as the cached address of the group : " 747 + index + " <> " + group.mCachedAddress); 748 groupAddress += GROUP_FLAGS_SIZE + getGroupCharactersSize(group); 749 // Sanity checks. 750 if (group.mFrequency > MAX_TERMINAL_FREQUENCY) { 751 throw new RuntimeException("A node has a frequency > " + MAX_TERMINAL_FREQUENCY 752 + " : " + group.mFrequency); 753 } 754 if (group.mFrequency >= 0) groupAddress += GROUP_FREQUENCY_SIZE; 755 final int childrenOffset = null == group.mChildren 756 ? NO_CHILDREN_ADDRESS : group.mChildren.mCachedAddress - groupAddress; 757 byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset); 758 buffer[index++] = flags; 759 index = CharEncoding.writeCharArray(group.mChars, buffer, index); 760 if (group.hasSeveralChars()) { 761 buffer[index++] = GROUP_CHARACTERS_TERMINATOR; 762 } 763 if (group.mFrequency >= 0) { 764 buffer[index++] = (byte) group.mFrequency; 765 } 766 final int shift = writeVariableAddress(buffer, index, childrenOffset); 767 index += shift; 768 groupAddress += shift; 769 770 // Write shortcuts 771 if (null != group.mShortcutTargets) { 772 final int indexOfShortcutByteSize = index; 773 index += GROUP_SHORTCUT_LIST_SIZE_SIZE; 774 groupAddress += GROUP_SHORTCUT_LIST_SIZE_SIZE; 775 final Iterator shortcutIterator = group.mShortcutTargets.iterator(); 776 while (shortcutIterator.hasNext()) { 777 final WeightedString target = (WeightedString)shortcutIterator.next(); 778 ++groupAddress; 779 int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(), 780 target.mFrequency); 781 buffer[index++] = (byte)shortcutFlags; 782 final int shortcutShift = CharEncoding.writeString(buffer, index, target.mWord); 783 index += shortcutShift; 784 groupAddress += shortcutShift; 785 } 786 final int shortcutByteSize = index - indexOfShortcutByteSize; 787 if (shortcutByteSize > 0xFFFF) { 788 throw new RuntimeException("Shortcut list too large"); 789 } 790 buffer[indexOfShortcutByteSize] = (byte)(shortcutByteSize >> 8); 791 buffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF); 792 } 793 // Write bigrams 794 if (null != group.mBigrams) { 795 final Iterator bigramIterator = group.mBigrams.iterator(); 796 while (bigramIterator.hasNext()) { 797 final WeightedString bigram = (WeightedString)bigramIterator.next(); 798 final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); 799 ++groupAddress; 800 final int offset = addressOfBigram - groupAddress; 801 int bigramFlags = makeAttributeFlags(bigramIterator.hasNext(), offset, 802 bigram.mFrequency); 803 buffer[index++] = (byte)bigramFlags; 804 final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); 805 index += bigramShift; 806 groupAddress += bigramShift; 807 } 808 } 809 810 } 811 if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException( 812 "Not the same size : written " 813 + (index - node.mCachedAddress) + " bytes out of a node that should have " 814 + node.mCachedSize + " bytes"); 815 return index; 816 } 817 818 /** 819 * Dumps a collection of useful statistics about a node array. 820 * 821 * This prints purely informative stuff, like the total estimated file size, the 822 * number of nodes, of character groups, the repartition of each address size, etc 823 * 824 * @param nodes the node array. 825 */ 826 private static void showStatistics(ArrayList<Node> nodes) { 827 int firstTerminalAddress = Integer.MAX_VALUE; 828 int lastTerminalAddress = Integer.MIN_VALUE; 829 int size = 0; 830 int charGroups = 0; 831 int maxGroups = 0; 832 int maxRuns = 0; 833 for (Node n : nodes) { 834 if (maxGroups < n.mData.size()) maxGroups = n.mData.size(); 835 for (CharGroup cg : n.mData) { 836 ++charGroups; 837 if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length; 838 if (cg.mFrequency >= 0) { 839 if (n.mCachedAddress < firstTerminalAddress) 840 firstTerminalAddress = n.mCachedAddress; 841 if (n.mCachedAddress > lastTerminalAddress) 842 lastTerminalAddress = n.mCachedAddress; 843 } 844 } 845 if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize; 846 } 847 final int[] groupCounts = new int[maxGroups + 1]; 848 final int[] runCounts = new int[maxRuns + 1]; 849 for (Node n : nodes) { 850 ++groupCounts[n.mData.size()]; 851 for (CharGroup cg : n.mData) { 852 ++runCounts[cg.mChars.length]; 853 } 854 } 855 856 MakedictLog.i("Statistics:\n" 857 + " total file size " + size + "\n" 858 + " " + nodes.size() + " nodes\n" 859 + " " + charGroups + " groups (" + ((float)charGroups / nodes.size()) 860 + " groups per node)\n" 861 + " first terminal at " + firstTerminalAddress + "\n" 862 + " last terminal at " + lastTerminalAddress + "\n" 863 + " Group stats : max = " + maxGroups); 864 for (int i = 0; i < groupCounts.length; ++i) { 865 MakedictLog.i(" " + i + " : " + groupCounts[i]); 866 } 867 MakedictLog.i(" Character run stats : max = " + maxRuns); 868 for (int i = 0; i < runCounts.length; ++i) { 869 MakedictLog.i(" " + i + " : " + runCounts[i]); 870 } 871 } 872 873 /** 874 * Dumps a FusionDictionary to a file. 875 * 876 * This is the public entry point to write a dictionary to a file. 877 * 878 * @param destination the stream to write the binary data to. 879 * @param dict the dictionary to write. 880 * @param version the version of the format to write, currently either 1 or 2. 881 */ 882 public static void writeDictionaryBinary(final OutputStream destination, 883 final FusionDictionary dict, final int version) 884 throws IOException, UnsupportedFormatException { 885 886 // Addresses are limited to 3 bytes, so we'll just make a 16MB buffer. Since addresses 887 // can be relative to each node, the structure itself is not limited to 16MB at all, but 888 // I doubt this will ever be shot. If it is, deciding the order of the nodes becomes 889 // a quite complicated problem, because though the dictionary itself does not have a 890 // size limit, each node must still be within 16MB of all its children and parents. 891 // As long as this is ensured, the dictionary file may grow to any size. 892 // Anyway, to make a dictionary bigger than 16MB just increase the size of this buffer. 893 final byte[] buffer = new byte[1 << 24]; 894 int index = 0; 895 896 if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) { 897 throw new UnsupportedFormatException("Requested file format version " + version 898 + ", but this implementation only supports versions " 899 + MINIMUM_SUPPORTED_VERSION + " through " + MAXIMUM_SUPPORTED_VERSION); 900 } 901 902 // The magic number in big-endian order. 903 if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { 904 // Magic number for version 2+. 905 buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 24)); 906 buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 16)); 907 buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 8)); 908 buffer[index++] = (byte) (0xFF & VERSION_2_MAGIC_NUMBER); 909 // Dictionary version. 910 buffer[index++] = (byte) (0xFF & (version >> 8)); 911 buffer[index++] = (byte) (0xFF & version); 912 } else { 913 // Magic number for version 1. 914 buffer[index++] = (byte) (0xFF & (VERSION_1_MAGIC_NUMBER >> 8)); 915 buffer[index++] = (byte) (0xFF & VERSION_1_MAGIC_NUMBER); 916 // Dictionary version. 917 buffer[index++] = (byte) (0xFF & version); 918 } 919 // Options flags 920 buffer[index++] = (byte) (0xFF & (OPTIONS >> 8)); 921 buffer[index++] = (byte) (0xFF & OPTIONS); 922 if (version >= FIRST_VERSION_WITH_HEADER_SIZE) { 923 final int headerSizeOffset = index; 924 index += 4; // Size of the header size 925 // TODO: Write out the header contents here. 926 // Write out the header size. 927 buffer[headerSizeOffset] = (byte) (0xFF & (index >> 24)); 928 buffer[headerSizeOffset + 1] = (byte) (0xFF & (index >> 16)); 929 buffer[headerSizeOffset + 2] = (byte) (0xFF & (index >> 8)); 930 buffer[headerSizeOffset + 3] = (byte) (0xFF & (index >> 0)); 931 } 932 933 destination.write(buffer, 0, index); 934 index = 0; 935 936 // Leave the choice of the optimal node order to the flattenTree function. 937 MakedictLog.i("Flattening the tree..."); 938 ArrayList<Node> flatNodes = flattenTree(dict.mRoot); 939 940 MakedictLog.i("Computing addresses..."); 941 computeAddresses(dict, flatNodes); 942 MakedictLog.i("Checking array..."); 943 checkFlatNodeArray(flatNodes); 944 945 MakedictLog.i("Writing file..."); 946 int dataEndOffset = 0; 947 for (Node n : flatNodes) { 948 dataEndOffset = writePlacedNode(dict, buffer, n); 949 } 950 951 showStatistics(flatNodes); 952 953 destination.write(buffer, 0, dataEndOffset); 954 955 destination.close(); 956 MakedictLog.i("Done"); 957 } 958 959 960 // Input methods: Read a binary dictionary to memory. 961 // readDictionaryBinary is the public entry point for them. 962 963 static final int[] characterBuffer = new int[MAX_WORD_LENGTH]; 964 private static CharGroupInfo readCharGroup(RandomAccessFile source, 965 final int originalGroupAddress) throws IOException { 966 int addressPointer = originalGroupAddress; 967 final int flags = source.readUnsignedByte(); 968 ++addressPointer; 969 final int characters[]; 970 if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) { 971 int index = 0; 972 int character = CharEncoding.readChar(source); 973 addressPointer += CharEncoding.getCharSize(character); 974 while (-1 != character) { 975 characterBuffer[index++] = character; 976 character = CharEncoding.readChar(source); 977 addressPointer += CharEncoding.getCharSize(character); 978 } 979 characters = Arrays.copyOfRange(characterBuffer, 0, index); 980 } else { 981 final int character = CharEncoding.readChar(source); 982 addressPointer += CharEncoding.getCharSize(character); 983 characters = new int[] { character }; 984 } 985 final int frequency; 986 if (0 != (FLAG_IS_TERMINAL & flags)) { 987 ++addressPointer; 988 frequency = source.readUnsignedByte(); 989 } else { 990 frequency = CharGroup.NOT_A_TERMINAL; 991 } 992 int childrenAddress = addressPointer; 993 switch (flags & MASK_GROUP_ADDRESS_TYPE) { 994 case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: 995 childrenAddress += source.readUnsignedByte(); 996 addressPointer += 1; 997 break; 998 case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: 999 childrenAddress += source.readUnsignedShort(); 1000 addressPointer += 2; 1001 break; 1002 case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: 1003 childrenAddress += (source.readUnsignedByte() << 16) + source.readUnsignedShort(); 1004 addressPointer += 3; 1005 break; 1006 case FLAG_GROUP_ADDRESS_TYPE_NOADDRESS: 1007 default: 1008 childrenAddress = NO_CHILDREN_ADDRESS; 1009 break; 1010 } 1011 ArrayList<WeightedString> shortcutTargets = null; 1012 if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) { 1013 final long pointerBefore = source.getFilePointer(); 1014 shortcutTargets = new ArrayList<WeightedString>(); 1015 source.readUnsignedShort(); // Skip the size 1016 while (true) { 1017 final int targetFlags = source.readUnsignedByte(); 1018 final String word = CharEncoding.readString(source); 1019 shortcutTargets.add(new WeightedString(word, 1020 targetFlags & FLAG_ATTRIBUTE_FREQUENCY)); 1021 if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; 1022 } 1023 addressPointer += (source.getFilePointer() - pointerBefore); 1024 } 1025 ArrayList<PendingAttribute> bigrams = null; 1026 if (0 != (flags & FLAG_HAS_BIGRAMS)) { 1027 bigrams = new ArrayList<PendingAttribute>(); 1028 while (true) { 1029 final int bigramFlags = source.readUnsignedByte(); 1030 ++addressPointer; 1031 final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1; 1032 int bigramAddress = addressPointer; 1033 switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { 1034 case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: 1035 bigramAddress += sign * source.readUnsignedByte(); 1036 addressPointer += 1; 1037 break; 1038 case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: 1039 bigramAddress += sign * source.readUnsignedShort(); 1040 addressPointer += 2; 1041 break; 1042 case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: 1043 final int offset = ((source.readUnsignedByte() << 16) 1044 + source.readUnsignedShort()); 1045 bigramAddress += sign * offset; 1046 addressPointer += 3; 1047 break; 1048 default: 1049 throw new RuntimeException("Has bigrams with no address"); 1050 } 1051 bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY, 1052 bigramAddress)); 1053 if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; 1054 } 1055 } 1056 return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, 1057 childrenAddress, shortcutTargets, bigrams); 1058 } 1059 1060 /** 1061 * Reads and returns the char group count out of a file and forwards the pointer. 1062 */ 1063 private static int readCharGroupCount(RandomAccessFile source) throws IOException { 1064 final int msb = source.readUnsignedByte(); 1065 if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) { 1066 return msb; 1067 } else { 1068 return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8) 1069 + source.readUnsignedByte(); 1070 } 1071 } 1072 1073 /** 1074 * Finds, as a string, the word at the address passed as an argument. 1075 * 1076 * @param source the file to read from. 1077 * @param headerSize the size of the header. 1078 * @param address the address to seek. 1079 * @return the word, as a string. 1080 * @throws IOException if the file can't be read. 1081 */ 1082 private static String getWordAtAddress(RandomAccessFile source, long headerSize, 1083 int address) throws IOException { 1084 final long originalPointer = source.getFilePointer(); 1085 source.seek(headerSize); 1086 final int count = readCharGroupCount(source); 1087 int groupOffset = getGroupCountSize(count); 1088 final StringBuilder builder = new StringBuilder(); 1089 String result = null; 1090 1091 CharGroupInfo last = null; 1092 for (int i = count - 1; i >= 0; --i) { 1093 CharGroupInfo info = readCharGroup(source, groupOffset); 1094 groupOffset = info.mEndAddress; 1095 if (info.mOriginalAddress == address) { 1096 builder.append(new String(info.mCharacters, 0, info.mCharacters.length)); 1097 result = builder.toString(); 1098 break; // and return 1099 } 1100 if (hasChildrenAddress(info.mChildrenAddress)) { 1101 if (info.mChildrenAddress > address) { 1102 if (null == last) continue; 1103 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 1104 source.seek(last.mChildrenAddress + headerSize); 1105 groupOffset = last.mChildrenAddress + 1; 1106 i = source.readUnsignedByte(); 1107 last = null; 1108 continue; 1109 } 1110 last = info; 1111 } 1112 if (0 == i && hasChildrenAddress(last.mChildrenAddress)) { 1113 builder.append(new String(last.mCharacters, 0, last.mCharacters.length)); 1114 source.seek(last.mChildrenAddress + headerSize); 1115 groupOffset = last.mChildrenAddress + 1; 1116 i = source.readUnsignedByte(); 1117 last = null; 1118 continue; 1119 } 1120 } 1121 source.seek(originalPointer); 1122 return result; 1123 } 1124 1125 /** 1126 * Reads a single node from a binary file. 1127 * 1128 * This methods reads the file at the current position of its file pointer. A node is 1129 * fully expected to start at the current position. 1130 * This will recursively read other nodes into the structure, populating the reverse 1131 * maps on the fly and using them to keep track of already read nodes. 1132 * 1133 * @param source the data file, correctly positioned at the start of a node. 1134 * @param headerSize the size, in bytes, of the file header. 1135 * @param reverseNodeMap a mapping from addresses to already read nodes. 1136 * @param reverseGroupMap a mapping from addresses to already read character groups. 1137 * @return the read node with all his children already read. 1138 */ 1139 private static Node readNode(RandomAccessFile source, long headerSize, 1140 Map<Integer, Node> reverseNodeMap, Map<Integer, CharGroup> reverseGroupMap) 1141 throws IOException { 1142 final int nodeOrigin = (int)(source.getFilePointer() - headerSize); 1143 final int count = readCharGroupCount(source); 1144 final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>(); 1145 int groupOffset = nodeOrigin + getGroupCountSize(count); 1146 for (int i = count; i > 0; --i) { 1147 CharGroupInfo info = readCharGroup(source, groupOffset); 1148 ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets; 1149 ArrayList<WeightedString> bigrams = null; 1150 if (null != info.mBigrams) { 1151 bigrams = new ArrayList<WeightedString>(); 1152 for (PendingAttribute bigram : info.mBigrams) { 1153 final String word = getWordAtAddress(source, headerSize, bigram.mAddress); 1154 bigrams.add(new WeightedString(word, bigram.mFrequency)); 1155 } 1156 } 1157 if (hasChildrenAddress(info.mChildrenAddress)) { 1158 Node children = reverseNodeMap.get(info.mChildrenAddress); 1159 if (null == children) { 1160 final long currentPosition = source.getFilePointer(); 1161 source.seek(info.mChildrenAddress + headerSize); 1162 children = readNode(source, headerSize, reverseNodeMap, reverseGroupMap); 1163 source.seek(currentPosition); 1164 } 1165 nodeContents.add( 1166 new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, 1167 children, false)); 1168 } else { 1169 nodeContents.add( 1170 new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, 1171 false)); 1172 } 1173 groupOffset = info.mEndAddress; 1174 } 1175 final Node node = new Node(nodeContents); 1176 node.mCachedAddress = nodeOrigin; 1177 reverseNodeMap.put(node.mCachedAddress, node); 1178 return node; 1179 } 1180 1181 /** 1182 * Helper function to get the binary format version from the header. 1183 */ 1184 private static int getFormatVersion(final RandomAccessFile source) throws IOException { 1185 final int magic_v1 = source.readUnsignedShort(); 1186 if (VERSION_1_MAGIC_NUMBER == magic_v1) return source.readUnsignedByte(); 1187 final int magic_v2 = (magic_v1 << 16) + source.readUnsignedShort(); 1188 if (VERSION_2_MAGIC_NUMBER == magic_v2) return source.readUnsignedShort(); 1189 return NOT_A_VERSION_NUMBER; 1190 } 1191 1192 /** 1193 * Reads a random access file and returns the memory representation of the dictionary. 1194 * 1195 * This high-level method takes a binary file and reads its contents, populating a 1196 * FusionDictionary structure. The optional dict argument is an existing dictionary to 1197 * which words from the file should be added. If it is null, a new dictionary is created. 1198 * 1199 * @param source the file to read. 1200 * @param dict an optional dictionary to add words to, or null. 1201 * @return the created (or merged) dictionary. 1202 */ 1203 public static FusionDictionary readDictionaryBinary(final RandomAccessFile source, 1204 final FusionDictionary dict) throws IOException, UnsupportedFormatException { 1205 // Check file version 1206 final int version = getFormatVersion(source); 1207 if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION ) { 1208 throw new UnsupportedFormatException("This file has version " + version 1209 + ", but this implementation does not support versions above " 1210 + MAXIMUM_SUPPORTED_VERSION); 1211 } 1212 1213 // Read options 1214 source.readUnsignedShort(); 1215 1216 final long headerSize; 1217 if (version < FIRST_VERSION_WITH_HEADER_SIZE) { 1218 headerSize = source.getFilePointer(); 1219 } else { 1220 headerSize = (source.readUnsignedByte() << 24) + (source.readUnsignedByte() << 16) 1221 + (source.readUnsignedByte() << 8) + source.readUnsignedByte(); 1222 // read the header body 1223 source.seek(headerSize); 1224 } 1225 1226 Map<Integer, Node> reverseNodeMapping = new TreeMap<Integer, Node>(); 1227 Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>(); 1228 final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping); 1229 1230 FusionDictionary newDict = new FusionDictionary(root, 1231 new FusionDictionary.DictionaryOptions()); 1232 if (null != dict) { 1233 for (Word w : dict) { 1234 newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams); 1235 } 1236 } 1237 1238 return newDict; 1239 } 1240 1241 /** 1242 * Basic test to find out whether the file is a binary dictionary or not. 1243 * 1244 * Concretely this only tests the magic number. 1245 * 1246 * @param filename The name of the file to test. 1247 * @return true if it's a binary dictionary, false otherwise 1248 */ 1249 public static boolean isBinaryDictionary(final String filename) { 1250 try { 1251 RandomAccessFile f = new RandomAccessFile(filename, "r"); 1252 final int version = getFormatVersion(f); 1253 return (version >= MINIMUM_SUPPORTED_VERSION && version <= MAXIMUM_SUPPORTED_VERSION); 1254 } catch (FileNotFoundException e) { 1255 return false; 1256 } catch (IOException e) { 1257 return false; 1258 } 1259 } 1260} 1261