1/* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package com.android.inputmethod.latin.makedict; 18 19import com.android.inputmethod.annotations.UsedForTesting; 20import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; 21import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; 22import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; 23import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; 24import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; 25 26import java.io.ByteArrayOutputStream; 27import java.io.IOException; 28import java.io.OutputStream; 29import java.util.ArrayList; 30 31/** 32 * Encodes binary files for a FusionDictionary. 33 * 34 * All the methods in this class are static. 35 * 36 * TODO: Rename this class to DictEncoderUtils. 37 */ 38public class BinaryDictEncoderUtils { 39 40 private static final boolean DBG = MakedictLog.DBG; 41 42 private BinaryDictEncoderUtils() { 43 // This utility class is not publicly instantiable. 44 } 45 46 // Arbitrary limit to how much passes we consider address size compression should 47 // terminate in. At the time of this writing, our largest dictionary completes 48 // compression in five passes. 49 // If the number of passes exceeds this number, makedict bails with an exception on 50 // suspicion that a bug might be causing an infinite loop. 51 private static final int MAX_PASSES = 24; 52 53 /** 54 * Compute the binary size of the character array. 55 * 56 * If only one character, this is the size of this character. If many, it's the sum of their 57 * sizes + 1 byte for the terminator. 58 * 59 * @param characters the character array 60 * @return the size of the char array, including the terminator if any 61 */ 62 static int getPtNodeCharactersSize(final int[] characters) { 63 int size = CharEncoding.getCharArraySize(characters); 64 if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE; 65 return size; 66 } 67 68 /** 69 * Compute the binary size of the character array in a PtNode 70 * 71 * If only one character, this is the size of this character. If many, it's the sum of their 72 * sizes + 1 byte for the terminator. 73 * 74 * @param ptNode the PtNode 75 * @return the size of the char array, including the terminator if any 76 */ 77 private static int getPtNodeCharactersSize(final PtNode ptNode) { 78 return getPtNodeCharactersSize(ptNode.mChars); 79 } 80 81 /** 82 * Compute the binary size of the PtNode count for a node array. 83 * @param nodeArray the nodeArray 84 * @return the size of the PtNode count, either 1 or 2 bytes. 85 */ 86 private static int getPtNodeCountSize(final PtNodeArray nodeArray) { 87 return BinaryDictIOUtils.getPtNodeCountSize(nodeArray.mData.size()); 88 } 89 90 /** 91 * Compute the size of a shortcut in bytes. 92 */ 93 private static int getShortcutSize(final WeightedString shortcut) { 94 int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; 95 final String word = shortcut.mWord; 96 final int length = word.length(); 97 for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { 98 final int codePoint = word.codePointAt(i); 99 size += CharEncoding.getCharSize(codePoint); 100 } 101 size += FormatSpec.PTNODE_TERMINATOR_SIZE; 102 return size; 103 } 104 105 /** 106 * Compute the size of a shortcut list in bytes. 107 * 108 * This is known in advance and does not change according to position in the file 109 * like address lists do. 110 */ 111 static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) { 112 if (null == shortcutList || shortcutList.isEmpty()) return 0; 113 int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE; 114 for (final WeightedString shortcut : shortcutList) { 115 size += getShortcutSize(shortcut); 116 } 117 return size; 118 } 119 120 /** 121 * Compute the maximum size of a PtNode, assuming 3-byte addresses for everything. 122 * 123 * @param ptNode the PtNode to compute the size of. 124 * @return the maximum size of the PtNode. 125 */ 126 private static int getPtNodeMaximumSize(final PtNode ptNode) { 127 int size = getNodeHeaderSize(ptNode); 128 if (ptNode.isTerminal()) { 129 // If terminal, one byte for the frequency. 130 size += FormatSpec.PTNODE_FREQUENCY_SIZE; 131 } 132 size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address 133 size += getShortcutListSize(ptNode.mShortcutTargets); 134 if (null != ptNode.mBigrams) { 135 size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE 136 + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE) 137 * ptNode.mBigrams.size(); 138 } 139 return size; 140 } 141 142 /** 143 * Compute the maximum size of each PtNode of a PtNode array, assuming 3-byte addresses for 144 * everything, and caches it in the `mCachedSize' member of the nodes; deduce the size of 145 * the containing node array, and cache it it its 'mCachedSize' member. 146 * 147 * @param ptNodeArray the node array to compute the maximum size of. 148 */ 149 private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray) { 150 int size = getPtNodeCountSize(ptNodeArray); 151 for (PtNode node : ptNodeArray.mData) { 152 final int nodeSize = getPtNodeMaximumSize(node); 153 node.mCachedSize = nodeSize; 154 size += nodeSize; 155 } 156 ptNodeArray.mCachedSize = size; 157 } 158 159 /** 160 * Compute the size of the header (flag + [parent address] + characters size) of a PtNode. 161 * 162 * @param ptNode the PtNode of which to compute the size of the header 163 */ 164 private static int getNodeHeaderSize(final PtNode ptNode) { 165 return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode); 166 } 167 168 /** 169 * Compute the size, in bytes, that an address will occupy. 170 * 171 * This can be used either for children addresses (which are always positive) or for 172 * attribute, which may be positive or negative but 173 * store their sign bit separately. 174 * 175 * @param address the address 176 * @return the byte size. 177 */ 178 static int getByteSize(final int address) { 179 assert(address <= FormatSpec.UINT24_MAX); 180 if (!BinaryDictIOUtils.hasChildrenAddress(address)) { 181 return 0; 182 } else if (Math.abs(address) <= FormatSpec.UINT8_MAX) { 183 return 1; 184 } else if (Math.abs(address) <= FormatSpec.UINT16_MAX) { 185 return 2; 186 } else { 187 return 3; 188 } 189 } 190 191 static int writeUIntToBuffer(final byte[] buffer, int position, final int value, 192 final int size) { 193 switch(size) { 194 case 4: 195 buffer[position++] = (byte) ((value >> 24) & 0xFF); 196 /* fall through */ 197 case 3: 198 buffer[position++] = (byte) ((value >> 16) & 0xFF); 199 /* fall through */ 200 case 2: 201 buffer[position++] = (byte) ((value >> 8) & 0xFF); 202 /* fall through */ 203 case 1: 204 buffer[position++] = (byte) (value & 0xFF); 205 break; 206 default: 207 /* nop */ 208 } 209 return position; 210 } 211 212 static void writeUIntToStream(final OutputStream stream, final int value, final int size) 213 throws IOException { 214 switch(size) { 215 case 4: 216 stream.write((value >> 24) & 0xFF); 217 /* fall through */ 218 case 3: 219 stream.write((value >> 16) & 0xFF); 220 /* fall through */ 221 case 2: 222 stream.write((value >> 8) & 0xFF); 223 /* fall through */ 224 case 1: 225 stream.write(value & 0xFF); 226 break; 227 default: 228 /* nop */ 229 } 230 } 231 232 @UsedForTesting 233 static void writeUIntToDictBuffer(final DictBuffer dictBuffer, final int value, 234 final int size) { 235 switch(size) { 236 case 4: 237 dictBuffer.put((byte) ((value >> 24) & 0xFF)); 238 /* fall through */ 239 case 3: 240 dictBuffer.put((byte) ((value >> 16) & 0xFF)); 241 /* fall through */ 242 case 2: 243 dictBuffer.put((byte) ((value >> 8) & 0xFF)); 244 /* fall through */ 245 case 1: 246 dictBuffer.put((byte) (value & 0xFF)); 247 break; 248 default: 249 /* nop */ 250 } 251 } 252 253 // End utility methods 254 255 // This method is responsible for finding a nice ordering of the nodes that favors run-time 256 // cache performance and dictionary size. 257 /* package for tests */ static ArrayList<PtNodeArray> flattenTree( 258 final PtNodeArray rootNodeArray) { 259 final int treeSize = FusionDictionary.countPtNodes(rootNodeArray); 260 MakedictLog.i("Counted nodes : " + treeSize); 261 final ArrayList<PtNodeArray> flatTree = new ArrayList<>(treeSize); 262 return flattenTreeInner(flatTree, rootNodeArray); 263 } 264 265 private static ArrayList<PtNodeArray> flattenTreeInner(final ArrayList<PtNodeArray> list, 266 final PtNodeArray ptNodeArray) { 267 // Removing the node is necessary if the tails are merged, because we would then 268 // add the same node several times when we only want it once. A number of places in 269 // the code also depends on any node being only once in the list. 270 // Merging tails can only be done if there are no attributes. Searching for attributes 271 // in LatinIME code depends on a total breadth-first ordering, which merging tails 272 // breaks. If there are no attributes, it should be fine (and reduce the file size) 273 // to merge tails, and removing the node from the list would be necessary. However, 274 // we don't merge tails because breaking the breadth-first ordering would result in 275 // extreme overhead at bigram lookup time (it would make the search function O(n) instead 276 // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty 277 // high). 278 // If no nodes are ever merged, we can't have the same node twice in the list, hence 279 // searching for duplicates in unnecessary. It is also very performance consuming, 280 // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making 281 // this simple list.remove operation O(n*n) overall. On Android this overhead is very 282 // high. 283 // For future reference, the code to remove duplicate is a simple : list.remove(node); 284 list.add(ptNodeArray); 285 final ArrayList<PtNode> branches = ptNodeArray.mData; 286 for (PtNode ptNode : branches) { 287 if (null != ptNode.mChildren) flattenTreeInner(list, ptNode.mChildren); 288 } 289 return list; 290 } 291 292 /** 293 * Get the offset from a position inside a current node array to a target node array, during 294 * update. 295 * 296 * If the current node array is before the target node array, the target node array has not 297 * been updated yet, so we should return the offset from the old position of the current node 298 * array to the old position of the target node array. If on the other hand the target is 299 * before the current node array, it already has been updated, so we should return the offset 300 * from the new position in the current node array to the new position in the target node 301 * array. 302 * 303 * @param currentNodeArray node array containing the PtNode where the offset will be written 304 * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray 305 * @param targetNodeArray the target node array to get the offset to 306 * @return the offset to the target node array 307 */ 308 private static int getOffsetToTargetNodeArrayDuringUpdate(final PtNodeArray currentNodeArray, 309 final int offsetFromStartOfCurrentNodeArray, final PtNodeArray targetNodeArray) { 310 final boolean isTargetBeforeCurrent = (targetNodeArray.mCachedAddressBeforeUpdate 311 < currentNodeArray.mCachedAddressBeforeUpdate); 312 if (isTargetBeforeCurrent) { 313 return targetNodeArray.mCachedAddressAfterUpdate 314 - (currentNodeArray.mCachedAddressAfterUpdate 315 + offsetFromStartOfCurrentNodeArray); 316 } else { 317 return targetNodeArray.mCachedAddressBeforeUpdate 318 - (currentNodeArray.mCachedAddressBeforeUpdate 319 + offsetFromStartOfCurrentNodeArray); 320 } 321 } 322 323 /** 324 * Get the offset from a position inside a current node array to a target PtNode, during 325 * update. 326 * 327 * @param currentNodeArray node array containing the PtNode where the offset will be written 328 * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray 329 * @param targetPtNode the target PtNode to get the offset to 330 * @return the offset to the target PtNode 331 */ 332 // TODO: is there any way to factorize this method with the one above? 333 private static int getOffsetToTargetPtNodeDuringUpdate(final PtNodeArray currentNodeArray, 334 final int offsetFromStartOfCurrentNodeArray, final PtNode targetPtNode) { 335 final int oldOffsetBasePoint = currentNodeArray.mCachedAddressBeforeUpdate 336 + offsetFromStartOfCurrentNodeArray; 337 final boolean isTargetBeforeCurrent = (targetPtNode.mCachedAddressBeforeUpdate 338 < oldOffsetBasePoint); 339 // If the target is before the current node array, then its address has already been 340 // updated. We can use the AfterUpdate member, and compare it to our own member after 341 // update. Otherwise, the AfterUpdate member is not updated yet, so we need to use the 342 // BeforeUpdate member, and of course we have to compare this to our own address before 343 // update. 344 if (isTargetBeforeCurrent) { 345 final int newOffsetBasePoint = currentNodeArray.mCachedAddressAfterUpdate 346 + offsetFromStartOfCurrentNodeArray; 347 return targetPtNode.mCachedAddressAfterUpdate - newOffsetBasePoint; 348 } else { 349 return targetPtNode.mCachedAddressBeforeUpdate - oldOffsetBasePoint; 350 } 351 } 352 353 /** 354 * Computes the actual node array size, based on the cached addresses of the children nodes. 355 * 356 * Each node array stores its tentative address. During dictionary address computing, these 357 * are not final, but they can be used to compute the node array size (the node array size 358 * depends on the address of the children because the number of bytes necessary to store an 359 * address depends on its numeric value. The return value indicates whether the node array 360 * contents (as in, any of the addresses stored in the cache fields) have changed with 361 * respect to their previous value. 362 * 363 * @param ptNodeArray the node array to compute the size of. 364 * @param dict the dictionary in which the word/attributes are to be found. 365 * @return false if none of the cached addresses inside the node array changed, true otherwise. 366 */ 367 private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray, 368 final FusionDictionary dict) { 369 boolean changed = false; 370 int size = getPtNodeCountSize(ptNodeArray); 371 for (PtNode ptNode : ptNodeArray.mData) { 372 ptNode.mCachedAddressAfterUpdate = ptNodeArray.mCachedAddressAfterUpdate + size; 373 if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) { 374 changed = true; 375 } 376 int nodeSize = getNodeHeaderSize(ptNode); 377 if (ptNode.isTerminal()) { 378 nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE; 379 } 380 if (null != ptNode.mChildren) { 381 nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray, 382 nodeSize + size, ptNode.mChildren)); 383 } 384 nodeSize += getShortcutListSize(ptNode.mShortcutTargets); 385 if (null != ptNode.mBigrams) { 386 for (WeightedString bigram : ptNode.mBigrams) { 387 final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray, 388 nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE, 389 FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord)); 390 nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE; 391 } 392 } 393 ptNode.mCachedSize = nodeSize; 394 size += nodeSize; 395 } 396 if (ptNodeArray.mCachedSize != size) { 397 ptNodeArray.mCachedSize = size; 398 changed = true; 399 } 400 return changed; 401 } 402 403 /** 404 * Initializes the cached addresses of node arrays and their containing nodes from their size. 405 * 406 * @param flatNodes the list of node arrays. 407 * @return the byte size of the entire stack. 408 */ 409 private static int initializePtNodeArraysCachedAddresses( 410 final ArrayList<PtNodeArray> flatNodes) { 411 int nodeArrayOffset = 0; 412 for (final PtNodeArray nodeArray : flatNodes) { 413 nodeArray.mCachedAddressBeforeUpdate = nodeArrayOffset; 414 int nodeCountSize = getPtNodeCountSize(nodeArray); 415 int nodeffset = 0; 416 for (final PtNode ptNode : nodeArray.mData) { 417 ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate = 418 nodeCountSize + nodeArrayOffset + nodeffset; 419 nodeffset += ptNode.mCachedSize; 420 } 421 nodeArrayOffset += nodeArray.mCachedSize; 422 } 423 return nodeArrayOffset; 424 } 425 426 /** 427 * Updates the cached addresses of node arrays after recomputing their new positions. 428 * 429 * @param flatNodes the list of node arrays. 430 */ 431 private static void updatePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes) { 432 for (final PtNodeArray nodeArray : flatNodes) { 433 nodeArray.mCachedAddressBeforeUpdate = nodeArray.mCachedAddressAfterUpdate; 434 for (final PtNode ptNode : nodeArray.mData) { 435 ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate; 436 } 437 } 438 } 439 440 /** 441 * Compute the addresses and sizes of an ordered list of PtNode arrays. 442 * 443 * This method takes a list of PtNode arrays and will update their cached address and size 444 * values so that they can be written into a file. It determines the smallest size each of the 445 * PtNode arrays can be given the addresses of its children and attributes, and store that into 446 * each PtNode. 447 * The order of the PtNode is given by the order of the array. This method makes no effort 448 * to find a good order; it only mechanically computes the size this order results in. 449 * 450 * @param dict the dictionary 451 * @param flatNodes the ordered list of PtNode arrays 452 * @return the same array it was passed. The nodes have been updated for address and size. 453 */ 454 /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict, 455 final ArrayList<PtNodeArray> flatNodes) { 456 // First get the worst possible sizes and offsets 457 for (final PtNodeArray n : flatNodes) { 458 calculatePtNodeArrayMaximumSize(n); 459 } 460 final int offset = initializePtNodeArraysCachedAddresses(flatNodes); 461 462 MakedictLog.i("Compressing the array addresses. Original size : " + offset); 463 MakedictLog.i("(Recursively seen size : " + offset + ")"); 464 465 int passes = 0; 466 boolean changesDone = false; 467 do { 468 changesDone = false; 469 int ptNodeArrayStartOffset = 0; 470 for (final PtNodeArray ptNodeArray : flatNodes) { 471 ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset; 472 final int oldNodeArraySize = ptNodeArray.mCachedSize; 473 final boolean changed = computeActualPtNodeArraySize(ptNodeArray, dict); 474 final int newNodeArraySize = ptNodeArray.mCachedSize; 475 if (oldNodeArraySize < newNodeArraySize) { 476 throw new RuntimeException("Increased size ?!"); 477 } 478 ptNodeArrayStartOffset += newNodeArraySize; 479 changesDone |= changed; 480 } 481 updatePtNodeArraysCachedAddresses(flatNodes); 482 ++passes; 483 if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug"); 484 } while (changesDone); 485 486 final PtNodeArray lastPtNodeArray = flatNodes.get(flatNodes.size() - 1); 487 MakedictLog.i("Compression complete in " + passes + " passes."); 488 MakedictLog.i("After address compression : " 489 + (lastPtNodeArray.mCachedAddressAfterUpdate + lastPtNodeArray.mCachedSize)); 490 491 return flatNodes; 492 } 493 494 /** 495 * Sanity-checking method. 496 * 497 * This method checks a list of PtNode arrays for juxtaposition, that is, it will do 498 * nothing if each node array's cached address is actually the previous node array's address 499 * plus the previous node's size. 500 * If this is not the case, it will throw an exception. 501 * 502 * @param arrays the list of node arrays to check 503 */ 504 /* package */ static void checkFlatPtNodeArrayList(final ArrayList<PtNodeArray> arrays) { 505 int offset = 0; 506 int index = 0; 507 for (final PtNodeArray ptNodeArray : arrays) { 508 // BeforeUpdate and AfterUpdate addresses are the same here, so it does not matter 509 // which we use. 510 if (ptNodeArray.mCachedAddressAfterUpdate != offset) { 511 throw new RuntimeException("Wrong address for node " + index 512 + " : expected " + offset + ", got " + 513 ptNodeArray.mCachedAddressAfterUpdate); 514 } 515 ++index; 516 offset += ptNodeArray.mCachedSize; 517 } 518 } 519 520 /** 521 * Helper method to write a children position to a file. 522 * 523 * @param buffer the buffer to write to. 524 * @param index the index in the buffer to write the address to. 525 * @param position the position to write. 526 * @return the size in bytes the address actually took. 527 */ 528 /* package */ static int writeChildrenPosition(final byte[] buffer, int index, 529 final int position) { 530 switch (getByteSize(position)) { 531 case 1: 532 buffer[index++] = (byte)position; 533 return 1; 534 case 2: 535 buffer[index++] = (byte)(0xFF & (position >> 8)); 536 buffer[index++] = (byte)(0xFF & position); 537 return 2; 538 case 3: 539 buffer[index++] = (byte)(0xFF & (position >> 16)); 540 buffer[index++] = (byte)(0xFF & (position >> 8)); 541 buffer[index++] = (byte)(0xFF & position); 542 return 3; 543 case 0: 544 return 0; 545 default: 546 throw new RuntimeException("Position " + position + " has a strange size"); 547 } 548 } 549 550 /** 551 * Helper method to write a signed children position to a file. 552 * 553 * @param buffer the buffer to write to. 554 * @param index the index in the buffer to write the address to. 555 * @param position the position to write. 556 * @return the size in bytes the address actually took. 557 */ 558 /* package */ static int writeSignedChildrenPosition(final byte[] buffer, int index, 559 final int position) { 560 if (!BinaryDictIOUtils.hasChildrenAddress(position)) { 561 buffer[index] = buffer[index + 1] = buffer[index + 2] = 0; 562 } else { 563 final int absPosition = Math.abs(position); 564 buffer[index++] = 565 (byte)((position < 0 ? FormatSpec.MSB8 : 0) | (0xFF & (absPosition >> 16))); 566 buffer[index++] = (byte)(0xFF & (absPosition >> 8)); 567 buffer[index++] = (byte)(0xFF & absPosition); 568 } 569 return 3; 570 } 571 572 /** 573 * Makes the flag value for a PtNode. 574 * 575 * @param hasMultipleChars whether the PtNode has multiple chars. 576 * @param isTerminal whether the PtNode is terminal. 577 * @param childrenAddressSize the size of a children address. 578 * @param hasShortcuts whether the PtNode has shortcuts. 579 * @param hasBigrams whether the PtNode has bigrams. 580 * @param isNotAWord whether the PtNode is not a word. 581 * @param isBlackListEntry whether the PtNode is a blacklist entry. 582 * @return the flags 583 */ 584 static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal, 585 final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams, 586 final boolean isNotAWord, final boolean isBlackListEntry) { 587 byte flags = 0; 588 if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS; 589 if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL; 590 switch (childrenAddressSize) { 591 case 1: 592 flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE; 593 break; 594 case 2: 595 flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES; 596 break; 597 case 3: 598 flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES; 599 break; 600 case 0: 601 flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS; 602 break; 603 default: 604 throw new RuntimeException("Node with a strange address"); 605 } 606 if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS; 607 if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS; 608 if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD; 609 if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED; 610 return flags; 611 } 612 613 /* package */ static byte makePtNodeFlags(final PtNode node, final int childrenOffset) { 614 return (byte) makePtNodeFlags(node.mChars.length > 1, node.isTerminal(), 615 getByteSize(childrenOffset), 616 node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(), 617 node.mBigrams != null && !node.mBigrams.isEmpty(), 618 node.mIsNotAWord, node.mIsBlacklistEntry); 619 } 620 621 /** 622 * Makes the flag value for a bigram. 623 * 624 * @param more whether there are more bigrams after this one. 625 * @param offset the offset of the bigram. 626 * @param bigramFrequency the frequency of the bigram, 0..255. 627 * @param unigramFrequency the unigram frequency of the same word, 0..255. 628 * @param word the second bigram, for debugging purposes 629 * @return the flags 630 */ 631 /* package */ static final int makeBigramFlags(final boolean more, final int offset, 632 int bigramFrequency, final int unigramFrequency, final String word) { 633 int bigramFlags = (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0) 634 + (offset < 0 ? FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE : 0); 635 switch (getByteSize(offset)) { 636 case 1: 637 bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE; 638 break; 639 case 2: 640 bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES; 641 break; 642 case 3: 643 bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES; 644 break; 645 default: 646 throw new RuntimeException("Strange offset size"); 647 } 648 if (unigramFrequency > bigramFrequency) { 649 MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word 650 + "\". Bigram freq is " + bigramFrequency + ", unigram freq for " 651 + word + " is " + unigramFrequency); 652 bigramFrequency = unigramFrequency; 653 } 654 bigramFlags += getBigramFrequencyDiff(unigramFrequency, bigramFrequency) 655 & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY; 656 return bigramFlags; 657 } 658 659 public static int getBigramFrequencyDiff(final int unigramFrequency, 660 final int bigramFrequency) { 661 // We compute the difference between 255 (which means probability = 1) and the 662 // unigram score. We split this into a number of discrete steps. 663 // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15 664 // represents an increase of 16 steps: a value of 15 will be interpreted as the median 665 // value of the 16th step. In all justice, if the bigram frequency is low enough to be 666 // rounded below the first step (which means it is less than half a step higher than the 667 // unigram frequency) then the unigram frequency itself is the best approximation of the 668 // bigram freq that we could possibly supply, hence we should *not* include this bigram 669 // in the file at all. 670 // until this is done, we'll write 0 and slightly overestimate this case. 671 // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step 672 // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to 673 // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the 674 // step size. Then we compute the start of the first step (the one where value 0 starts) 675 // by adding half-a-step to the unigramFrequency. From there, we compute the integer 676 // number of steps to the bigramFrequency. One last thing: we want our steps to include 677 // their lower bound and exclude their higher bound so we need to have the first step 678 // start at exactly 1 unit higher than floor(unigramFreq + half a step). 679 // Note : to reconstruct the score, the dictionary reader will need to divide 680 // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step, 681 // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best 682 // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the 683 // step pointed by the discretized frequency. 684 final float stepSize = 685 (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) 686 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); 687 final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f); 688 final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize); 689 // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1 690 // here. The best approximation would be the unigram freq itself, so we should not 691 // include this bigram in the dictionary. For now, register as 0, and live with the 692 // small over-estimation that we get in this case. TODO: actually remove this bigram 693 // if discretizedFrequency < 0. 694 return discretizedFrequency > 0 ? discretizedFrequency : 0; 695 } 696 697 /** 698 * Makes the flag value for a shortcut. 699 * 700 * @param more whether there are more attributes after this one. 701 * @param frequency the frequency of the attribute, 0..15 702 * @return the flags 703 */ 704 static final int makeShortcutFlags(final boolean more, final int frequency) { 705 return (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0) 706 + (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY); 707 } 708 709 /* package */ static final int getChildrenPosition(final PtNode ptNode) { 710 int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate 711 + getNodeHeaderSize(ptNode); 712 if (ptNode.isTerminal()) { 713 // A terminal node has the frequency. 714 // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children 715 // position. 716 positionOfChildrenPosField += FormatSpec.PTNODE_FREQUENCY_SIZE; 717 } 718 return null == ptNode.mChildren ? FormatSpec.NO_CHILDREN_ADDRESS 719 : ptNode.mChildren.mCachedAddressAfterUpdate - positionOfChildrenPosField; 720 } 721 722 /** 723 * Write a PtNodeArray. The PtNodeArray is expected to have its final position cached. 724 * 725 * @param dict the dictionary the node array is a part of (for relative offsets). 726 * @param dictEncoder the dictionary encoder. 727 * @param ptNodeArray the node array to write. 728 */ 729 @SuppressWarnings("unused") 730 /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict, 731 final DictEncoder dictEncoder, final PtNodeArray ptNodeArray) { 732 // TODO: Make the code in common with BinaryDictIOUtils#writePtNode 733 dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate); 734 735 final int ptNodeCount = ptNodeArray.mData.size(); 736 dictEncoder.writePtNodeCount(ptNodeCount); 737 final int parentPosition = 738 (ptNodeArray.mCachedParentAddress == FormatSpec.NO_PARENT_ADDRESS) 739 ? FormatSpec.NO_PARENT_ADDRESS 740 : ptNodeArray.mCachedParentAddress + ptNodeArray.mCachedAddressAfterUpdate; 741 for (int i = 0; i < ptNodeCount; ++i) { 742 final PtNode ptNode = ptNodeArray.mData.get(i); 743 if (dictEncoder.getPosition() != ptNode.mCachedAddressAfterUpdate) { 744 throw new RuntimeException("Bug: write index is not the same as the cached address " 745 + "of the node : " + dictEncoder.getPosition() + " <> " 746 + ptNode.mCachedAddressAfterUpdate); 747 } 748 // Sanity checks. 749 if (DBG && ptNode.getProbability() > FormatSpec.MAX_TERMINAL_FREQUENCY) { 750 throw new RuntimeException("A node has a frequency > " 751 + FormatSpec.MAX_TERMINAL_FREQUENCY 752 + " : " + ptNode.mProbabilityInfo.toString()); 753 } 754 dictEncoder.writePtNode(ptNode, dict); 755 } 756 if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate 757 + ptNodeArray.mCachedSize) { 758 throw new RuntimeException("Not the same size : written " 759 + (dictEncoder.getPosition() - ptNodeArray.mCachedAddressAfterUpdate) 760 + " bytes from a node that should have " + ptNodeArray.mCachedSize + " bytes"); 761 } 762 } 763 764 /** 765 * Dumps a collection of useful statistics about a list of PtNode arrays. 766 * 767 * This prints purely informative stuff, like the total estimated file size, the 768 * number of PtNode arrays, of PtNodes, the repartition of each address size, etc 769 * 770 * @param ptNodeArrays the list of PtNode arrays. 771 */ 772 /* package */ static void showStatistics(ArrayList<PtNodeArray> ptNodeArrays) { 773 int firstTerminalAddress = Integer.MAX_VALUE; 774 int lastTerminalAddress = Integer.MIN_VALUE; 775 int size = 0; 776 int ptNodes = 0; 777 int maxNodes = 0; 778 int maxRuns = 0; 779 for (final PtNodeArray ptNodeArray : ptNodeArrays) { 780 if (maxNodes < ptNodeArray.mData.size()) maxNodes = ptNodeArray.mData.size(); 781 for (final PtNode ptNode : ptNodeArray.mData) { 782 ++ptNodes; 783 if (ptNode.mChars.length > maxRuns) maxRuns = ptNode.mChars.length; 784 if (ptNode.isTerminal()) { 785 if (ptNodeArray.mCachedAddressAfterUpdate < firstTerminalAddress) 786 firstTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate; 787 if (ptNodeArray.mCachedAddressAfterUpdate > lastTerminalAddress) 788 lastTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate; 789 } 790 } 791 if (ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize > size) { 792 size = ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize; 793 } 794 } 795 final int[] ptNodeCounts = new int[maxNodes + 1]; 796 final int[] runCounts = new int[maxRuns + 1]; 797 for (final PtNodeArray ptNodeArray : ptNodeArrays) { 798 ++ptNodeCounts[ptNodeArray.mData.size()]; 799 for (final PtNode ptNode : ptNodeArray.mData) { 800 ++runCounts[ptNode.mChars.length]; 801 } 802 } 803 804 MakedictLog.i("Statistics:\n" 805 + " Total file size " + size + "\n" 806 + " " + ptNodeArrays.size() + " node arrays\n" 807 + " " + ptNodes + " PtNodes (" + ((float)ptNodes / ptNodeArrays.size()) 808 + " PtNodes per node)\n" 809 + " First terminal at " + firstTerminalAddress + "\n" 810 + " Last terminal at " + lastTerminalAddress + "\n" 811 + " PtNode stats : max = " + maxNodes); 812 } 813 814 /** 815 * Writes a file header to an output stream. 816 * 817 * @param destination the stream to write the file header to. 818 * @param dict the dictionary to write. 819 * @param formatOptions file format options. 820 * @return the size of the header. 821 */ 822 /* package */ static int writeDictionaryHeader(final OutputStream destination, 823 final FusionDictionary dict, final FormatOptions formatOptions) 824 throws IOException, UnsupportedFormatException { 825 final int version = formatOptions.mVersion; 826 if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION 827 || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) { 828 throw new UnsupportedFormatException("Requested file format version " + version 829 + ", but this implementation only supports versions " 830 + FormatSpec.MINIMUM_SUPPORTED_VERSION + " through " 831 + FormatSpec.MAXIMUM_SUPPORTED_VERSION); 832 } 833 834 ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256); 835 836 // The magic number in big-endian order. 837 // Magic number for all versions. 838 headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 24))); 839 headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 16))); 840 headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 8))); 841 headerBuffer.write((byte) (0xFF & FormatSpec.MAGIC_NUMBER)); 842 // Dictionary version. 843 headerBuffer.write((byte) (0xFF & (version >> 8))); 844 headerBuffer.write((byte) (0xFF & version)); 845 846 // Options flags 847 // TODO: Remove this field. 848 final int options = 0; 849 headerBuffer.write((byte) (0xFF & (options >> 8))); 850 headerBuffer.write((byte) (0xFF & options)); 851 final int headerSizeOffset = headerBuffer.size(); 852 // Placeholder to be written later with header size. 853 for (int i = 0; i < 4; ++i) { 854 headerBuffer.write(0); 855 } 856 // Write out the options. 857 for (final String key : dict.mOptions.mAttributes.keySet()) { 858 final String value = dict.mOptions.mAttributes.get(key); 859 CharEncoding.writeString(headerBuffer, key); 860 CharEncoding.writeString(headerBuffer, value); 861 } 862 final int size = headerBuffer.size(); 863 final byte[] bytes = headerBuffer.toByteArray(); 864 // Write out the header size. 865 bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24)); 866 bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16)); 867 bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8)); 868 bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0)); 869 destination.write(bytes); 870 871 headerBuffer.close(); 872 return size; 873 } 874} 875