BinaryDictInputOutput.java revision 752996540ff3a6dd5b48819849c06355c4270e03
1/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17package com.android.inputmethod.latin.makedict;
18
19import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
20import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
21import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
22
23import java.io.FileNotFoundException;
24import java.io.IOException;
25import java.io.OutputStream;
26import java.io.RandomAccessFile;
27import java.util.ArrayList;
28import java.util.Arrays;
29import java.util.Iterator;
30import java.util.Map;
31import java.util.TreeMap;
32
33/**
34 * Reads and writes XML files for a FusionDictionary.
35 *
36 * All the methods in this class are static.
37 */
38public class BinaryDictInputOutput {
39
40    /* Node layout is as follows:
41     *   | addressType                         xx     : mask with MASK_GROUP_ADDRESS_TYPE
42     *                                 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
43     * f |                                     01 = 1 byte      : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE
44     * l |                                     10 = 2 bytes     : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES
45     * a |                                     11 = 3 bytes     : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
46     * g | has several chars ?         1 bit, 1 = yes, 0 = no   : FLAG_HAS_MULTIPLE_CHARS
47     * s | has a terminal ?            1 bit, 1 = yes, 0 = no   : FLAG_IS_TERMINAL
48     *   | has shortcut targets ?      1 bit, 1 = yes, 0 = no   : FLAG_HAS_SHORTCUT_TARGETS
49     *   | has bigrams ?               1 bit, 1 = yes, 0 = no   : FLAG_HAS_BIGRAMS
50     *
51     * c | IF FLAG_HAS_MULTIPLE_CHARS
52     * h |   char, char, char, char    n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
53     * a |   end                       1 byte, = 0
54     * r | ELSE
55     * s |   char                      1 or 3 bytes
56     *   | END
57     *
58     * f |
59     * r | IF FLAG_IS_TERMINAL
60     * e |   frequency                 1 byte
61     * q |
62     *
63     * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType
64     * h |   // nothing
65     * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType
66     * l |   children address, 1 byte
67     * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType
68     * r |   children address, 2 bytes
69     * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType
70     * n |   children address, 3 bytes
71     * A | END
72     * d
73     * dress
74     *
75     *   | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
76     *   | shortcut string list
77     *   | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS
78     *   | bigrams address list
79     *
80     * Char format is:
81     * 1 byte = bbbbbbbb match
82     * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
83     * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
84     *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
85     *       00011111 would be outside unicode.
86     * else: iso-latin-1 code
87     * This allows for the whole unicode range to be encoded, including chars outside of
88     * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
89     * characters which should never happen anyway (and still work, but take 3 bytes).
90     *
91     * bigram address list is:
92     * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no     : FLAG_ATTRIBUTE_HAS_NEXT
93     *           | addressSign = 1 bit,                 : FLAG_ATTRIBUTE_OFFSET_NEGATIVE
94     *           |                      1 = must take -address, 0 = must take +address
95     *           |                         xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE
96     *           | addressFormat = 2 bits, 00 = unused  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
97     *           |                         01 = 1 byte  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
98     *           |                         10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES
99     *           |                         11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES
100     *           | 4 bits : frequency         : mask with FLAG_ATTRIBUTE_FREQUENCY
101     * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat)
102     *           |   read 1 byte, add top 4 bits
103     *           | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat)
104     *           |   read 2 bytes, add top 4 bits
105     *           | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat
106     *           |   read 3 bytes, add top 4 bits
107     *           | END
108     *           | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address
109     * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is
110     *
111     * shortcut string list is:
112     * <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes.
113     * <flags>     = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT
114     *               | reserved = 3 bits, must be 0
115     *               | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY
116     * <shortcut>  = | string of characters at the char format described above, with the terminator
117     *               | used to signal the end of the string.
118     * if (FLAG_ATTRIBUTE_HAS_NEXT goto flags
119     */
120
121    private static final int VERSION_1_MAGIC_NUMBER = 0x78B1;
122    private static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
123    private static final int MINIMUM_SUPPORTED_VERSION = 1;
124    private static final int MAXIMUM_SUPPORTED_VERSION = 2;
125    private static final int NOT_A_VERSION_NUMBER = -1;
126    private static final int FIRST_VERSION_WITH_HEADER_SIZE = 2;
127
128    // No options yet, reserved for future use.
129    private static final int OPTIONS = 0;
130
131    // TODO: Make this value adaptative to content data, store it in the header, and
132    // use it in the reading code.
133    private static final int MAX_WORD_LENGTH = 48;
134
135    private static final int MASK_GROUP_ADDRESS_TYPE = 0xC0;
136    private static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
137    private static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
138    private static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
139    private static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
140
141    private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;
142
143    private static final int FLAG_IS_TERMINAL = 0x10;
144    private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
145    private static final int FLAG_HAS_BIGRAMS = 0x04;
146
147    private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
148    private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
149    private static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
150    private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
151    private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
152    private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
153    private static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F;
154
155    private static final int GROUP_CHARACTERS_TERMINATOR = 0x1F;
156
157    private static final int GROUP_TERMINATOR_SIZE = 1;
158    private static final int GROUP_FLAGS_SIZE = 1;
159    private static final int GROUP_FREQUENCY_SIZE = 1;
160    private static final int GROUP_MAX_ADDRESS_SIZE = 3;
161    private static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1;
162    private static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
163    private static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2;
164
165    private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
166    private static final int INVALID_CHARACTER = -1;
167
168    private static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127
169    private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767
170
171    private static final int MAX_TERMINAL_FREQUENCY = 255;
172
173    /**
174     * A class grouping utility function for our specific character encoding.
175     */
176    private static class CharEncoding {
177
178        private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
179        private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;
180
181        /**
182         * Helper method to find out whether this code fits on one byte
183         */
184        private static boolean fitsOnOneByte(int character) {
185            return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE
186                    && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
187        }
188
189        /**
190         * Compute the size of a character given its character code.
191         *
192         * Char format is:
193         * 1 byte = bbbbbbbb match
194         * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
195         * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
196         *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
197         *       00011111 would be outside unicode.
198         * else: iso-latin-1 code
199         * This allows for the whole unicode range to be encoded, including chars outside of
200         * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
201         * characters which should never happen anyway (and still work, but take 3 bytes).
202         *
203         * @param character the character code.
204         * @return the size in binary encoded-form, either 1 or 3 bytes.
205         */
206        private static int getCharSize(int character) {
207            // See char encoding in FusionDictionary.java
208            if (fitsOnOneByte(character)) return 1;
209            if (INVALID_CHARACTER == character) return 1;
210            return 3;
211        }
212
213        /**
214         * Compute the byte size of a character array.
215         */
216        private static int getCharArraySize(final int[] chars) {
217            int size = 0;
218            for (int character : chars) size += getCharSize(character);
219            return size;
220        }
221
222        /**
223         * Writes a char array to a byte buffer.
224         *
225         * @param codePoints the code point array to write.
226         * @param buffer the byte buffer to write to.
227         * @param index the index in buffer to write the character array to.
228         * @return the index after the last character.
229         */
230        private static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) {
231            for (int codePoint : codePoints) {
232                if (1 == getCharSize(codePoint)) {
233                    buffer[index++] = (byte)codePoint;
234                } else {
235                    buffer[index++] = (byte)(0xFF & (codePoint >> 16));
236                    buffer[index++] = (byte)(0xFF & (codePoint >> 8));
237                    buffer[index++] = (byte)(0xFF & codePoint);
238                }
239            }
240            return index;
241        }
242
243        /**
244         * Writes a string with our character format to a byte buffer.
245         *
246         * This will also write the terminator byte.
247         *
248         * @param buffer the byte buffer to write to.
249         * @param origin the offset to write from.
250         * @param word the string to write.
251         * @return the size written, in bytes.
252         */
253        private static int writeString(final byte[] buffer, final int origin,
254                final String word) {
255            final int length = word.length();
256            int index = origin;
257            for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
258                final int codePoint = word.codePointAt(i);
259                if (1 == getCharSize(codePoint)) {
260                    buffer[index++] = (byte)codePoint;
261                } else {
262                    buffer[index++] = (byte)(0xFF & (codePoint >> 16));
263                    buffer[index++] = (byte)(0xFF & (codePoint >> 8));
264                    buffer[index++] = (byte)(0xFF & codePoint);
265                }
266            }
267            buffer[index++] = GROUP_CHARACTERS_TERMINATOR;
268            return index - origin;
269        }
270
271        /**
272         * Reads a string from a RandomAccessFile. This is the converse of the above method.
273         */
274        private static String readString(final RandomAccessFile source) throws IOException {
275            final StringBuilder s = new StringBuilder();
276            int character = readChar(source);
277            while (character != INVALID_CHARACTER) {
278                s.appendCodePoint(character);
279                character = readChar(source);
280            }
281            return s.toString();
282        }
283
284        /**
285         * Reads a character from the file.
286         *
287         * This follows the character format documented earlier in this source file.
288         *
289         * @param source the file, positioned over an encoded character.
290         * @return the character code.
291         */
292        private static int readChar(RandomAccessFile source) throws IOException {
293            int character = source.readUnsignedByte();
294            if (!fitsOnOneByte(character)) {
295                if (GROUP_CHARACTERS_TERMINATOR == character)
296                    return INVALID_CHARACTER;
297                character <<= 16;
298                character += source.readUnsignedShort();
299            }
300            return character;
301        }
302    }
303
304    /**
305     * Compute the binary size of the character array in a group
306     *
307     * If only one character, this is the size of this character. If many, it's the sum of their
308     * sizes + 1 byte for the terminator.
309     *
310     * @param group the group
311     * @return the size of the char array, including the terminator if any
312     */
313    private static int getGroupCharactersSize(CharGroup group) {
314        int size = CharEncoding.getCharArraySize(group.mChars);
315        if (group.hasSeveralChars()) size += GROUP_TERMINATOR_SIZE;
316        return size;
317    }
318
319    /**
320     * Compute the binary size of the group count
321     * @param count the group count
322     * @return the size of the group count, either 1 or 2 bytes.
323     */
324    private static int getGroupCountSize(final int count) {
325        if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) {
326            return 1;
327        } else if (MAX_CHARGROUPS_IN_A_NODE >= count) {
328            return 2;
329        } else {
330            throw new RuntimeException("Can't have more than " + MAX_CHARGROUPS_IN_A_NODE
331                    + " groups in a node (found " + count +")");
332        }
333    }
334
335    /**
336     * Compute the binary size of the group count for a node
337     * @param node the node
338     * @return the size of the group count, either 1 or 2 bytes.
339     */
340    private static int getGroupCountSize(final Node node) {
341        return getGroupCountSize(node.mData.size());
342    }
343
344    /**
345     * Compute the size of a shortcut in bytes.
346     */
347    private static int getShortcutSize(final WeightedString shortcut) {
348        int size = GROUP_ATTRIBUTE_FLAGS_SIZE;
349        final String word = shortcut.mWord;
350        final int length = word.length();
351        for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
352            final int codePoint = word.codePointAt(i);
353            size += CharEncoding.getCharSize(codePoint);
354        }
355        size += GROUP_TERMINATOR_SIZE;
356        return size;
357    }
358
359    /**
360     * Compute the size of a shortcut list in bytes.
361     *
362     * This is known in advance and does not change according to position in the file
363     * like address lists do.
364     */
365    private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) {
366        if (null == shortcutList) return 0;
367        int size = GROUP_SHORTCUT_LIST_SIZE_SIZE;
368        for (final WeightedString shortcut : shortcutList) {
369            size += getShortcutSize(shortcut);
370        }
371        return size;
372    }
373
374    /**
375     * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything.
376     *
377     * @param group the CharGroup to compute the size of.
378     * @return the maximum size of the group.
379     */
380    private static int getCharGroupMaximumSize(CharGroup group) {
381        int size = getGroupCharactersSize(group) + GROUP_FLAGS_SIZE;
382        // If terminal, one byte for the frequency
383        if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE;
384        size += GROUP_MAX_ADDRESS_SIZE; // For children address
385        size += getShortcutListSize(group.mShortcutTargets);
386        if (null != group.mBigrams) {
387            size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE)
388                    * group.mBigrams.size();
389        }
390        return size;
391    }
392
393    /**
394     * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches
395     * it in the 'actualSize' member of the node.
396     *
397     * @param node the node to compute the maximum size of.
398     */
399    private static void setNodeMaximumSize(Node node) {
400        int size = getGroupCountSize(node);
401        for (CharGroup g : node.mData) {
402            final int groupSize = getCharGroupMaximumSize(g);
403            g.mCachedSize = groupSize;
404            size += groupSize;
405        }
406        node.mCachedSize = size;
407    }
408
409    /**
410     * Helper method to hide the actual value of the no children address.
411     */
412    private static boolean hasChildrenAddress(int address) {
413        return NO_CHILDREN_ADDRESS != address;
414    }
415
416    /**
417     * Compute the size, in bytes, that an address will occupy.
418     *
419     * This can be used either for children addresses (which are always positive) or for
420     * attribute, which may be positive or negative but
421     * store their sign bit separately.
422     *
423     * @param address the address
424     * @return the byte size.
425     */
426    private static int getByteSize(int address) {
427        assert(address < 0x1000000);
428        if (!hasChildrenAddress(address)) {
429            return 0;
430        } else if (Math.abs(address) < 0x100) {
431            return 1;
432        } else if (Math.abs(address) < 0x10000) {
433            return 2;
434        } else {
435            return 3;
436        }
437    }
438    // End utility methods.
439
440    // This method is responsible for finding a nice ordering of the nodes that favors run-time
441    // cache performance and dictionary size.
442    /* package for tests */ static ArrayList<Node> flattenTree(Node root) {
443        final int treeSize = FusionDictionary.countCharGroups(root);
444        MakedictLog.i("Counted nodes : " + treeSize);
445        final ArrayList<Node> flatTree = new ArrayList<Node>(treeSize);
446        return flattenTreeInner(flatTree, root);
447    }
448
449    private static ArrayList<Node> flattenTreeInner(ArrayList<Node> list, Node node) {
450        // Removing the node is necessary if the tails are merged, because we would then
451        // add the same node several times when we only want it once. A number of places in
452        // the code also depends on any node being only once in the list.
453        // Merging tails can only be done if there are no attributes. Searching for attributes
454        // in LatinIME code depends on a total breadth-first ordering, which merging tails
455        // breaks. If there are no attributes, it should be fine (and reduce the file size)
456        // to merge tails, and the following step would be necessary.
457        // If eventually the code runs on Android, searching through the whole array each time
458        // may be a performance concern.
459        list.remove(node);
460        list.add(node);
461        final ArrayList<CharGroup> branches = node.mData;
462        final int nodeSize = branches.size();
463        for (CharGroup group : branches) {
464            if (null != group.mChildren) flattenTreeInner(list, group.mChildren);
465        }
466        return list;
467    }
468
469    /**
470     * Finds the absolute address of a word in the dictionary.
471     *
472     * @param dict the dictionary in which to search.
473     * @param word the word we are searching for.
474     * @return the word address. If it is not found, an exception is thrown.
475     */
476    private static int findAddressOfWord(final FusionDictionary dict, final String word) {
477        return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress;
478    }
479
480    /**
481     * Computes the actual node size, based on the cached addresses of the children nodes.
482     *
483     * Each node stores its tentative address. During dictionary address computing, these
484     * are not final, but they can be used to compute the node size (the node size depends
485     * on the address of the children because the number of bytes necessary to store an
486     * address depends on its numeric value.
487     *
488     * @param node the node to compute the size of.
489     * @param dict the dictionary in which the word/attributes are to be found.
490     */
491    private static void computeActualNodeSize(Node node, FusionDictionary dict) {
492        int size = getGroupCountSize(node);
493        for (CharGroup group : node.mData) {
494            int groupSize = GROUP_FLAGS_SIZE + getGroupCharactersSize(group);
495            if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE;
496            if (null != group.mChildren) {
497                final int offsetBasePoint= groupSize + node.mCachedAddress + size;
498                final int offset = group.mChildren.mCachedAddress - offsetBasePoint;
499                groupSize += getByteSize(offset);
500            }
501            groupSize += getShortcutListSize(group.mShortcutTargets);
502            if (null != group.mBigrams) {
503                for (WeightedString bigram : group.mBigrams) {
504                    final int offsetBasePoint = groupSize + node.mCachedAddress + size
505                            + GROUP_FLAGS_SIZE;
506                    final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
507                    final int offset = addressOfBigram - offsetBasePoint;
508                    groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE;
509                }
510            }
511            group.mCachedSize = groupSize;
512            size += groupSize;
513        }
514        node.mCachedSize = size;
515    }
516
517    /**
518     * Computes the byte size of a list of nodes and updates each node cached position.
519     *
520     * @param flatNodes the array of nodes.
521     * @return the byte size of the entire stack.
522     */
523    private static int stackNodes(ArrayList<Node> flatNodes) {
524        int nodeOffset = 0;
525        for (Node n : flatNodes) {
526            n.mCachedAddress = nodeOffset;
527            int groupCountSize = getGroupCountSize(n);
528            int groupOffset = 0;
529            for (CharGroup g : n.mData) {
530                g.mCachedAddress = groupCountSize + nodeOffset + groupOffset;
531                groupOffset += g.mCachedSize;
532            }
533            if (groupOffset + groupCountSize != n.mCachedSize) {
534                throw new RuntimeException("Bug : Stored and computed node size differ");
535            }
536            nodeOffset += n.mCachedSize;
537        }
538        return nodeOffset;
539    }
540
541    /**
542     * Compute the addresses and sizes of an ordered node array.
543     *
544     * This method takes a node array and will update its cached address and size values
545     * so that they can be written into a file. It determines the smallest size each of the
546     * nodes can be given the addresses of its children and attributes, and store that into
547     * each node.
548     * The order of the node is given by the order of the array. This method makes no effort
549     * to find a good order; it only mechanically computes the size this order results in.
550     *
551     * @param dict the dictionary
552     * @param flatNodes the ordered array of nodes
553     * @return the same array it was passed. The nodes have been updated for address and size.
554     */
555    private static ArrayList<Node> computeAddresses(FusionDictionary dict,
556            ArrayList<Node> flatNodes) {
557        // First get the worst sizes and offsets
558        for (Node n : flatNodes) setNodeMaximumSize(n);
559        final int offset = stackNodes(flatNodes);
560
561        MakedictLog.i("Compressing the array addresses. Original size : " + offset);
562        MakedictLog.i("(Recursively seen size : " + offset + ")");
563
564        int passes = 0;
565        boolean changesDone = false;
566        do {
567            changesDone = false;
568            for (Node n : flatNodes) {
569                final int oldNodeSize = n.mCachedSize;
570                computeActualNodeSize(n, dict);
571                final int newNodeSize = n.mCachedSize;
572                if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!");
573                if (oldNodeSize != newNodeSize) changesDone = true;
574            }
575            stackNodes(flatNodes);
576            ++passes;
577        } while (changesDone);
578
579        final Node lastNode = flatNodes.get(flatNodes.size() - 1);
580        MakedictLog.i("Compression complete in " + passes + " passes.");
581        MakedictLog.i("After address compression : "
582                + (lastNode.mCachedAddress + lastNode.mCachedSize));
583
584        return flatNodes;
585    }
586
587    /**
588     * Sanity-checking method.
589     *
590     * This method checks an array of node for juxtaposition, that is, it will do
591     * nothing if each node's cached address is actually the previous node's address
592     * plus the previous node's size.
593     * If this is not the case, it will throw an exception.
594     *
595     * @param array the array node to check
596     */
597    private static void checkFlatNodeArray(ArrayList<Node> array) {
598        int offset = 0;
599        int index = 0;
600        for (Node n : array) {
601            if (n.mCachedAddress != offset) {
602                throw new RuntimeException("Wrong address for node " + index
603                        + " : expected " + offset + ", got " + n.mCachedAddress);
604            }
605            ++index;
606            offset += n.mCachedSize;
607        }
608    }
609
610    /**
611     * Helper method to write a variable-size address to a file.
612     *
613     * @param buffer the buffer to write to.
614     * @param index the index in the buffer to write the address to.
615     * @param address the address to write.
616     * @return the size in bytes the address actually took.
617     */
618    private static int writeVariableAddress(final byte[] buffer, int index, final int address) {
619        switch (getByteSize(address)) {
620        case 1:
621            buffer[index++] = (byte)address;
622            return 1;
623        case 2:
624            buffer[index++] = (byte)(0xFF & (address >> 8));
625            buffer[index++] = (byte)(0xFF & address);
626            return 2;
627        case 3:
628            buffer[index++] = (byte)(0xFF & (address >> 16));
629            buffer[index++] = (byte)(0xFF & (address >> 8));
630            buffer[index++] = (byte)(0xFF & address);
631            return 3;
632        case 0:
633            return 0;
634        default:
635            throw new RuntimeException("Address " + address + " has a strange size");
636        }
637    }
638
639    private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress,
640            final int childrenOffset) {
641        byte flags = 0;
642        if (group.mChars.length > 1) flags |= FLAG_HAS_MULTIPLE_CHARS;
643        if (group.mFrequency >= 0) {
644            flags |= FLAG_IS_TERMINAL;
645        }
646        if (null != group.mChildren) {
647            switch (getByteSize(childrenOffset)) {
648             case 1:
649                 flags |= FLAG_GROUP_ADDRESS_TYPE_ONEBYTE;
650                 break;
651             case 2:
652                 flags |= FLAG_GROUP_ADDRESS_TYPE_TWOBYTES;
653                 break;
654             case 3:
655                 flags |= FLAG_GROUP_ADDRESS_TYPE_THREEBYTES;
656                 break;
657             default:
658                 throw new RuntimeException("Node with a strange address");
659             }
660        }
661        if (null != group.mShortcutTargets) {
662            if (0 == group.mShortcutTargets.size()) {
663                throw new RuntimeException("0-sized shortcut list must be null");
664            }
665            flags |= FLAG_HAS_SHORTCUT_TARGETS;
666        }
667        if (null != group.mBigrams) {
668            if (0 == group.mBigrams.size()) {
669                throw new RuntimeException("0-sized bigram list must be null");
670            }
671            flags |= FLAG_HAS_BIGRAMS;
672        }
673        return flags;
674    }
675
676    /**
677     * Makes the flag value for an attribute.
678     *
679     * @param more whether there are more attributes after this one.
680     * @param offset the offset of the attribute.
681     * @param frequency the frequency of the attribute, 0..15
682     * @return the flags
683     */
684    private static final int makeAttributeFlags(final boolean more, final int offset,
685            final int frequency) {
686        int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0)
687                + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0);
688        switch (getByteSize(offset)) {
689        case 1:
690            bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
691            break;
692        case 2:
693            bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
694            break;
695        case 3:
696            bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
697            break;
698        default:
699            throw new RuntimeException("Strange offset size");
700        }
701        bigramFlags += frequency & FLAG_ATTRIBUTE_FREQUENCY;
702        return bigramFlags;
703    }
704
705    /**
706     * Makes the flag value for a shortcut.
707     *
708     * @param more whether there are more attributes after this one.
709     * @param frequency the frequency of the attribute, 0..15
710     * @return the flags
711     */
712    private static final int makeShortcutFlags(final boolean more, final int frequency) {
713        return (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (frequency & FLAG_ATTRIBUTE_FREQUENCY);
714    }
715
716    /**
717     * Write a node to memory. The node is expected to have its final position cached.
718     *
719     * This can be an empty map, but the more is inside the faster the lookups will be. It can
720     * be carried on as long as nodes do not move.
721     *
722     * @param dict the dictionary the node is a part of (for relative offsets).
723     * @param buffer the memory buffer to write to.
724     * @param node the node to write.
725     * @return the address of the END of the node.
726     */
727    private static int writePlacedNode(FusionDictionary dict, byte[] buffer, Node node) {
728        int index = node.mCachedAddress;
729
730        final int groupCount = node.mData.size();
731        final int countSize = getGroupCountSize(node);
732        if (1 == countSize) {
733            buffer[index++] = (byte)groupCount;
734        } else if (2 == countSize) {
735            // We need to signal 2-byte size by setting the top bit of the MSB to 1, so
736            // we | 0x80 to do this.
737            buffer[index++] = (byte)((groupCount >> 8) | 0x80);
738            buffer[index++] = (byte)(groupCount & 0xFF);
739        } else {
740            throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
741        }
742        int groupAddress = index;
743        for (int i = 0; i < groupCount; ++i) {
744            CharGroup group = node.mData.get(i);
745            if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not "
746                    + "the same as the cached address of the group : "
747                    + index + " <> " + group.mCachedAddress);
748            groupAddress += GROUP_FLAGS_SIZE + getGroupCharactersSize(group);
749            // Sanity checks.
750            if (group.mFrequency > MAX_TERMINAL_FREQUENCY) {
751                throw new RuntimeException("A node has a frequency > " + MAX_TERMINAL_FREQUENCY
752                        + " : " + group.mFrequency);
753            }
754            if (group.mFrequency >= 0) groupAddress += GROUP_FREQUENCY_SIZE;
755            final int childrenOffset = null == group.mChildren
756                    ? NO_CHILDREN_ADDRESS : group.mChildren.mCachedAddress - groupAddress;
757            byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset);
758            buffer[index++] = flags;
759            index = CharEncoding.writeCharArray(group.mChars, buffer, index);
760            if (group.hasSeveralChars()) {
761                buffer[index++] = GROUP_CHARACTERS_TERMINATOR;
762            }
763            if (group.mFrequency >= 0) {
764                buffer[index++] = (byte) group.mFrequency;
765            }
766            final int shift = writeVariableAddress(buffer, index, childrenOffset);
767            index += shift;
768            groupAddress += shift;
769
770            // Write shortcuts
771            if (null != group.mShortcutTargets) {
772                final int indexOfShortcutByteSize = index;
773                index += GROUP_SHORTCUT_LIST_SIZE_SIZE;
774                groupAddress += GROUP_SHORTCUT_LIST_SIZE_SIZE;
775                final Iterator shortcutIterator = group.mShortcutTargets.iterator();
776                while (shortcutIterator.hasNext()) {
777                    final WeightedString target = (WeightedString)shortcutIterator.next();
778                    ++groupAddress;
779                    int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(),
780                            target.mFrequency);
781                    buffer[index++] = (byte)shortcutFlags;
782                    final int shortcutShift = CharEncoding.writeString(buffer, index, target.mWord);
783                    index += shortcutShift;
784                    groupAddress += shortcutShift;
785                }
786                final int shortcutByteSize = index - indexOfShortcutByteSize;
787                if (shortcutByteSize > 0xFFFF) {
788                    throw new RuntimeException("Shortcut list too large");
789                }
790                buffer[indexOfShortcutByteSize] = (byte)(shortcutByteSize >> 8);
791                buffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF);
792            }
793            // Write bigrams
794            if (null != group.mBigrams) {
795                final Iterator bigramIterator = group.mBigrams.iterator();
796                while (bigramIterator.hasNext()) {
797                    final WeightedString bigram = (WeightedString)bigramIterator.next();
798                    final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
799                    ++groupAddress;
800                    final int offset = addressOfBigram - groupAddress;
801                    int bigramFlags = makeAttributeFlags(bigramIterator.hasNext(), offset,
802                            bigram.mFrequency);
803                    buffer[index++] = (byte)bigramFlags;
804                    final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset));
805                    index += bigramShift;
806                    groupAddress += bigramShift;
807                }
808            }
809
810        }
811        if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException(
812                "Not the same size : written "
813                + (index - node.mCachedAddress) + " bytes out of a node that should have "
814                + node.mCachedSize + " bytes");
815        return index;
816    }
817
818    /**
819     * Dumps a collection of useful statistics about a node array.
820     *
821     * This prints purely informative stuff, like the total estimated file size, the
822     * number of nodes, of character groups, the repartition of each address size, etc
823     *
824     * @param nodes the node array.
825     */
826    private static void showStatistics(ArrayList<Node> nodes) {
827        int firstTerminalAddress = Integer.MAX_VALUE;
828        int lastTerminalAddress = Integer.MIN_VALUE;
829        int size = 0;
830        int charGroups = 0;
831        int maxGroups = 0;
832        int maxRuns = 0;
833        for (Node n : nodes) {
834            if (maxGroups < n.mData.size()) maxGroups = n.mData.size();
835            for (CharGroup cg : n.mData) {
836                ++charGroups;
837                if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length;
838                if (cg.mFrequency >= 0) {
839                    if (n.mCachedAddress < firstTerminalAddress)
840                        firstTerminalAddress = n.mCachedAddress;
841                    if (n.mCachedAddress > lastTerminalAddress)
842                        lastTerminalAddress = n.mCachedAddress;
843                }
844            }
845            if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize;
846        }
847        final int[] groupCounts = new int[maxGroups + 1];
848        final int[] runCounts = new int[maxRuns + 1];
849        for (Node n : nodes) {
850            ++groupCounts[n.mData.size()];
851            for (CharGroup cg : n.mData) {
852                ++runCounts[cg.mChars.length];
853            }
854        }
855
856        MakedictLog.i("Statistics:\n"
857                + "  total file size " + size + "\n"
858                + "  " + nodes.size() + " nodes\n"
859                + "  " + charGroups + " groups (" + ((float)charGroups / nodes.size())
860                        + " groups per node)\n"
861                + "  first terminal at " + firstTerminalAddress + "\n"
862                + "  last terminal at " + lastTerminalAddress + "\n"
863                + "  Group stats : max = " + maxGroups);
864        for (int i = 0; i < groupCounts.length; ++i) {
865            MakedictLog.i("    " + i + " : " + groupCounts[i]);
866        }
867        MakedictLog.i("  Character run stats : max = " + maxRuns);
868        for (int i = 0; i < runCounts.length; ++i) {
869            MakedictLog.i("    " + i + " : " + runCounts[i]);
870        }
871    }
872
873    /**
874     * Dumps a FusionDictionary to a file.
875     *
876     * This is the public entry point to write a dictionary to a file.
877     *
878     * @param destination the stream to write the binary data to.
879     * @param dict the dictionary to write.
880     * @param version the version of the format to write, currently either 1 or 2.
881     */
882    public static void writeDictionaryBinary(final OutputStream destination,
883            final FusionDictionary dict, final int version)
884            throws IOException, UnsupportedFormatException {
885
886        // Addresses are limited to 3 bytes, so we'll just make a 16MB buffer. Since addresses
887        // can be relative to each node, the structure itself is not limited to 16MB at all, but
888        // I doubt this will ever be shot. If it is, deciding the order of the nodes becomes
889        // a quite complicated problem, because though the dictionary itself does not have a
890        // size limit, each node must still be within 16MB of all its children and parents.
891        // As long as this is ensured, the dictionary file may grow to any size.
892        // Anyway, to make a dictionary bigger than 16MB just increase the size of this buffer.
893        final byte[] buffer = new byte[1 << 24];
894        int index = 0;
895
896        if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) {
897            throw new UnsupportedFormatException("Requested file format version " + version
898                    + ", but this implementation only supports versions "
899                    + MINIMUM_SUPPORTED_VERSION + " through " + MAXIMUM_SUPPORTED_VERSION);
900        }
901
902        // The magic number in big-endian order.
903        if (version >= FIRST_VERSION_WITH_HEADER_SIZE) {
904            // Magic number for version 2+.
905            buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 24));
906            buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 16));
907            buffer[index++] = (byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 8));
908            buffer[index++] = (byte) (0xFF & VERSION_2_MAGIC_NUMBER);
909            // Dictionary version.
910            buffer[index++] = (byte) (0xFF & (version >> 8));
911            buffer[index++] = (byte) (0xFF & version);
912        } else {
913            // Magic number for version 1.
914            buffer[index++] = (byte) (0xFF & (VERSION_1_MAGIC_NUMBER >> 8));
915            buffer[index++] = (byte) (0xFF & VERSION_1_MAGIC_NUMBER);
916            // Dictionary version.
917            buffer[index++] = (byte) (0xFF & version);
918        }
919        // Options flags
920        buffer[index++] = (byte) (0xFF & (OPTIONS >> 8));
921        buffer[index++] = (byte) (0xFF & OPTIONS);
922        if (version >= FIRST_VERSION_WITH_HEADER_SIZE) {
923            final int headerSizeOffset = index;
924            index += 4; // Size of the header size
925            // TODO: Write out the header contents here.
926            // Write out the header size.
927            buffer[headerSizeOffset] = (byte) (0xFF & (index >> 24));
928            buffer[headerSizeOffset + 1] = (byte) (0xFF & (index >> 16));
929            buffer[headerSizeOffset + 2] = (byte) (0xFF & (index >> 8));
930            buffer[headerSizeOffset + 3] = (byte) (0xFF & (index >> 0));
931        }
932
933        destination.write(buffer, 0, index);
934        index = 0;
935
936        // Leave the choice of the optimal node order to the flattenTree function.
937        MakedictLog.i("Flattening the tree...");
938        ArrayList<Node> flatNodes = flattenTree(dict.mRoot);
939
940        MakedictLog.i("Computing addresses...");
941        computeAddresses(dict, flatNodes);
942        MakedictLog.i("Checking array...");
943        checkFlatNodeArray(flatNodes);
944
945        MakedictLog.i("Writing file...");
946        int dataEndOffset = 0;
947        for (Node n : flatNodes) {
948            dataEndOffset = writePlacedNode(dict, buffer, n);
949        }
950
951        showStatistics(flatNodes);
952
953        destination.write(buffer, 0, dataEndOffset);
954
955        destination.close();
956        MakedictLog.i("Done");
957    }
958
959
960    // Input methods: Read a binary dictionary to memory.
961    // readDictionaryBinary is the public entry point for them.
962
963    static final int[] characterBuffer = new int[MAX_WORD_LENGTH];
964    private static CharGroupInfo readCharGroup(RandomAccessFile source,
965            final int originalGroupAddress) throws IOException {
966        int addressPointer = originalGroupAddress;
967        final int flags = source.readUnsignedByte();
968        ++addressPointer;
969        final int characters[];
970        if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) {
971            int index = 0;
972            int character = CharEncoding.readChar(source);
973            addressPointer += CharEncoding.getCharSize(character);
974            while (-1 != character) {
975                characterBuffer[index++] = character;
976                character = CharEncoding.readChar(source);
977                addressPointer += CharEncoding.getCharSize(character);
978            }
979            characters = Arrays.copyOfRange(characterBuffer, 0, index);
980        } else {
981            final int character = CharEncoding.readChar(source);
982            addressPointer += CharEncoding.getCharSize(character);
983            characters = new int[] { character };
984        }
985        final int frequency;
986        if (0 != (FLAG_IS_TERMINAL & flags)) {
987            ++addressPointer;
988            frequency = source.readUnsignedByte();
989        } else {
990            frequency = CharGroup.NOT_A_TERMINAL;
991        }
992        int childrenAddress = addressPointer;
993        switch (flags & MASK_GROUP_ADDRESS_TYPE) {
994        case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
995            childrenAddress += source.readUnsignedByte();
996            addressPointer += 1;
997            break;
998        case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
999            childrenAddress += source.readUnsignedShort();
1000            addressPointer += 2;
1001            break;
1002        case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
1003            childrenAddress += (source.readUnsignedByte() << 16) + source.readUnsignedShort();
1004            addressPointer += 3;
1005            break;
1006        case FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
1007        default:
1008            childrenAddress = NO_CHILDREN_ADDRESS;
1009            break;
1010        }
1011        ArrayList<WeightedString> shortcutTargets = null;
1012        if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) {
1013            final long pointerBefore = source.getFilePointer();
1014            shortcutTargets = new ArrayList<WeightedString>();
1015            source.readUnsignedShort(); // Skip the size
1016            while (true) {
1017                final int targetFlags = source.readUnsignedByte();
1018                final String word = CharEncoding.readString(source);
1019                shortcutTargets.add(new WeightedString(word,
1020                        targetFlags & FLAG_ATTRIBUTE_FREQUENCY));
1021                if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break;
1022            }
1023            addressPointer += (source.getFilePointer() - pointerBefore);
1024        }
1025        ArrayList<PendingAttribute> bigrams = null;
1026        if (0 != (flags & FLAG_HAS_BIGRAMS)) {
1027            bigrams = new ArrayList<PendingAttribute>();
1028            while (true) {
1029                final int bigramFlags = source.readUnsignedByte();
1030                ++addressPointer;
1031                final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1;
1032                int bigramAddress = addressPointer;
1033                switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) {
1034                case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
1035                    bigramAddress += sign * source.readUnsignedByte();
1036                    addressPointer += 1;
1037                    break;
1038                case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
1039                    bigramAddress += sign * source.readUnsignedShort();
1040                    addressPointer += 2;
1041                    break;
1042                case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
1043                    final int offset = ((source.readUnsignedByte() << 16)
1044                            + source.readUnsignedShort());
1045                    bigramAddress += sign * offset;
1046                    addressPointer += 3;
1047                    break;
1048                default:
1049                    throw new RuntimeException("Has bigrams with no address");
1050                }
1051                bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY,
1052                        bigramAddress));
1053                if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break;
1054            }
1055        }
1056        return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency,
1057                childrenAddress, shortcutTargets, bigrams);
1058    }
1059
1060    /**
1061     * Reads and returns the char group count out of a file and forwards the pointer.
1062     */
1063    private static int readCharGroupCount(RandomAccessFile source) throws IOException {
1064        final int msb = source.readUnsignedByte();
1065        if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) {
1066            return msb;
1067        } else {
1068            return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8)
1069                    + source.readUnsignedByte();
1070        }
1071    }
1072
1073    /**
1074     * Finds, as a string, the word at the address passed as an argument.
1075     *
1076     * @param source the file to read from.
1077     * @param headerSize the size of the header.
1078     * @param address the address to seek.
1079     * @return the word, as a string.
1080     * @throws IOException if the file can't be read.
1081     */
1082    private static String getWordAtAddress(RandomAccessFile source, long headerSize,
1083            int address) throws IOException {
1084        final long originalPointer = source.getFilePointer();
1085        source.seek(headerSize);
1086        final int count = readCharGroupCount(source);
1087        int groupOffset = getGroupCountSize(count);
1088        final StringBuilder builder = new StringBuilder();
1089        String result = null;
1090
1091        CharGroupInfo last = null;
1092        for (int i = count - 1; i >= 0; --i) {
1093            CharGroupInfo info = readCharGroup(source, groupOffset);
1094            groupOffset = info.mEndAddress;
1095            if (info.mOriginalAddress == address) {
1096                builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
1097                result = builder.toString();
1098                break; // and return
1099            }
1100            if (hasChildrenAddress(info.mChildrenAddress)) {
1101                if (info.mChildrenAddress > address) {
1102                    if (null == last) continue;
1103                    builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
1104                    source.seek(last.mChildrenAddress + headerSize);
1105                    groupOffset = last.mChildrenAddress + 1;
1106                    i = source.readUnsignedByte();
1107                    last = null;
1108                    continue;
1109                }
1110                last = info;
1111            }
1112            if (0 == i && hasChildrenAddress(last.mChildrenAddress)) {
1113                builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
1114                source.seek(last.mChildrenAddress + headerSize);
1115                groupOffset = last.mChildrenAddress + 1;
1116                i = source.readUnsignedByte();
1117                last = null;
1118                continue;
1119            }
1120        }
1121        source.seek(originalPointer);
1122        return result;
1123    }
1124
1125    /**
1126     * Reads a single node from a binary file.
1127     *
1128     * This methods reads the file at the current position of its file pointer. A node is
1129     * fully expected to start at the current position.
1130     * This will recursively read other nodes into the structure, populating the reverse
1131     * maps on the fly and using them to keep track of already read nodes.
1132     *
1133     * @param source the data file, correctly positioned at the start of a node.
1134     * @param headerSize the size, in bytes, of the file header.
1135     * @param reverseNodeMap a mapping from addresses to already read nodes.
1136     * @param reverseGroupMap a mapping from addresses to already read character groups.
1137     * @return the read node with all his children already read.
1138     */
1139    private static Node readNode(RandomAccessFile source, long headerSize,
1140            Map<Integer, Node> reverseNodeMap, Map<Integer, CharGroup> reverseGroupMap)
1141            throws IOException {
1142        final int nodeOrigin = (int)(source.getFilePointer() - headerSize);
1143        final int count = readCharGroupCount(source);
1144        final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>();
1145        int groupOffset = nodeOrigin + getGroupCountSize(count);
1146        for (int i = count; i > 0; --i) {
1147            CharGroupInfo info = readCharGroup(source, groupOffset);
1148            ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets;
1149            ArrayList<WeightedString> bigrams = null;
1150            if (null != info.mBigrams) {
1151                bigrams = new ArrayList<WeightedString>();
1152                for (PendingAttribute bigram : info.mBigrams) {
1153                    final String word = getWordAtAddress(source, headerSize, bigram.mAddress);
1154                    bigrams.add(new WeightedString(word, bigram.mFrequency));
1155                }
1156            }
1157            if (hasChildrenAddress(info.mChildrenAddress)) {
1158                Node children = reverseNodeMap.get(info.mChildrenAddress);
1159                if (null == children) {
1160                    final long currentPosition = source.getFilePointer();
1161                    source.seek(info.mChildrenAddress + headerSize);
1162                    children = readNode(source, headerSize, reverseNodeMap, reverseGroupMap);
1163                    source.seek(currentPosition);
1164                }
1165                nodeContents.add(
1166                        new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency,
1167                                children, false));
1168            } else {
1169                nodeContents.add(
1170                        new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency,
1171                                false));
1172            }
1173            groupOffset = info.mEndAddress;
1174        }
1175        final Node node = new Node(nodeContents);
1176        node.mCachedAddress = nodeOrigin;
1177        reverseNodeMap.put(node.mCachedAddress, node);
1178        return node;
1179    }
1180
1181    /**
1182     * Helper function to get the binary format version from the header.
1183     */
1184    private static int getFormatVersion(final RandomAccessFile source) throws IOException {
1185        final int magic_v1 = source.readUnsignedShort();
1186        if (VERSION_1_MAGIC_NUMBER == magic_v1) return source.readUnsignedByte();
1187        final int magic_v2 = (magic_v1 << 16) + source.readUnsignedShort();
1188        if (VERSION_2_MAGIC_NUMBER == magic_v2) return source.readUnsignedShort();
1189        return NOT_A_VERSION_NUMBER;
1190    }
1191
1192    /**
1193     * Reads a random access file and returns the memory representation of the dictionary.
1194     *
1195     * This high-level method takes a binary file and reads its contents, populating a
1196     * FusionDictionary structure. The optional dict argument is an existing dictionary to
1197     * which words from the file should be added. If it is null, a new dictionary is created.
1198     *
1199     * @param source the file to read.
1200     * @param dict an optional dictionary to add words to, or null.
1201     * @return the created (or merged) dictionary.
1202     */
1203    public static FusionDictionary readDictionaryBinary(final RandomAccessFile source,
1204            final FusionDictionary dict) throws IOException, UnsupportedFormatException {
1205        // Check file version
1206        final int version = getFormatVersion(source);
1207        if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION ) {
1208            throw new UnsupportedFormatException("This file has version " + version
1209                    + ", but this implementation does not support versions above "
1210                    + MAXIMUM_SUPPORTED_VERSION);
1211        }
1212
1213        // Read options
1214        source.readUnsignedShort();
1215
1216        final long headerSize;
1217        if (version < FIRST_VERSION_WITH_HEADER_SIZE) {
1218            headerSize = source.getFilePointer();
1219        } else {
1220            headerSize = (source.readUnsignedByte() << 24) + (source.readUnsignedByte() << 16)
1221                    + (source.readUnsignedByte() << 8) + source.readUnsignedByte();
1222            // read the header body
1223            source.seek(headerSize);
1224        }
1225
1226        Map<Integer, Node> reverseNodeMapping = new TreeMap<Integer, Node>();
1227        Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>();
1228        final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping);
1229
1230        FusionDictionary newDict = new FusionDictionary(root,
1231                new FusionDictionary.DictionaryOptions());
1232        if (null != dict) {
1233            for (Word w : dict) {
1234                newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams);
1235            }
1236        }
1237
1238        return newDict;
1239    }
1240
1241    /**
1242     * Basic test to find out whether the file is a binary dictionary or not.
1243     *
1244     * Concretely this only tests the magic number.
1245     *
1246     * @param filename The name of the file to test.
1247     * @return true if it's a binary dictionary, false otherwise
1248     */
1249    public static boolean isBinaryDictionary(final String filename) {
1250        try {
1251            RandomAccessFile f = new RandomAccessFile(filename, "r");
1252            final int version = getFormatVersion(f);
1253            return (version >= MINIMUM_SUPPORTED_VERSION && version <= MAXIMUM_SUPPORTED_VERSION);
1254        } catch (FileNotFoundException e) {
1255            return false;
1256        } catch (IOException e) {
1257            return false;
1258        }
1259    }
1260}
1261