/* * Copyright (C) 2012 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.android.inputmethod.latin.makedict; import com.android.inputmethod.annotations.UsedForTesting; import com.android.inputmethod.latin.Constants; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString; import com.android.inputmethod.latin.utils.ByteArrayDictBuffer; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.Map; import java.util.Stack; public final class BinaryDictIOUtils { private static final boolean DBG = false; private BinaryDictIOUtils() { // This utility class is not publicly instantiable. } private static final class Position { public static final int NOT_READ_PTNODE_COUNT = -1; public int mAddress; public int mNumOfPtNode; public int mPosition; public int mLength; public Position(int address, int length) { mAddress = address; mLength = length; mNumOfPtNode = NOT_READ_PTNODE_COUNT; } } /** * Retrieves all node arrays without recursive call. */ private static void readUnigramsAndBigramsBinaryInner(final DictDecoder dictDecoder, final int headerSize, final Map words, final Map frequencies, final Map> bigrams, final FormatOptions formatOptions) { int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1]; Stack stack = new Stack(); int index = 0; Position initPos = new Position(headerSize, 0); stack.push(initPos); while (!stack.empty()) { Position p = stack.peek(); if (DBG) { MakedictLog.d("read: address=" + p.mAddress + ", numOfPtNode=" + p.mNumOfPtNode + ", position=" + p.mPosition + ", length=" + p.mLength); } if (dictDecoder.getPosition() != p.mAddress) dictDecoder.setPosition(p.mAddress); if (index != p.mLength) index = p.mLength; if (p.mNumOfPtNode == Position.NOT_READ_PTNODE_COUNT) { p.mNumOfPtNode = dictDecoder.readPtNodeCount(); p.mAddress += getPtNodeCountSize(p.mNumOfPtNode); p.mPosition = 0; } if (p.mNumOfPtNode == 0) { stack.pop(); continue; } PtNodeInfo info = dictDecoder.readPtNode(p.mAddress, formatOptions); for (int i = 0; i < info.mCharacters.length; ++i) { pushedChars[index++] = info.mCharacters[i]; } p.mPosition++; final boolean isMovedPtNode = isMovedPtNode(info.mFlags, formatOptions); final boolean isDeletedPtNode = isDeletedPtNode(info.mFlags, formatOptions); if (!isMovedPtNode && !isDeletedPtNode && info.mFrequency != FusionDictionary.PtNode.NOT_A_TERMINAL) {// found word words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); frequencies.put(info.mOriginalAddress, info.mFrequency); if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams); } if (p.mPosition == p.mNumOfPtNode) { if (formatOptions.mSupportsDynamicUpdate) { final boolean hasValidForwardLinkAddress = dictDecoder.readAndFollowForwardLink(); if (hasValidForwardLinkAddress && dictDecoder.hasNextPtNodeArray()) { // The node array has a forward link. p.mNumOfPtNode = Position.NOT_READ_PTNODE_COUNT; p.mAddress = dictDecoder.getPosition(); } else { stack.pop(); } } else { stack.pop(); } } else { // The Ptnode array has more PtNodes. p.mAddress = dictDecoder.getPosition(); } if (!isMovedPtNode && hasChildrenAddress(info.mChildrenAddress)) { final Position childrenPos = new Position(info.mChildrenAddress, index); stack.push(childrenPos); } } } /** * Reads unigrams and bigrams from the binary file. * Doesn't store a full memory representation of the dictionary. * * @param dictDecoder the dict decoder. * @param words the map to store the address as a key and the word as a value. * @param frequencies the map to store the address as a key and the frequency as a value. * @param bigrams the map to store the address as a key and the list of address as a value. * @throws IOException if the file can't be read. * @throws UnsupportedFormatException if the format of the file is not recognized. */ /* package */ static void readUnigramsAndBigramsBinary(final DictDecoder dictDecoder, final Map words, final Map frequencies, final Map> bigrams) throws IOException, UnsupportedFormatException { // Read header final FileHeader header = dictDecoder.readHeader(); readUnigramsAndBigramsBinaryInner(dictDecoder, header.mHeaderSize, words, frequencies, bigrams, header.mFormatOptions); } /** * Gets the address of the last PtNode of the exact matching word in the dictionary. * If no match is found, returns NOT_VALID_WORD. * * @param dictDecoder the dict decoder. * @param word the word we search for. * @return the address of the terminal node. * @throws IOException if the file can't be read. * @throws UnsupportedFormatException if the format of the file is not recognized. */ @UsedForTesting /* package */ static int getTerminalPosition(final DictDecoder dictDecoder, final String word) throws IOException, UnsupportedFormatException { if (word == null) return FormatSpec.NOT_VALID_WORD; dictDecoder.setPosition(0); final FileHeader header = dictDecoder.readHeader(); int wordPos = 0; final int wordLen = word.codePointCount(0, word.length()); for (int depth = 0; depth < Constants.DICTIONARY_MAX_WORD_LENGTH; ++depth) { if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; do { final int ptNodeCount = dictDecoder.readPtNodeCount(); boolean foundNextPtNode = false; for (int i = 0; i < ptNodeCount; ++i) { final int ptNodePos = dictDecoder.getPosition(); final PtNodeInfo currentInfo = dictDecoder.readPtNode(ptNodePos, header.mFormatOptions); final boolean isMovedNode = isMovedPtNode(currentInfo.mFlags, header.mFormatOptions); final boolean isDeletedNode = isDeletedPtNode(currentInfo.mFlags, header.mFormatOptions); if (isMovedNode) continue; boolean same = true; for (int p = 0, j = word.offsetByCodePoints(0, wordPos); p < currentInfo.mCharacters.length; ++p, j = word.offsetByCodePoints(j, 1)) { if (wordPos + p >= wordLen || word.codePointAt(j) != currentInfo.mCharacters[p]) { same = false; break; } } if (same) { // found the PtNode matches the word. if (wordPos + currentInfo.mCharacters.length == wordLen) { if (currentInfo.mFrequency == PtNode.NOT_A_TERMINAL || isDeletedNode) { return FormatSpec.NOT_VALID_WORD; } else { return ptNodePos; } } wordPos += currentInfo.mCharacters.length; if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { return FormatSpec.NOT_VALID_WORD; } foundNextPtNode = true; dictDecoder.setPosition(currentInfo.mChildrenAddress); break; } } // If we found the next PtNode, it is under the file pointer. // But if not, we are at the end of this node array so we expect to have // a forward link address that we need to consult and possibly resume // search on the next node array in the linked list. if (foundNextPtNode) break; if (!header.mFormatOptions.mSupportsDynamicUpdate) { return FormatSpec.NOT_VALID_WORD; } final boolean hasValidForwardLinkAddress = dictDecoder.readAndFollowForwardLink(); if (!hasValidForwardLinkAddress || !dictDecoder.hasNextPtNodeArray()) { return FormatSpec.NOT_VALID_WORD; } } while(true); } return FormatSpec.NOT_VALID_WORD; } /** * @return the size written, in bytes. Always 3 bytes. */ static int writeSInt24ToBuffer(final DictBuffer dictBuffer, final int value) { final int absValue = Math.abs(value); dictBuffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); dictBuffer.put((byte)((absValue >> 8) & 0xFF)); dictBuffer.put((byte)(absValue & 0xFF)); return 3; } /** * @return the size written, in bytes. Always 3 bytes. */ static int writeSInt24ToStream(final OutputStream destination, final int value) throws IOException { final int absValue = Math.abs(value); destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); destination.write((byte)((absValue >> 8) & 0xFF)); destination.write((byte)(absValue & 0xFF)); return 3; } /** * @return the size written, in bytes. 1, 2, or 3 bytes. */ private static int writeVariableAddress(final OutputStream destination, final int value) throws IOException { switch (BinaryDictEncoderUtils.getByteSize(value)) { case 1: destination.write((byte)value); break; case 2: destination.write((byte)(0xFF & (value >> 8))); destination.write((byte)(0xFF & value)); break; case 3: destination.write((byte)(0xFF & (value >> 16))); destination.write((byte)(0xFF & (value >> 8))); destination.write((byte)(0xFF & value)); break; } return BinaryDictEncoderUtils.getByteSize(value); } static void skipString(final DictBuffer dictBuffer, final boolean hasMultipleChars) { if (hasMultipleChars) { int character = CharEncoding.readChar(dictBuffer); while (character != FormatSpec.INVALID_CHARACTER) { character = CharEncoding.readChar(dictBuffer); } } else { CharEncoding.readChar(dictBuffer); } } /** * Write a string to a stream. * * @param destination the stream to write. * @param word the string to be written. * @return the size written, in bytes. * @throws IOException */ private static int writeString(final OutputStream destination, final String word) throws IOException { int size = 0; final int length = word.length(); for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) { final int codePoint = word.codePointAt(i); if (CharEncoding.getCharSize(codePoint) == 1) { destination.write((byte)codePoint); size++; } else { destination.write((byte)(0xFF & (codePoint >> 16))); destination.write((byte)(0xFF & (codePoint >> 8))); destination.write((byte)(0xFF & codePoint)); size += 3; } } destination.write((byte)FormatSpec.PTNODE_CHARACTERS_TERMINATOR); size += FormatSpec.PTNODE_TERMINATOR_SIZE; return size; } /** * Write a PtNode to an output stream from a PtNodeInfo. * A PtNode is an in-memory representation of a node in the patricia trie. * A PtNode info is a container for low-level information about how the * PtNode is stored in the binary format. * * @param destination the stream to write. * @param info the PtNode info to be written. * @return the size written, in bytes. */ private static int writePtNode(final OutputStream destination, final PtNodeInfo info) throws IOException { int size = FormatSpec.PTNODE_FLAGS_SIZE; destination.write((byte)info.mFlags); final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ? FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress; size += writeSInt24ToStream(destination, parentOffset); for (int i = 0; i < info.mCharacters.length; ++i) { if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) { destination.write((byte)info.mCharacters[i]); size++; } else { size += writeSInt24ToStream(destination, info.mCharacters[i]); } } if (info.mCharacters.length > 1) { destination.write((byte)FormatSpec.PTNODE_CHARACTERS_TERMINATOR); size++; } if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { destination.write((byte)info.mFrequency); size++; } if (DBG) { MakedictLog.d("writePtNode origin=" + info.mOriginalAddress + ", size=" + size + ", child=" + info.mChildrenAddress + ", characters =" + new String(info.mCharacters, 0, info.mCharacters.length)); } final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ? 0 : info.mChildrenAddress - (info.mOriginalAddress + size); writeSInt24ToStream(destination, childrenOffset); size += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) { final int shortcutListSize = BinaryDictEncoderUtils.getShortcutListSize(info.mShortcutTargets); destination.write((byte)(shortcutListSize >> 8)); destination.write((byte)(shortcutListSize & 0xFF)); size += 2; final Iterator shortcutIterator = info.mShortcutTargets.iterator(); while (shortcutIterator.hasNext()) { final WeightedString target = shortcutIterator.next(); destination.write((byte)BinaryDictEncoderUtils.makeShortcutFlags( shortcutIterator.hasNext(), target.mFrequency)); size++; size += writeString(destination, target.mWord); } } if (info.mBigrams != null) { // TODO: Consolidate this code with the code that computes the size of the bigram list // in BinaryDictEncoderUtils#computeActualNodeArraySize for (int i = 0; i < info.mBigrams.size(); ++i) { final int bigramFrequency = info.mBigrams.get(i).mFrequency; int bigramFlags = (i < info.mBigrams.size() - 1) ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0; size++; final int bigramOffset = info.mBigrams.get(i).mAddress - (info.mOriginalAddress + size); bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE : 0; switch (BinaryDictEncoderUtils.getByteSize(bigramOffset)) { case 1: bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE; break; case 2: bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES; break; case 3: bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES; break; } bigramFlags |= bigramFrequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY; destination.write((byte)bigramFlags); size += writeVariableAddress(destination, Math.abs(bigramOffset)); } } return size; } /** * Compute the size of the PtNode. */ static int computePtNodeSize(final PtNodeInfo info, final FormatOptions formatOptions) { int size = FormatSpec.PTNODE_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE + BinaryDictEncoderUtils.getPtNodeCharactersSize(info.mCharacters) + getChildrenAddressSize(info.mFlags, formatOptions); if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) { size += FormatSpec.PTNODE_FREQUENCY_SIZE; } if (info.mShortcutTargets != null && !info.mShortcutTargets.isEmpty()) { size += BinaryDictEncoderUtils.getShortcutListSize(info.mShortcutTargets); } if (info.mBigrams != null) { for (final PendingAttribute attr : info.mBigrams) { size += FormatSpec.PTNODE_FLAGS_SIZE; size += BinaryDictEncoderUtils.getByteSize(attr.mAddress); } } return size; } /** * Write a node array to the stream. * * @param destination the stream to write. * @param infos an array of PtNodeInfo to be written. * @return the size written, in bytes. * @throws IOException */ static int writeNodes(final OutputStream destination, final PtNodeInfo[] infos) throws IOException { int size = getPtNodeCountSize(infos.length); switch (getPtNodeCountSize(infos.length)) { case 1: destination.write((byte)infos.length); break; case 2: destination.write((byte)(infos.length >> 8)); destination.write((byte)(infos.length & 0xFF)); break; default: throw new RuntimeException("Invalid node count size."); } for (final PtNodeInfo info : infos) size += writePtNode(destination, info); writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS); return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE; } private static final int HEADER_READING_BUFFER_SIZE = 16384; /** * Convenience method to read the header of a binary file. * * This is quite resource intensive - don't call when performance is critical. * * @param file The file to read. * @param offset The offset in the file where to start reading the data. * @param length The length of the data file. */ private static FileHeader getDictionaryFileHeader( final File file, final long offset, final long length) throws FileNotFoundException, IOException, UnsupportedFormatException { final byte[] buffer = new byte[HEADER_READING_BUFFER_SIZE]; final DictDecoder dictDecoder = FormatSpec.getDictDecoder(file, new DictDecoder.DictionaryBufferFactory() { @Override public DictBuffer getDictionaryBuffer(File file) throws FileNotFoundException, IOException { final FileInputStream inStream = new FileInputStream(file); try { inStream.skip(offset); inStream.read(buffer); return new ByteArrayDictBuffer(buffer); } finally { inStream.close(); } } } ); return dictDecoder.readHeader(); } public static FileHeader getDictionaryFileHeaderOrNull(final File file, final long offset, final long length) { try { final FileHeader header = getDictionaryFileHeader(file, offset, length); return header; } catch (UnsupportedFormatException e) { return null; } catch (IOException e) { return null; } } /** * Helper method to hide the actual value of the no children address. */ public static boolean hasChildrenAddress(final int address) { return FormatSpec.NO_CHILDREN_ADDRESS != address; } /** * Helper method to check whether the node is moved. */ public static boolean isMovedPtNode(final int flags, final FormatOptions options) { return options.mSupportsDynamicUpdate && ((flags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) == FormatSpec.FLAG_IS_MOVED); } /** * Helper method to check whether the dictionary can be updated dynamically. */ public static boolean supportsDynamicUpdate(final FormatOptions options) { return options.mVersion >= FormatSpec.FIRST_VERSION_WITH_DYNAMIC_UPDATE && options.mSupportsDynamicUpdate; } /** * Helper method to check whether the node is deleted. */ public static boolean isDeletedPtNode(final int flags, final FormatOptions formatOptions) { return formatOptions.mSupportsDynamicUpdate && ((flags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) == FormatSpec.FLAG_IS_DELETED); } /** * Compute the binary size of the node count * @param count the node count * @return the size of the node count, either 1 or 2 bytes. */ public static int getPtNodeCountSize(final int count) { if (FormatSpec.MAX_PTNODES_FOR_ONE_BYTE_PTNODE_COUNT >= count) { return 1; } else if (FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY >= count) { return 2; } else { throw new RuntimeException("Can't have more than " + FormatSpec.MAX_PTNODES_IN_A_PT_NODE_ARRAY + " PtNode in a PtNodeArray (found " + count + ")"); } } static int getChildrenAddressSize(final int optionFlags, final FormatOptions formatOptions) { if (formatOptions.mSupportsDynamicUpdate) return FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE; switch (optionFlags & FormatSpec.MASK_CHILDREN_ADDRESS_TYPE) { case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE: return 1; case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES: return 2; case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES: return 3; case FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS: default: return 0; } } /** * Calculate bigram frequency from compressed value * * @param unigramFrequency * @param bigramFrequency compressed frequency * @return approximate bigram frequency */ public static int reconstructBigramFrequency(final int unigramFrequency, final int bigramFrequency) { final float stepSize = (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY); final float resultFreqFloat = unigramFrequency + stepSize * (bigramFrequency + 1.0f); return (int)resultFreqFloat; } }