1/* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17package com.android.inputmethod.latin.makedict; 18 19import com.android.inputmethod.latin.Constants; 20import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface; 21import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader; 22import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; 23import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup; 24 25import java.io.IOException; 26import java.util.ArrayList; 27import java.util.Map; 28import java.util.Stack; 29 30public final class BinaryDictIOUtils { 31 private static final boolean DBG = false; 32 33 private static final class Position { 34 public static final int NOT_READ_GROUPCOUNT = -1; 35 36 public int mAddress; 37 public int mNumOfCharGroup; 38 public int mPosition; 39 public int mLength; 40 41 public Position(int address, int length) { 42 mAddress = address; 43 mLength = length; 44 mNumOfCharGroup = NOT_READ_GROUPCOUNT; 45 } 46 } 47 48 /** 49 * Tours all node without recursive call. 50 */ 51 private static void readUnigramsAndBigramsBinaryInner( 52 final FusionDictionaryBufferInterface buffer, final int headerSize, 53 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 54 final Map<Integer, ArrayList<PendingAttribute>> bigrams, 55 final FormatOptions formatOptions) { 56 int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1]; 57 58 Stack<Position> stack = new Stack<Position>(); 59 int index = 0; 60 61 Position initPos = new Position(headerSize, 0); 62 stack.push(initPos); 63 64 while (!stack.empty()) { 65 Position p = stack.peek(); 66 67 if (DBG) { 68 MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" + 69 p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength); 70 } 71 72 if (buffer.position() != p.mAddress) buffer.position(p.mAddress); 73 if (index != p.mLength) index = p.mLength; 74 75 if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) { 76 p.mNumOfCharGroup = BinaryDictInputOutput.readCharGroupCount(buffer); 77 p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup); 78 p.mPosition = 0; 79 } 80 if (p.mNumOfCharGroup == 0) { 81 stack.pop(); 82 continue; 83 } 84 CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer, 85 p.mAddress - headerSize, formatOptions); 86 for (int i = 0; i < info.mCharacters.length; ++i) { 87 pushedChars[index++] = info.mCharacters[i]; 88 } 89 p.mPosition++; 90 91 final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags, 92 formatOptions); 93 if (!isMovedGroup 94 && info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word 95 words.put(info.mOriginalAddress, new String(pushedChars, 0, index)); 96 frequencies.put(info.mOriginalAddress, info.mFrequency); 97 if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams); 98 } 99 100 if (p.mPosition == p.mNumOfCharGroup) { 101 if (formatOptions.mSupportsDynamicUpdate) { 102 final int forwardLinkAddress = buffer.readUnsignedInt24(); 103 if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) { 104 // the node has a forward link. 105 p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT; 106 p.mAddress = forwardLinkAddress; 107 } else { 108 stack.pop(); 109 } 110 } else { 111 stack.pop(); 112 } 113 } else { 114 // the node has more groups. 115 p.mAddress = buffer.position(); 116 } 117 118 if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) { 119 Position childrenPos = new Position(info.mChildrenAddress + headerSize, index); 120 stack.push(childrenPos); 121 } 122 } 123 } 124 125 /** 126 * Reads unigrams and bigrams from the binary file. 127 * Doesn't make the memory representation of the dictionary. 128 * 129 * @param buffer the buffer to read. 130 * @param words the map to store the address as a key and the word as a value. 131 * @param frequencies the map to store the address as a key and the frequency as a value. 132 * @param bigrams the map to store the address as a key and the list of address as a value. 133 * @throws IOException 134 * @throws UnsupportedFormatException 135 */ 136 public static void readUnigramsAndBigramsBinary(final FusionDictionaryBufferInterface buffer, 137 final Map<Integer, String> words, final Map<Integer, Integer> frequencies, 138 final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException, 139 UnsupportedFormatException { 140 // Read header 141 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 142 readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams, 143 header.mFormatOptions); 144 } 145 146 /** 147 * Gets the address of the last CharGroup of the exact matching word in the dictionary. 148 * If no match is found, returns NOT_VALID_WORD. 149 * 150 * @param buffer the buffer to read. 151 * @param word the word we search for. 152 * @return the address of the terminal node. 153 * @throws IOException 154 * @throws UnsupportedFormatException 155 */ 156 public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer, 157 final String word) throws IOException, UnsupportedFormatException { 158 if (word == null) return FormatSpec.NOT_VALID_WORD; 159 if (buffer.position() != 0) buffer.position(0); 160 161 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 162 int wordPos = 0; 163 final int wordLen = word.codePointCount(0, word.length()); 164 for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) { 165 if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD; 166 167 do { 168 int groupOffset = buffer.position() - header.mHeaderSize; 169 final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer); 170 groupOffset += BinaryDictInputOutput.getGroupCountSize(charGroupCount); 171 172 boolean foundNextCharGroup = false; 173 for (int i = 0; i < charGroupCount; ++i) { 174 final int charGroupPos = buffer.position(); 175 final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer, 176 buffer.position(), header.mFormatOptions); 177 if (BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags, 178 header.mFormatOptions)) { 179 continue; 180 } 181 boolean same = true; 182 for (int p = 0, j = word.offsetByCodePoints(0, wordPos); 183 p < currentInfo.mCharacters.length; 184 ++p, j = word.offsetByCodePoints(j, 1)) { 185 if (wordPos + p >= wordLen 186 || word.codePointAt(j) != currentInfo.mCharacters[p]) { 187 same = false; 188 break; 189 } 190 } 191 192 if (same) { 193 // found the group matches the word. 194 if (wordPos + currentInfo.mCharacters.length == wordLen) { 195 if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL) { 196 return FormatSpec.NOT_VALID_WORD; 197 } else { 198 return charGroupPos; 199 } 200 } 201 wordPos += currentInfo.mCharacters.length; 202 if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) { 203 return FormatSpec.NOT_VALID_WORD; 204 } 205 foundNextCharGroup = true; 206 buffer.position(currentInfo.mChildrenAddress); 207 break; 208 } 209 groupOffset = currentInfo.mEndAddress; 210 } 211 212 // If we found the next char group, it is under the file pointer. 213 // But if not, we are at the end of this node so we expect to have 214 // a forward link address that we need to consult and possibly resume 215 // search on the next node in the linked list. 216 if (foundNextCharGroup) break; 217 if (!header.mFormatOptions.mSupportsDynamicUpdate) { 218 return FormatSpec.NOT_VALID_WORD; 219 } 220 221 final int forwardLinkAddress = buffer.readUnsignedInt24(); 222 if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) { 223 return FormatSpec.NOT_VALID_WORD; 224 } 225 buffer.position(forwardLinkAddress); 226 } while(true); 227 } 228 return FormatSpec.NOT_VALID_WORD; 229 } 230 231 /** 232 * Delete the word from the binary file. 233 * 234 * @param buffer the buffer to write. 235 * @param word the word we delete 236 * @throws IOException 237 * @throws UnsupportedFormatException 238 */ 239 public static void deleteWord(final FusionDictionaryBufferInterface buffer, 240 final String word) throws IOException, UnsupportedFormatException { 241 buffer.position(0); 242 final FileHeader header = BinaryDictInputOutput.readHeader(buffer); 243 final int wordPosition = getTerminalPosition(buffer, word); 244 if (wordPosition == FormatSpec.NOT_VALID_WORD) return; 245 246 buffer.position(wordPosition); 247 final int flags = buffer.readUnsignedByte(); 248 final int newFlags = flags ^ FormatSpec.FLAG_IS_TERMINAL; 249 buffer.position(wordPosition); 250 buffer.put((byte)newFlags); 251 } 252 253 private static void putSInt24(final FusionDictionaryBufferInterface buffer, 254 final int value) { 255 final int absValue = Math.abs(value); 256 buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF)); 257 buffer.put((byte)((absValue >> 8) & 0xFF)); 258 buffer.put((byte)(absValue & 0xFF)); 259 } 260 261 /** 262 * Update a parent address in a CharGroup that is addressed by groupOriginAddress. 263 * 264 * @param buffer the buffer to write. 265 * @param groupOriginAddress the address of the group. 266 * @param newParentAddress the absolute address of the parent. 267 * @param formatOptions file format options. 268 */ 269 public static void updateParentAddress(final FusionDictionaryBufferInterface buffer, 270 final int groupOriginAddress, final int newParentAddress, 271 final FormatOptions formatOptions) { 272 final int originalPosition = buffer.position(); 273 buffer.position(groupOriginAddress); 274 if (!formatOptions.mSupportsDynamicUpdate) { 275 throw new RuntimeException("this file format does not support parent addresses"); 276 } 277 final int flags = buffer.readUnsignedByte(); 278 final int parentOffset = newParentAddress - groupOriginAddress; 279 putSInt24(buffer, parentOffset); 280 buffer.position(originalPosition); 281 } 282} 283