1/* 2 ******************************************************************************* 3 * Copyright (C) 1996-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8package com.ibm.icu.text; 9 10import java.io.IOException; 11import java.nio.ByteBuffer; 12import java.text.CharacterIterator; 13 14import com.ibm.icu.impl.ICUBinary; 15 16/* 17 * This is a class used to load in the compact trie dictionary file 18 * used for dictionary based break iteration. 19 */ 20class BreakCTDictionary { 21 private CompactTrieHeader fData; 22 23 static class CompactTrieHeader { 24 int size; // Size of data in bytes 25 26 int magic; // Magic number (including versions) 27 28 int nodeCount; // Number of entries in offsets[] 29 30 int root; // Node number of the root node 31 32 int []offset; // Offsets to nodes from start of data 33 34 CompactTrieHeader() { 35 size = 0; 36 magic = 0; 37 nodeCount = 0; 38 root = 0; 39 offset = null; 40 } 41 } 42 43 static final class CompactTrieNodeFlags { 44 static final int kVerticalNode = 0x1000; // This is a vertical node 45 46 static final int kParentEndsWord = 0x2000; // The node whose equal link 47 // points to this ends a 48 // word 49 50 static final int kReservedFlag1 = 0x4000; 51 52 static final int kReservedFlag2 = 0x8000; 53 54 static final int kCountMask = 0x0FFF; // The count portion of 55 // flagscount 56 57 static final int kFlagMask = 0xF000; // The flags portion of 58 // flagscount 59 } 60 61 // The two node types are distinguished by the kVerticalNode flag. 62 static class CompactTrieHorizontalNode { 63 char ch; // UChar 64 65 int equal; // Equal link node index 66 67 CompactTrieHorizontalNode(char newCh, int newEqual) { 68 ch = newCh; 69 equal = newEqual; 70 } 71 } 72 73 static class CompactTrieVerticalNode { 74 int equal; // Equal link node index 75 76 char chars[]; // Code units 77 78 CompactTrieVerticalNode() { 79 equal = 0; 80 chars = null; 81 } 82 } 83 84 private CompactTrieNodes getCompactTrieNode(int node) { 85 return nodes[node]; 86 } 87 88 // private class to hold both node information 89 static class CompactTrieNodes { 90 short flagscount; // Count of sub-entries, plus flags 91 92 CompactTrieHorizontalNode[] hnode; 93 94 CompactTrieVerticalNode vnode; 95 96 CompactTrieNodes() { 97 flagscount = 0; 98 hnode = null; 99 vnode = null; 100 } 101 } 102 103 private CompactTrieNodes[] nodes; 104 105 // Constructor 106 public BreakCTDictionary(ByteBuffer bytes) throws IOException { 107 ICUBinary.readHeader(bytes, DATA_FORMAT_ID, null); 108 109 // Get header information 110 fData = new CompactTrieHeader(); 111 fData.size = bytes.getInt(); 112 fData.magic = bytes.getInt(); 113 fData.nodeCount = bytes.getShort(); 114 fData.root = bytes.getShort(); 115 116 loadBreakCTDictionary(bytes); 117 } 118 119 // Loads the compact trie dictionary file into the CompactTrieNodes 120 private void loadBreakCTDictionary(ByteBuffer bytes) throws IOException { 121 // skip over offset information 122 for (int i = 0; i < fData.nodeCount; i++) { 123 bytes.getInt(); 124 } 125 126 // Create compact trie dictionary 127 nodes = new CompactTrieNodes[fData.nodeCount]; 128 nodes[0] = new CompactTrieNodes(); 129 130 // Load in compact trie dictionary 131 for (int j = 1; j < fData.nodeCount; j++) { 132 nodes[j] = new CompactTrieNodes(); 133 nodes[j].flagscount = bytes.getShort(); 134 135 int count = nodes[j].flagscount & CompactTrieNodeFlags.kCountMask; 136 137 if (count != 0) { 138 boolean isVerticalNode = (nodes[j].flagscount & CompactTrieNodeFlags.kVerticalNode) != 0; 139 140 // Vertical node 141 if (isVerticalNode) { 142 nodes[j].vnode = new CompactTrieVerticalNode(); 143 nodes[j].vnode.equal = bytes.getShort(); 144 145 nodes[j].vnode.chars = new char[count]; 146 for (int l = 0; l < count; l++) { 147 nodes[j].vnode.chars[l] = bytes.getChar(); 148 } 149 } else { // Horizontal node 150 nodes[j].hnode = new CompactTrieHorizontalNode[count]; 151 for (int n = 0; n < count; n++) { 152 nodes[j].hnode[n] = new CompactTrieHorizontalNode( 153 bytes.getChar(), bytes.getShort()); 154 } 155 } 156 } 157 } 158 } 159 160 /** 161 * Find dictionary words that match the text. 162 * 163 * @param text A CharacterIterator representing the text. The iterator is 164 * left after the longest prefix match in the dictionary. 165 * @param maxLength The maximum number of code units to match. 166 * @param lengths An array that is filled with the lengths of words that matched. 167 * @param count Filled with the number of elements output in lengths. 168 * @param limit The size of the lengths array; this limits the number of words output. 169 * @return The number of characters in text that were matched. 170 */ 171 public int matches(CharacterIterator text, int maxLength, int lengths[], 172 int count[], int limit) { 173 // Current implementation works in UTF-16 space 174 CompactTrieNodes node = getCompactTrieNode(fData.root); 175 int mycount = 0; 176 177 char uc = text.current(); 178 int i = 0; 179 boolean exitFlag = false; 180 181 while (node != null) { 182 // Check if the node we just exited ends a word 183 if (limit > 0 184 && (node.flagscount & CompactTrieNodeFlags.kParentEndsWord) != 0) { 185 lengths[mycount++] = i; 186 --limit; 187 } 188 // Check that we haven't exceeded the maximum number of input 189 // characters. 190 // We have to do that here rather than in the while condition so 191 // that 192 // we can check for ending of a word above. 193 if (i >= maxLength) { 194 break; 195 } 196 197 int nodeCount = (node.flagscount & CompactTrieNodeFlags.kCountMask); 198 if (nodeCount == 0) { 199 // Special terminal node; return now 200 break; 201 } 202 if ((node.flagscount & CompactTrieNodeFlags.kVerticalNode) != 0) { 203 // Vertical node; check all the characters in it 204 CompactTrieVerticalNode vnode = node.vnode; 205 for (int j = 0; j < nodeCount && i < maxLength; j++) { 206 if (uc != vnode.chars[j]) { 207 // We hit a non equal character return; 208 exitFlag = true; 209 break; 210 } 211 text.next(); 212 uc = text.current(); 213 i++; 214 } 215 if (exitFlag) { 216 break; 217 } 218 // To get here, we must have come through the whole list successfully; 219 // go on to the next node. Note that a word cannot end in the middle 220 // of a vertical node. 221 node = getCompactTrieNode(vnode.equal); 222 } else { 223 // Horizontal node; do binary search 224 CompactTrieHorizontalNode[] hnode = node.hnode; 225 int low = 0; 226 int high = nodeCount - 1; 227 int middle; 228 node = null; // If we don't find a match, we'll fall out of the loop 229 while (high >= low) { 230 middle = (high + low) >>> 1; 231 if (uc == hnode[middle].ch) { 232 // We hit a match; get the next node and next character 233 node = getCompactTrieNode(hnode[middle].equal); 234 text.next(); 235 uc = text.current(); 236 i++; 237 break; 238 } else if (uc < hnode[middle].ch) { 239 high = middle - 1; 240 } else { 241 low = middle + 1; 242 } 243 } 244 } 245 } 246 247 count[0] = mycount; 248 return i; 249 } 250 251 // Use for reading the header portion of the file 252 private static final int DATA_FORMAT_ID = 0x54724463; 253} 254