1/* 2******************************************************************************* 3* Copyright (C) 2003-2011, International Business Machines Corporation and * 4* others. All Rights Reserved. * 5******************************************************************************* 6*/ 7package com.ibm.icu.text; 8 9import java.io.IOException; 10import java.io.OutputStream; 11import java.util.ArrayList; 12import java.util.List; 13 14import com.ibm.icu.impl.Assert; 15import com.ibm.icu.impl.IntTrieBuilder; 16 17// 18// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules 19// (part of the rule building process.) 20// 21// Starting with the rules parse tree from the scanner, 22// 23// - Enumerate the set of UnicodeSets that are referenced 24// by the RBBI rules. 25// - compute a set of non-overlapping character ranges 26// with all characters within a range belonging to the same 27// set of input uniocde sets. 28// - Derive a set of non-overlapping UnicodeSet (like things) 29// that will correspond to columns in the state table for 30// the RBBI execution engine. All characters within one 31// of these sets belong to the same set of the original 32// UnicodeSets from the user's rules. 33// - construct the trie table that maps input characters 34// to the index of the matching non-overlapping set of set from 35// the previous step. 36// 37class RBBISetBuilder { 38 static class RangeDescriptor { 39 int fStartChar; // Start of range, unicode 32 bit value. 40 int fEndChar; // End of range, unicode 32 bit value. 41 int fNum; // runtime-mapped input value for this range. 42 List<RBBINode> fIncludesSets; // vector of the the original 43 // Unicode sets that include this range. 44 // (Contains ptrs to uset nodes) 45 RangeDescriptor fNext; // Next RangeDescriptor in the linked list. 46 47 RangeDescriptor() { 48 fIncludesSets = new ArrayList<RBBINode>(); 49 } 50 51 RangeDescriptor(RangeDescriptor other) { 52 fStartChar = other.fStartChar; 53 fEndChar = other.fEndChar; 54 fNum = other.fNum; 55 fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets); 56 } 57 58 //------------------------------------------------------------------------------------- 59 // 60 // RangeDesriptor::split() 61 // 62 //------------------------------------------------------------------------------------- 63 void split(int where) { 64 Assert.assrt(where>fStartChar && where<=fEndChar); 65 RangeDescriptor nr = new RangeDescriptor(this); 66 67 // RangeDescriptor copy constructor copies all fields. 68 // Only need to update those that are different after the split. 69 nr.fStartChar = where; 70 this.fEndChar = where-1; 71 nr.fNext = this.fNext; 72 this.fNext = nr; 73 74 // TODO: fIncludesSets is not updated. Check it out. 75 // Probably because they haven't been populated yet, 76 // but still sloppy. 77 } 78 79 80 //------------------------------------------------------------------------------------- 81 // 82 // RangeDescriptor::setDictionaryFlag 83 // 84 // Character Category Numbers that include characters from 85 // the original Unicode Set named "dictionary" have bit 14 86 // set to 1. The RBBI runtime engine uses this to trigger 87 // use of the word dictionary. 88 // 89 // This function looks through the Unicode Sets that it 90 // (the range) includes, and sets the bit in fNum when 91 // "dictionary" is among them. 92 // 93 // TODO: a faster way would be to find the set node for 94 // "dictionary" just once, rather than looking it 95 // up by name every time. 96 // 97 // ------------------------------------------------------------------------------------- 98 void setDictionaryFlag() { 99 int i; 100 101 for (i=0; i<this.fIncludesSets.size(); i++) { 102 RBBINode usetNode = fIncludesSets.get(i); 103 String setName = ""; 104 RBBINode setRef = usetNode.fParent; 105 if (setRef != null) { 106 RBBINode varRef = setRef.fParent; 107 if (varRef != null && varRef.fType == RBBINode.varRef) { 108 setName = varRef.fText; 109 } 110 } 111 if (setName.equals("dictionary")) { 112 this.fNum |= 0x4000; 113 break; 114 } 115 } 116 117 } 118 } 119 120 121 RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us. 122 RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors 123 124 IntTrieBuilder fTrie; // The mapping TRIE that is the end result of processing 125 int fTrieSize; // the Unicode Sets. 126 127 // Groups correspond to character categories - 128 // groups of ranges that are in the same original UnicodeSets. 129 // fGroupCount is the index of the last used group. 130 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. 131 // State table column 0 is not used. Column 1 is for end-of-input. 132 // column 2 is for group 0. Funny counting. 133 int fGroupCount; 134 135 boolean fSawBOF; 136 137 138 //------------------------------------------------------------------------ 139 // 140 // RBBISetBuilder Constructor 141 // 142 //------------------------------------------------------------------------ 143 RBBISetBuilder(RBBIRuleBuilder rb) 144 { 145 fRB = rb; 146 } 147 148 149 //------------------------------------------------------------------------ 150 // 151 // build Build the list of non-overlapping character ranges 152 // from the Unicode Sets. 153 // 154 //------------------------------------------------------------------------ 155 void build() { 156 RangeDescriptor rlRange; 157 158 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();} 159 160 // Initialize the process by creating a single range encompassing all characters 161 // that is in no sets. 162 // 163 fRangeList = new RangeDescriptor(); 164 fRangeList.fStartChar = 0; 165 fRangeList.fEndChar = 0x10ffff; 166 167 // 168 // Find the set of non-overlapping ranges of characters 169 // 170 for (RBBINode usetNode : fRB.fUSetNodes) { 171 UnicodeSet inputSet = usetNode.fInputSet; 172 int inputSetRangeCount = inputSet.getRangeCount(); 173 int inputSetRangeIndex = 0; 174 rlRange = fRangeList; 175 176 for (;;) { 177 if (inputSetRangeIndex >= inputSetRangeCount) { 178 break; 179 } 180 int inputSetRangeBegin = inputSet.getRangeStart(inputSetRangeIndex); 181 int inputSetRangeEnd = inputSet.getRangeEnd(inputSetRangeIndex); 182 183 // skip over ranges from the range list that are completely 184 // below the current range from the input unicode set. 185 while (rlRange.fEndChar < inputSetRangeBegin) { 186 rlRange = rlRange.fNext; 187 } 188 189 // If the start of the range from the range list is before with 190 // the start of the range from the unicode set, split the range list range 191 // in two, with one part being before (wholly outside of) the unicode set 192 // and the other containing the rest. 193 // Then continue the loop; the post-split current range will then be skipped 194 // over 195 if (rlRange.fStartChar < inputSetRangeBegin) { 196 rlRange.split(inputSetRangeBegin); 197 continue; 198 } 199 200 // Same thing at the end of the ranges... 201 // If the end of the range from the range list doesn't coincide with 202 // the end of the range from the unicode set, split the range list 203 // range in two. The first part of the split range will be 204 // wholly inside the Unicode set. 205 if (rlRange.fEndChar > inputSetRangeEnd) { 206 rlRange.split(inputSetRangeEnd+1); 207 } 208 209 // The current rlRange is now entirely within the UnicodeSet range. 210 // Add this unicode set to the list of sets for this rlRange 211 if (rlRange.fIncludesSets.indexOf(usetNode) == -1) { 212 rlRange.fIncludesSets.add(usetNode); 213 } 214 215 // Advance over ranges that we are finished with. 216 if (inputSetRangeEnd == rlRange.fEndChar) { 217 inputSetRangeIndex++; 218 } 219 rlRange = rlRange.fNext; 220 } 221 } 222 223 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("range")>=0) { printRanges();} 224 225 // 226 // Group the above ranges, with each group consisting of one or more 227 // ranges that are in exactly the same set of original UnicodeSets. 228 // The groups are numbered, and these group numbers are the set of 229 // input symbols recognized by the run-time state machine. 230 // 231 // Numbering: # 0 (state table column 0) is unused. 232 // # 1 is reserved - table column 1 is for end-of-input 233 // # 2 is reserved - table column 2 is for beginning-in-input 234 // # 3 is the first range list. 235 // 236 RangeDescriptor rlSearchRange; 237 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 238 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) { 239 if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) { 240 rlRange.fNum = rlSearchRange.fNum; 241 break; 242 } 243 } 244 if (rlRange.fNum == 0) { 245 fGroupCount ++; 246 rlRange.fNum = fGroupCount+2; 247 rlRange.setDictionaryFlag(); 248 addValToSets(rlRange.fIncludesSets, fGroupCount+2); 249 } 250 } 251 252 // Handle input sets that contain the special string {eof}. 253 // Column 1 of the state table is reserved for EOF on input. 254 // Column 2 is reserved for before-the-start-input. 255 // (This column can be optimized away later if there are no rule 256 // references to {bof}.) 257 // Add this column value (1 or 2) to the equivalent expression 258 // subtree for each UnicodeSet that contains the string {eof} 259 // Because {bof} and {eof} are not a characters in the normal sense, 260 // they doesn't affect the computation of ranges or TRIE. 261 262 String eofString = "eof"; 263 String bofString = "bof"; 264 265 for (RBBINode usetNode : fRB.fUSetNodes) { 266 UnicodeSet inputSet = usetNode.fInputSet; 267 if (inputSet.contains(eofString)) { 268 addValToSet(usetNode, 1); 269 } 270 if (inputSet.contains(bofString)) { 271 addValToSet(usetNode, 2); 272 fSawBOF = true; 273 } 274 } 275 276 277 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();} 278 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();} 279 280 281 //IntTrieBuilder(int aliasdata[], int maxdatalength, 282 // int initialvalue, int leadunitvalue, 283 // boolean latin1linear) 284 285 fTrie = new IntTrieBuilder(null, // Data array (utrie will allocate one) 286 100000, // Max Data Length 287 0, // Initial value for all code points 288 0, // Lead Surrogate unit value, 289 true); // Keep Latin 1 in separately. 290 291 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 292 fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar+1, rlRange.fNum, true); 293 } 294 } 295 296 297 298 //----------------------------------------------------------------------------------- 299 // 300 // RBBIDataManipulate A little internal class needed only to wrap of the 301 // getFoldedValue() function needed for Trie table creation. 302 // 303 //----------------------------------------------------------------------------------- 304 class RBBIDataManipulate implements IntTrieBuilder.DataManipulate { 305 public int getFoldedValue(int start, int offset) { 306 int value; 307 int limit; 308 boolean [] inBlockZero = new boolean[1]; 309 310 limit = start + 0x400; 311 while(start<limit) { 312 value = fTrie.getValue(start, inBlockZero); 313 if (inBlockZero[0]) { 314 start += IntTrieBuilder.DATA_BLOCK_LENGTH; 315 } else if (value != 0) { 316 return offset | 0x08000; 317 } else { 318 ++start; 319 } 320 } 321 return 0; 322 } 323 } 324 RBBIDataManipulate dm = new RBBIDataManipulate(); 325 326 //----------------------------------------------------------------------------------- 327 // 328 // getTrieSize() Return the size that will be required to serialize the Trie. 329 // 330 //----------------------------------------------------------------------------------- 331 int getTrieSize() { 332 int size = 0; 333 try { 334 // The trie serialize function returns the size of the data written. 335 // null output stream says give size only, don't actually write anything. 336 size = fTrie.serialize(null, true, dm ); 337 } catch (IOException e) { 338 Assert.assrt (false); 339 } 340 return size; 341 } 342 343 344 //----------------------------------------------------------------------------------- 345 // 346 // serializeTrie() Write the serialized trie to an output stream 347 // 348 //----------------------------------------------------------------------------------- 349 void serializeTrie(OutputStream os) throws IOException { 350 fTrie.serialize(os, true, dm ); 351 } 352 353 //------------------------------------------------------------------------ 354 // 355 // addValToSets Add a runtime-mapped input value to each uset from a 356 // list of uset nodes. (val corresponds to a state table column.) 357 // For each of the original Unicode sets - which correspond 358 // directly to uset nodes - a logically equivalent expression 359 // is constructed in terms of the remapped runtime input 360 // symbol set. This function adds one runtime input symbol to 361 // a list of sets. 362 // 363 // The "logically equivalent expression" is the tree for an 364 // or-ing together of all of the symbols that go into the set. 365 // 366 //------------------------------------------------------------------------ 367 void addValToSets(List<RBBINode> sets, int val) { 368 for (RBBINode usetNode : sets) { 369 addValToSet(usetNode, val); 370 } 371 } 372 373 void addValToSet(RBBINode usetNode, int val) { 374 RBBINode leafNode = new RBBINode(RBBINode.leafChar); 375 leafNode.fVal = val; 376 if (usetNode.fLeftChild == null) { 377 usetNode.fLeftChild = leafNode; 378 leafNode.fParent = usetNode; 379 } else { 380 // There are already input symbols present for this set. 381 // Set up an OR node, with the previous stuff as the left child 382 // and the new value as the right child. 383 RBBINode orNode = new RBBINode(RBBINode.opOr); 384 orNode.fLeftChild = usetNode.fLeftChild; 385 orNode.fRightChild = leafNode; 386 orNode.fLeftChild.fParent = orNode; 387 orNode.fRightChild.fParent = orNode; 388 usetNode.fLeftChild = orNode; 389 orNode.fParent = usetNode; 390 } 391 } 392 393 394 //------------------------------------------------------------------------ 395 // 396 // getNumCharCategories 397 // 398 //------------------------------------------------------------------------ 399 int getNumCharCategories() { 400 return fGroupCount + 3; 401 } 402 403 404 //------------------------------------------------------------------------ 405 // 406 // sawBOF 407 // 408 //------------------------------------------------------------------------ 409 boolean sawBOF() { 410 return fSawBOF; 411 } 412 413 414 //------------------------------------------------------------------------ 415 // 416 // getFirstChar Given a runtime RBBI character category, find 417 // the first UChar32 that is in the set of chars 418 // in the category. 419 //------------------------------------------------------------------------ 420 int getFirstChar(int category) { 421 RangeDescriptor rlRange; 422 int retVal = -1; 423 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 424 if (rlRange.fNum == category) { 425 retVal = rlRange.fStartChar; 426 break; 427 } 428 } 429 return retVal; 430 } 431 432 433 434 //------------------------------------------------------------------------ 435 // 436 // printRanges A debugging function. 437 // dump out all of the range definitions. 438 // 439 //------------------------------------------------------------------------ 440 ///CLOVER:OFF 441 void printRanges() { 442 RangeDescriptor rlRange; 443 int i; 444 445 System.out.print("\n\n Nonoverlapping Ranges ...\n"); 446 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 447 System.out.print(" " + rlRange.fNum + " " + rlRange.fStartChar + "-" + rlRange.fEndChar); 448 449 for (i=0; i<rlRange.fIncludesSets.size(); i++) { 450 RBBINode usetNode = rlRange.fIncludesSets.get(i); 451 String setName = "anon"; 452 RBBINode setRef = usetNode.fParent; 453 if (setRef != null) { 454 RBBINode varRef = setRef.fParent; 455 if (varRef != null && varRef.fType == RBBINode.varRef) { 456 setName = varRef.fText; 457 } 458 } 459 System.out.print(setName); System.out.print(" "); 460 } 461 System.out.println(""); 462 } 463 } 464 ///CLOVER:ON 465 466 467 //------------------------------------------------------------------------ 468 // 469 // printRangeGroups A debugging function. 470 // dump out all of the range groups. 471 // 472 //------------------------------------------------------------------------ 473 ///CLOVER:OFF 474 void printRangeGroups() { 475 RangeDescriptor rlRange; 476 RangeDescriptor tRange; 477 int i; 478 int lastPrintedGroupNum = 0; 479 480 System.out.print("\nRanges grouped by Unicode Set Membership...\n"); 481 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 482 int groupNum = rlRange.fNum & 0xbfff; 483 if (groupNum > lastPrintedGroupNum) { 484 lastPrintedGroupNum = groupNum; 485 if (groupNum<10) {System.out.print(" ");} 486 System.out.print(groupNum + " "); 487 488 if ((rlRange.fNum & 0x4000) != 0) { System.out.print(" <DICT> ");} 489 490 for (i=0; i<rlRange.fIncludesSets.size(); i++) { 491 RBBINode usetNode = rlRange.fIncludesSets.get(i); 492 String setName = "anon"; 493 RBBINode setRef = usetNode.fParent; 494 if (setRef != null) { 495 RBBINode varRef = setRef.fParent; 496 if (varRef != null && varRef.fType == RBBINode.varRef) { 497 setName = varRef.fText; 498 } 499 } 500 System.out.print(setName); System.out.print(" "); 501 } 502 503 i = 0; 504 for (tRange = rlRange; tRange != null; tRange = tRange.fNext) { 505 if (tRange.fNum == rlRange.fNum) { 506 if (i++ % 5 == 0) { 507 System.out.print("\n "); 508 } 509 RBBINode.printHex(tRange.fStartChar, -1); 510 System.out.print("-"); 511 RBBINode.printHex(tRange.fEndChar, 0); 512 } 513 } 514 System.out.print("\n"); 515 } 516 } 517 System.out.print("\n"); 518 } 519 ///CLOVER:ON 520 521 522 //------------------------------------------------------------------------ 523 // 524 // printSets A debugging function. 525 // dump out all of the set definitions. 526 // 527 //------------------------------------------------------------------------ 528 ///CLOVER:OFF 529 void printSets() { 530 int i; 531 System.out.print("\n\nUnicode Sets List\n------------------\n"); 532 for (i=0; i<fRB.fUSetNodes.size(); i++) { 533 RBBINode usetNode; 534 RBBINode setRef; 535 RBBINode varRef; 536 String setName; 537 538 usetNode = fRB.fUSetNodes.get(i); 539 540 //System.out.print(" " + i + " "); 541 RBBINode.printInt(2, i); 542 setName = "anonymous"; 543 setRef = usetNode.fParent; 544 if (setRef != null) { 545 varRef = setRef.fParent; 546 if (varRef != null && varRef.fType == RBBINode.varRef) { 547 setName = varRef.fText; 548 } 549 } 550 System.out.print(" " + setName); 551 System.out.print(" "); 552 System.out.print(usetNode.fText); 553 System.out.print("\n"); 554 if (usetNode.fLeftChild != null) { 555 usetNode.fLeftChild.printTree(true); 556 } 557 } 558 System.out.print("\n"); 559 } 560 ///CLOVER:ON 561} 562