1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* Copyright (C) 2010-2011, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* file name: ucharstrie.h 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2010nov14 14* created by: Markus W. Scherer 15*/ 16 17#include "unicode/utypes.h" 18#include "unicode/appendable.h" 19#include "unicode/ucharstrie.h" 20#include "unicode/uobject.h" 21#include "unicode/utf16.h" 22#include "cmemory.h" 23#include "uassert.h" 24 25U_NAMESPACE_BEGIN 26 27UCharsTrie::~UCharsTrie() { 28 uprv_free(ownedArray_); 29} 30 31UStringTrieResult 32UCharsTrie::current() const { 33 const UChar *pos=pos_; 34 if(pos==NULL) { 35 return USTRINGTRIE_NO_MATCH; 36 } else { 37 int32_t node; 38 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? 39 valueResult(node) : USTRINGTRIE_NO_VALUE; 40 } 41} 42 43UStringTrieResult 44UCharsTrie::firstForCodePoint(UChar32 cp) { 45 return cp<=0xffff ? 46 first(cp) : 47 (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ? 48 next(U16_TRAIL(cp)) : 49 USTRINGTRIE_NO_MATCH); 50} 51 52UStringTrieResult 53UCharsTrie::nextForCodePoint(UChar32 cp) { 54 return cp<=0xffff ? 55 next(cp) : 56 (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ? 57 next(U16_TRAIL(cp)) : 58 USTRINGTRIE_NO_MATCH); 59} 60 61UStringTrieResult 62UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) { 63 // Branch according to the current unit. 64 if(length==0) { 65 length=*pos++; 66 } 67 ++length; 68 // The length of the branch is the number of units to select from. 69 // The data structure encodes a binary search. 70 while(length>kMaxBranchLinearSubNodeLength) { 71 if(uchar<*pos++) { 72 length>>=1; 73 pos=jumpByDelta(pos); 74 } else { 75 length=length-(length>>1); 76 pos=skipDelta(pos); 77 } 78 } 79 // Drop down to linear search for the last few units. 80 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 81 // and divides length by 2. 82 do { 83 if(uchar==*pos++) { 84 UStringTrieResult result; 85 int32_t node=*pos; 86 if(node&kValueIsFinal) { 87 // Leave the final value for getValue() to read. 88 result=USTRINGTRIE_FINAL_VALUE; 89 } else { 90 // Use the non-final value as the jump delta. 91 ++pos; 92 // int32_t delta=readValue(pos, node); 93 int32_t delta; 94 if(node<kMinTwoUnitValueLead) { 95 delta=node; 96 } else if(node<kThreeUnitValueLead) { 97 delta=((node-kMinTwoUnitValueLead)<<16)|*pos++; 98 } else { 99 delta=(pos[0]<<16)|pos[1]; 100 pos+=2; 101 } 102 // end readValue() 103 pos+=delta; 104 node=*pos; 105 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 106 } 107 pos_=pos; 108 return result; 109 } 110 --length; 111 pos=skipValue(pos); 112 } while(length>1); 113 if(uchar==*pos++) { 114 pos_=pos; 115 int32_t node=*pos; 116 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 117 } else { 118 stop(); 119 return USTRINGTRIE_NO_MATCH; 120 } 121} 122 123UStringTrieResult 124UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) { 125 int32_t node=*pos++; 126 for(;;) { 127 if(node<kMinLinearMatch) { 128 return branchNext(pos, node, uchar); 129 } else if(node<kMinValueLead) { 130 // Match the first of length+1 units. 131 int32_t length=node-kMinLinearMatch; // Actual match length minus 1. 132 if(uchar==*pos++) { 133 remainingMatchLength_=--length; 134 pos_=pos; 135 return (length<0 && (node=*pos)>=kMinValueLead) ? 136 valueResult(node) : USTRINGTRIE_NO_VALUE; 137 } else { 138 // No match. 139 break; 140 } 141 } else if(node&kValueIsFinal) { 142 // No further matching units. 143 break; 144 } else { 145 // Skip intermediate value. 146 pos=skipNodeValue(pos, node); 147 node&=kNodeTypeMask; 148 } 149 } 150 stop(); 151 return USTRINGTRIE_NO_MATCH; 152} 153 154UStringTrieResult 155UCharsTrie::next(int32_t uchar) { 156 const UChar *pos=pos_; 157 if(pos==NULL) { 158 return USTRINGTRIE_NO_MATCH; 159 } 160 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 161 if(length>=0) { 162 // Remaining part of a linear-match node. 163 if(uchar==*pos++) { 164 remainingMatchLength_=--length; 165 pos_=pos; 166 int32_t node; 167 return (length<0 && (node=*pos)>=kMinValueLead) ? 168 valueResult(node) : USTRINGTRIE_NO_VALUE; 169 } else { 170 stop(); 171 return USTRINGTRIE_NO_MATCH; 172 } 173 } 174 return nextImpl(pos, uchar); 175} 176 177UStringTrieResult 178UCharsTrie::next(const UChar *s, int32_t sLength) { 179 if(sLength<0 ? *s==0 : sLength==0) { 180 // Empty input. 181 return current(); 182 } 183 const UChar *pos=pos_; 184 if(pos==NULL) { 185 return USTRINGTRIE_NO_MATCH; 186 } 187 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 188 for(;;) { 189 // Fetch the next input unit, if there is one. 190 // Continue a linear-match node without rechecking sLength<0. 191 int32_t uchar; 192 if(sLength<0) { 193 for(;;) { 194 if((uchar=*s++)==0) { 195 remainingMatchLength_=length; 196 pos_=pos; 197 int32_t node; 198 return (length<0 && (node=*pos)>=kMinValueLead) ? 199 valueResult(node) : USTRINGTRIE_NO_VALUE; 200 } 201 if(length<0) { 202 remainingMatchLength_=length; 203 break; 204 } 205 if(uchar!=*pos) { 206 stop(); 207 return USTRINGTRIE_NO_MATCH; 208 } 209 ++pos; 210 --length; 211 } 212 } else { 213 for(;;) { 214 if(sLength==0) { 215 remainingMatchLength_=length; 216 pos_=pos; 217 int32_t node; 218 return (length<0 && (node=*pos)>=kMinValueLead) ? 219 valueResult(node) : USTRINGTRIE_NO_VALUE; 220 } 221 uchar=*s++; 222 --sLength; 223 if(length<0) { 224 remainingMatchLength_=length; 225 break; 226 } 227 if(uchar!=*pos) { 228 stop(); 229 return USTRINGTRIE_NO_MATCH; 230 } 231 ++pos; 232 --length; 233 } 234 } 235 int32_t node=*pos++; 236 for(;;) { 237 if(node<kMinLinearMatch) { 238 UStringTrieResult result=branchNext(pos, node, uchar); 239 if(result==USTRINGTRIE_NO_MATCH) { 240 return USTRINGTRIE_NO_MATCH; 241 } 242 // Fetch the next input unit, if there is one. 243 if(sLength<0) { 244 if((uchar=*s++)==0) { 245 return result; 246 } 247 } else { 248 if(sLength==0) { 249 return result; 250 } 251 uchar=*s++; 252 --sLength; 253 } 254 if(result==USTRINGTRIE_FINAL_VALUE) { 255 // No further matching units. 256 stop(); 257 return USTRINGTRIE_NO_MATCH; 258 } 259 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 260 node=*pos++; 261 } else if(node<kMinValueLead) { 262 // Match length+1 units. 263 length=node-kMinLinearMatch; // Actual match length minus 1. 264 if(uchar!=*pos) { 265 stop(); 266 return USTRINGTRIE_NO_MATCH; 267 } 268 ++pos; 269 --length; 270 break; 271 } else if(node&kValueIsFinal) { 272 // No further matching units. 273 stop(); 274 return USTRINGTRIE_NO_MATCH; 275 } else { 276 // Skip intermediate value. 277 pos=skipNodeValue(pos, node); 278 node&=kNodeTypeMask; 279 } 280 } 281 } 282} 283 284const UChar * 285UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length, 286 UBool haveUniqueValue, int32_t &uniqueValue) { 287 while(length>kMaxBranchLinearSubNodeLength) { 288 ++pos; // ignore the comparison unit 289 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { 290 return NULL; 291 } 292 length=length-(length>>1); 293 pos=skipDelta(pos); 294 } 295 do { 296 ++pos; // ignore a comparison unit 297 // handle its value 298 int32_t node=*pos++; 299 UBool isFinal=(UBool)(node>>15); 300 node&=0x7fff; 301 int32_t value=readValue(pos, node); 302 pos=skipValue(pos, node); 303 if(isFinal) { 304 if(haveUniqueValue) { 305 if(value!=uniqueValue) { 306 return NULL; 307 } 308 } else { 309 uniqueValue=value; 310 haveUniqueValue=TRUE; 311 } 312 } else { 313 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { 314 return NULL; 315 } 316 haveUniqueValue=TRUE; 317 } 318 } while(--length>1); 319 return pos+1; // ignore the last comparison unit 320} 321 322UBool 323UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) { 324 int32_t node=*pos++; 325 for(;;) { 326 if(node<kMinLinearMatch) { 327 if(node==0) { 328 node=*pos++; 329 } 330 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); 331 if(pos==NULL) { 332 return FALSE; 333 } 334 haveUniqueValue=TRUE; 335 node=*pos++; 336 } else if(node<kMinValueLead) { 337 // linear-match node 338 pos+=node-kMinLinearMatch+1; // Ignore the match units. 339 node=*pos++; 340 } else { 341 UBool isFinal=(UBool)(node>>15); 342 int32_t value; 343 if(isFinal) { 344 value=readValue(pos, node&0x7fff); 345 } else { 346 value=readNodeValue(pos, node); 347 } 348 if(haveUniqueValue) { 349 if(value!=uniqueValue) { 350 return FALSE; 351 } 352 } else { 353 uniqueValue=value; 354 haveUniqueValue=TRUE; 355 } 356 if(isFinal) { 357 return TRUE; 358 } 359 pos=skipNodeValue(pos, node); 360 node&=kNodeTypeMask; 361 } 362 } 363} 364 365int32_t 366UCharsTrie::getNextUChars(Appendable &out) const { 367 const UChar *pos=pos_; 368 if(pos==NULL) { 369 return 0; 370 } 371 if(remainingMatchLength_>=0) { 372 out.appendCodeUnit(*pos); // Next unit of a pending linear-match node. 373 return 1; 374 } 375 int32_t node=*pos++; 376 if(node>=kMinValueLead) { 377 if(node&kValueIsFinal) { 378 return 0; 379 } else { 380 pos=skipNodeValue(pos, node); 381 node&=kNodeTypeMask; 382 } 383 } 384 if(node<kMinLinearMatch) { 385 if(node==0) { 386 node=*pos++; 387 } 388 out.reserveAppendCapacity(++node); 389 getNextBranchUChars(pos, node, out); 390 return node; 391 } else { 392 // First unit of the linear-match node. 393 out.appendCodeUnit(*pos); 394 return 1; 395 } 396} 397 398void 399UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) { 400 while(length>kMaxBranchLinearSubNodeLength) { 401 ++pos; // ignore the comparison unit 402 getNextBranchUChars(jumpByDelta(pos), length>>1, out); 403 length=length-(length>>1); 404 pos=skipDelta(pos); 405 } 406 do { 407 out.appendCodeUnit(*pos++); 408 pos=skipValue(pos); 409 } while(--length>1); 410 out.appendCodeUnit(*pos); 411} 412 413U_NAMESPACE_END 414