1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* 6* Copyright (C) 2009-2016, International Business Machines 7* Corporation and others. All Rights Reserved. 8* 9******************************************************************************* 10* file name: n2builder.cpp 11* encoding: US-ASCII 12* tab size: 8 (not used) 13* indentation:4 14* 15* created on: 2009nov25 16* created by: Markus W. Scherer 17* 18* Builds Normalizer2 data and writes a binary .nrm file. 19* For the file format see source/common/normalizer2impl.h. 20*/ 21 22#include "unicode/utypes.h" 23#include "n2builder.h" 24 25#include <stdio.h> 26#include <stdlib.h> 27#include <string.h> 28#if U_HAVE_STD_STRING 29#include <vector> 30#endif 31#include "unicode/errorcode.h" 32#include "unicode/localpointer.h" 33#include "unicode/putil.h" 34#include "unicode/udata.h" 35#include "unicode/uniset.h" 36#include "unicode/unistr.h" 37#include "unicode/ustring.h" 38#include "charstr.h" 39#include "hash.h" 40#include "normalizer2impl.h" 41#include "toolutil.h" 42#include "unewdata.h" 43#include "utrie2.h" 44#include "uvectr32.h" 45#include "writesrc.h" 46 47#if !UCONFIG_NO_NORMALIZATION 48 49/* UDataInfo cf. udata.h */ 50static UDataInfo dataInfo={ 51 sizeof(UDataInfo), 52 0, 53 54 U_IS_BIG_ENDIAN, 55 U_CHARSET_FAMILY, 56 U_SIZEOF_UCHAR, 57 0, 58 59 { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ 60 { 2, 0, 0, 0 }, /* formatVersion */ 61 { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ 62}; 63 64U_NAMESPACE_BEGIN 65 66class HangulIterator { 67public: 68 struct Range { 69 UChar32 start, limit; 70 uint16_t norm16; 71 }; 72 73 HangulIterator() : rangeIndex(0) {} 74 const Range *nextRange() { 75 if(rangeIndex<UPRV_LENGTHOF(ranges)) { 76 return ranges+rangeIndex++; 77 } else { 78 return NULL; 79 } 80 } 81 void reset() { rangeIndex=0; } 82private: 83 static const Range ranges[4]; 84 int32_t rangeIndex; 85}; 86 87const HangulIterator::Range HangulIterator::ranges[4]={ 88 { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 }, 89 { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT }, 90 // JAMO_T_BASE+1: not U+11A7 91 { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT }, 92 { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 }, // will become minYesNo 93}; 94 95struct CompositionPair { 96 CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} 97 UChar32 trail, composite; 98}; 99 100struct Norm { 101 enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; 102 103 UBool hasMapping() const { return mappingType>REMOVED; } 104 105 // Requires hasMapping() and well-formed mapping. 106 void setMappingCP() { 107 UChar32 c; 108 if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { 109 mappingCP=c; 110 } else { 111 mappingCP=U_SENTINEL; 112 } 113 } 114 115 const CompositionPair *getCompositionPairs(int32_t &length) const { 116 if(compositions==NULL) { 117 length=0; 118 return NULL; 119 } else { 120 length=compositions->size()/2; 121 return reinterpret_cast<const CompositionPair *>(compositions->getBuffer()); 122 } 123 } 124 125 UnicodeString *mapping; 126 UnicodeString *rawMapping; // non-NULL if the mapping is further decomposed 127 UChar32 mappingCP; // >=0 if mapping to 1 code point 128 int32_t mappingPhase; 129 MappingType mappingType; 130 131 UVector32 *compositions; // (trail, composite) pairs 132 uint8_t cc; 133 UBool combinesBack; 134 UBool hasNoCompBoundaryAfter; 135 136 enum OffsetType { 137 OFFSET_NONE, 138 // Composition for back-combining character. Allowed, but not normally used. 139 OFFSET_MAYBE_YES, 140 // Composition for a starter that does not have a decomposition mapping. 141 OFFSET_YES_YES, 142 // Round-trip mapping & composition for a starter. 143 OFFSET_YES_NO_MAPPING_AND_COMPOSITION, 144 // Round-trip mapping for a starter that itself does not combine-forward. 145 OFFSET_YES_NO_MAPPING_ONLY, 146 // One-way mapping. 147 OFFSET_NO_NO, 148 // Delta for an algorithmic one-way mapping. 149 OFFSET_DELTA 150 }; 151 enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 }; 152 int32_t offset; 153}; 154 155class Normalizer2DBEnumerator { 156public: 157 Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {} 158 virtual ~Normalizer2DBEnumerator() {} 159 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0; 160 Normalizer2DBEnumerator *ptr() { return this; } 161protected: 162 Normalizer2DataBuilder &builder; 163}; 164 165U_CDECL_BEGIN 166 167static UBool U_CALLCONV 168enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 169 return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value); 170} 171 172U_CDECL_END 173 174Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : 175 phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL), 176 norm16TrieLength(0) { 177 memset(unicodeVersion, 0, sizeof(unicodeVersion)); 178 normTrie=utrie2_open(0, 0, &errorCode); 179 normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); 180 norms=allocNorm(); // unused Norm struct at index 0 181 memset(indexes, 0, sizeof(indexes)); 182 memset(smallFCD, 0, sizeof(smallFCD)); 183} 184 185Normalizer2DataBuilder::~Normalizer2DataBuilder() { 186 utrie2_close(normTrie); 187 int32_t normsLength=utm_countItems(normMem); 188 for(int32_t i=1; i<normsLength; ++i) { 189 delete norms[i].mapping; 190 delete norms[i].rawMapping; 191 delete norms[i].compositions; 192 } 193 utm_close(normMem); 194 utrie2_close(norm16Trie); 195} 196 197void 198Normalizer2DataBuilder::setUnicodeVersion(const char *v) { 199 UVersionInfo nullVersion={ 0, 0, 0, 0 }; 200 UVersionInfo version; 201 u_versionFromString(version, v); 202 if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) && 203 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH) 204 ) { 205 char buffer[U_MAX_VERSION_STRING_LENGTH]; 206 u_versionToString(unicodeVersion, buffer); 207 fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n", 208 buffer, v); 209 exit(U_ILLEGAL_ARGUMENT_ERROR); 210 } 211 memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH); 212} 213 214Norm *Normalizer2DataBuilder::allocNorm() { 215 Norm *p=(Norm *)utm_alloc(normMem); 216 norms=(Norm *)utm_getStart(normMem); // in case it got reallocated 217 return p; 218} 219 220/* get an existing Norm unit */ 221Norm *Normalizer2DataBuilder::getNorm(UChar32 c) { 222 uint32_t i=utrie2_get32(normTrie, c); 223 if(i==0) { 224 return NULL; 225 } 226 return norms+i; 227} 228 229const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const { 230 return norms[utrie2_get32(normTrie, c)]; 231} 232 233/* 234 * get or create a Norm unit; 235 * get or create the intermediate trie entries for it as well 236 */ 237Norm *Normalizer2DataBuilder::createNorm(UChar32 c) { 238 uint32_t i=utrie2_get32(normTrie, c); 239 if(i!=0) { 240 return norms+i; 241 } else { 242 /* allocate Norm */ 243 Norm *p=allocNorm(); 244 IcuToolErrorCode errorCode("gennorm2/createNorm()"); 245 utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode); 246 return p; 247 } 248} 249 250Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { 251 if(p!=NULL) { 252 if(p->mappingType!=Norm::NONE) { 253 if( overrideHandling==OVERRIDE_NONE || 254 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) 255 ) { 256 fprintf(stderr, 257 "error in gennorm2 phase %d: " 258 "not permitted to override mapping for U+%04lX from phase %d\n", 259 (int)phase, (long)c, (int)p->mappingPhase); 260 exit(U_INVALID_FORMAT_ERROR); 261 } 262 delete p->mapping; 263 p->mapping=NULL; 264 } 265 p->mappingPhase=phase; 266 } 267 return p; 268} 269 270void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { 271 overrideHandling=oh; 272 ++phase; 273} 274 275void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { 276 createNorm(c)->cc=cc; 277} 278 279uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const { 280 return getNormRef(c).cc; 281} 282 283static UBool isWellFormed(const UnicodeString &s) { 284 UErrorCode errorCode=U_ZERO_ERROR; 285 u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode); 286 return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; 287} 288 289void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { 290 if(!isWellFormed(m)) { 291 fprintf(stderr, 292 "error in gennorm2 phase %d: " 293 "illegal one-way mapping from U+%04lX to malformed string\n", 294 (int)phase, (long)c); 295 exit(U_INVALID_FORMAT_ERROR); 296 } 297 Norm *p=checkNormForMapping(createNorm(c), c); 298 p->mapping=new UnicodeString(m); 299 p->mappingType=Norm::ONE_WAY; 300 p->setMappingCP(); 301} 302 303void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { 304 if(U_IS_SURROGATE(c)) { 305 fprintf(stderr, 306 "error in gennorm2 phase %d: " 307 "illegal round-trip mapping from surrogate code point U+%04lX\n", 308 (int)phase, (long)c); 309 exit(U_INVALID_FORMAT_ERROR); 310 } 311 if(!isWellFormed(m)) { 312 fprintf(stderr, 313 "error in gennorm2 phase %d: " 314 "illegal round-trip mapping from U+%04lX to malformed string\n", 315 (int)phase, (long)c); 316 exit(U_INVALID_FORMAT_ERROR); 317 } 318 int32_t numCP=u_countChar32(m.getBuffer(), m.length()); 319 if(numCP!=2) { 320 fprintf(stderr, 321 "error in gennorm2 phase %d: " 322 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", 323 (int)phase, (long)c, (int)numCP); 324 exit(U_INVALID_FORMAT_ERROR); 325 } 326 Norm *p=checkNormForMapping(createNorm(c), c); 327 p->mapping=new UnicodeString(m); 328 p->mappingType=Norm::ROUND_TRIP; 329 p->mappingCP=U_SENTINEL; 330} 331 332void Normalizer2DataBuilder::removeMapping(UChar32 c) { 333 Norm *p=checkNormForMapping(getNorm(c), c); 334 if(p!=NULL) { 335 p->mappingType=Norm::REMOVED; 336 } 337} 338 339class CompositionBuilder : public Normalizer2DBEnumerator { 340public: 341 CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 342 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 343 builder.addComposition(start, end, value); 344 return TRUE; 345 } 346}; 347 348void 349Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) { 350 if(norms[value].mappingType==Norm::ROUND_TRIP) { 351 if(start!=end) { 352 fprintf(stderr, 353 "gennorm2 error: same round-trip mapping for " 354 "more than 1 code point U+%04lX..U+%04lX\n", 355 (long)start, (long)end); 356 exit(U_INVALID_FORMAT_ERROR); 357 } 358 if(norms[value].cc!=0) { 359 fprintf(stderr, 360 "gennorm2 error: " 361 "U+%04lX has a round-trip mapping and ccc!=0, " 362 "not possible in Unicode normalization\n", 363 (long)start); 364 exit(U_INVALID_FORMAT_ERROR); 365 } 366 // setRoundTripMapping() ensured that there are exactly two code points. 367 const UnicodeString &m=*norms[value].mapping; 368 UChar32 lead=m.char32At(0); 369 UChar32 trail=m.char32At(m.length()-1); 370 if(getCC(lead)!=0) { 371 fprintf(stderr, 372 "gennorm2 error: " 373 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " 374 "not possible in Unicode normalization\n", 375 (long)start, (long)lead); 376 exit(U_INVALID_FORMAT_ERROR); 377 } 378 // Flag for trailing character. 379 createNorm(trail)->combinesBack=TRUE; 380 // Insert (trail, composite) pair into compositions list for the lead character. 381 IcuToolErrorCode errorCode("gennorm2/addComposition()"); 382 Norm *leadNorm=createNorm(lead); 383 UVector32 *compositions=leadNorm->compositions; 384 int32_t i; 385 if(compositions==NULL) { 386 compositions=leadNorm->compositions=new UVector32(errorCode); 387 i=0; // "insert" the first pair at index 0 388 } else { 389 // Insertion sort, and check for duplicate trail characters. 390 int32_t length; 391 const CompositionPair *pairs=leadNorm->getCompositionPairs(length); 392 for(i=0; i<length; ++i) { 393 if(trail==pairs[i].trail) { 394 fprintf(stderr, 395 "gennorm2 error: same round-trip mapping for " 396 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", 397 (long)start, (long)lead, (long)trail); 398 exit(U_INVALID_FORMAT_ERROR); 399 } 400 if(trail<pairs[i].trail) { 401 break; 402 } 403 } 404 } 405 compositions->insertElementAt(trail, 2*i, errorCode); 406 compositions->insertElementAt(start, 2*i+1, errorCode); 407 } 408} 409 410UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm, 411 uint8_t lowCC, uint8_t highCC) const { 412 if((highCC-lowCC)>=2) { 413 int32_t length; 414 const CompositionPair *pairs=norm.getCompositionPairs(length); 415 for(int32_t i=0; i<length; ++i) { 416 uint8_t trailCC=getCC(pairs[i].trail); 417 if(lowCC<trailCC && trailCC<highCC) { 418 return TRUE; 419 } 420 } 421 } 422 return FALSE; 423} 424 425UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const { 426 int32_t length; 427 const CompositionPair *pairs=norm.getCompositionPairs(length); 428 for(int32_t i=0; i<length; ++i) { 429 if(trail==pairs[i].trail) { 430 return pairs[i].composite; 431 } 432 if(trail<pairs[i].trail) { 433 break; 434 } 435 } 436 return U_SENTINEL; 437} 438 439class Decomposer : public Normalizer2DBEnumerator { 440public: 441 Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {} 442 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 443 didDecompose|=builder.decompose(start, end, value); 444 return TRUE; 445 } 446 UBool didDecompose; 447}; 448 449UBool 450Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) { 451 if(norms[value].hasMapping()) { 452 Norm &norm=norms[value]; 453 const UnicodeString &m=*norm.mapping; 454 UnicodeString *decomposed=NULL; 455 const UChar *s=m.getBuffer(); 456 int32_t length=m.length(); 457 int32_t prev, i=0; 458 UChar32 c; 459 while(i<length) { 460 prev=i; 461 U16_NEXT(s, i, length, c); 462 if(start<=c && c<=end) { 463 fprintf(stderr, 464 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", 465 (long)c); 466 exit(U_INVALID_FORMAT_ERROR); 467 } 468 const Norm &cNorm=getNormRef(c); 469 if(cNorm.hasMapping()) { 470 if(norm.mappingType==Norm::ROUND_TRIP) { 471 if(prev==0) { 472 if(cNorm.mappingType!=Norm::ROUND_TRIP) { 473 fprintf(stderr, 474 "gennorm2 error: " 475 "U+%04lX's round-trip mapping's starter " 476 "U+%04lX one-way-decomposes, " 477 "not possible in Unicode normalization\n", 478 (long)start, (long)c); 479 exit(U_INVALID_FORMAT_ERROR); 480 } 481 uint8_t myTrailCC=getCC(m.char32At(i)); 482 UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1); 483 uint8_t cTrailCC=getCC(cTrailChar); 484 if(cTrailCC>myTrailCC) { 485 fprintf(stderr, 486 "gennorm2 error: " 487 "U+%04lX's round-trip mapping's starter " 488 "U+%04lX decomposes and the " 489 "inner/earlier tccc=%hu > outer/following tccc=%hu, " 490 "not possible in Unicode normalization\n", 491 (long)start, (long)c, 492 (short)cTrailCC, (short)myTrailCC); 493 exit(U_INVALID_FORMAT_ERROR); 494 } 495 } else { 496 fprintf(stderr, 497 "gennorm2 error: " 498 "U+%04lX's round-trip mapping's non-starter " 499 "U+%04lX decomposes, " 500 "not possible in Unicode normalization\n", 501 (long)start, (long)c); 502 exit(U_INVALID_FORMAT_ERROR); 503 } 504 } 505 if(decomposed==NULL) { 506 decomposed=new UnicodeString(m, 0, prev); 507 } 508 decomposed->append(*cNorm.mapping); 509 } else if(Hangul::isHangul(c)) { 510 UChar buffer[3]; 511 int32_t hangulLength=Hangul::decompose(c, buffer); 512 if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) { 513 fprintf(stderr, 514 "gennorm2 error: " 515 "U+%04lX's round-trip mapping's non-starter " 516 "U+%04lX decomposes, " 517 "not possible in Unicode normalization\n", 518 (long)start, (long)c); 519 exit(U_INVALID_FORMAT_ERROR); 520 } 521 if(decomposed==NULL) { 522 decomposed=new UnicodeString(m, 0, prev); 523 } 524 decomposed->append(buffer, hangulLength); 525 } else if(decomposed!=NULL) { 526 decomposed->append(m, prev, i-prev); 527 } 528 } 529 if(decomposed!=NULL) { 530 if(norm.rawMapping==NULL) { 531 // Remember the original mapping when decomposing recursively. 532 norm.rawMapping=norm.mapping; 533 } else { 534 delete norm.mapping; 535 } 536 norm.mapping=decomposed; 537 // Not norm.setMappingCP(); because the original mapping 538 // is most likely to be encodable as a delta. 539 return TRUE; 540 } 541 } 542 return FALSE; 543} 544 545class BuilderReorderingBuffer { 546public: 547 BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {} 548 void reset() { 549 fLength=0; 550 fLastStarterIndex=-1; 551 fDidReorder=FALSE; 552 } 553 int32_t length() const { return fLength; } 554 UBool isEmpty() const { return fLength==0; } 555 int32_t lastStarterIndex() const { return fLastStarterIndex; } 556 UChar32 charAt(int32_t i) const { return fArray[i]>>8; } 557 uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } 558 UBool didReorder() const { return fDidReorder; } 559 void append(UChar32 c, uint8_t cc) { 560 if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { 561 if(cc==0) { 562 fLastStarterIndex=fLength; 563 } 564 fArray[fLength++]=(c<<8)|cc; 565 return; 566 } 567 // Let this character bubble back to its canonical order. 568 int32_t i=fLength-1; 569 while(i>fLastStarterIndex && ccAt(i)>cc) { 570 --i; 571 } 572 ++i; // after the last starter or prevCC<=cc 573 // Move this and the following characters forward one to make space. 574 for(int32_t j=fLength; i<j; --j) { 575 fArray[j]=fArray[j-1]; 576 } 577 fArray[i]=(c<<8)|cc; 578 ++fLength; 579 fDidReorder=TRUE; 580 } 581 void toString(UnicodeString &dest) { 582 dest.remove(); 583 for(int32_t i=0; i<fLength; ++i) { 584 dest.append(charAt(i)); 585 } 586 } 587 void setComposite(UChar32 composite, int32_t combMarkIndex) { 588 fArray[fLastStarterIndex]=composite<<8; 589 // Remove the combining mark that contributed to the composite. 590 --fLength; 591 while(combMarkIndex<fLength) { 592 fArray[combMarkIndex]=fArray[combMarkIndex+1]; 593 ++combMarkIndex; 594 } 595 } 596private: 597 int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; 598 int32_t fLength; 599 int32_t fLastStarterIndex; 600 UBool fDidReorder; 601}; 602 603void 604Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) { 605 UnicodeString &m=*p->mapping; 606 int32_t length=m.length(); 607 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 608 return; // writeMapping() will complain about it and print the code point. 609 } 610 const UChar *s=m.getBuffer(); 611 int32_t i=0; 612 UChar32 c; 613 while(i<length) { 614 U16_NEXT(s, i, length, c); 615 buffer.append(c, getCC(c)); 616 } 617 if(buffer.didReorder()) { 618 buffer.toString(m); 619 } 620} 621 622/* 623 * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter(). 624 * A starter character with a mapping does not have a composition boundary after it 625 * if the character itself combines-forward (which is tested by the caller of this function), 626 * or it is deleted (mapped to the empty string), 627 * or its mapping contains no starter, 628 * or the last starter combines-forward. 629 */ 630UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) { 631 if(buffer.isEmpty()) { 632 return TRUE; // maps-to-empty-string is no boundary of any kind 633 } 634 int32_t lastStarterIndex=buffer.lastStarterIndex(); 635 if(lastStarterIndex<0) { 636 return TRUE; // no starter 637 } 638 UChar32 starter=buffer.charAt(lastStarterIndex); 639 if( Hangul::isJamoL(starter) || 640 (Hangul::isJamoV(starter) && 641 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1))) 642 ) { 643 // A Jamo leading consonant or an LV pair combines-forward if it is at the end, 644 // otherwise it is blocked. 645 return lastStarterIndex==buffer.length()-1; 646 } 647 // Note: There can be no Hangul syllable in the fully decomposed mapping. 648 const Norm *starterNorm=&getNormRef(starter); 649 if(starterNorm->compositions==NULL) { 650 return FALSE; // the last starter does not combine forward 651 } 652 // Compose as far as possible, and see if further compositions are possible. 653 uint8_t prevCC=0; 654 for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) { 655 uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter 656 if(combinesWithCCBetween(*starterNorm, prevCC, cc)) { 657 return TRUE; 658 } 659 if( prevCC<cc && 660 (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0 661 ) { 662 buffer.setComposite(starter, combMarkIndex); 663 starterNorm=&getNormRef(starter); 664 if(starterNorm->compositions==NULL) { 665 return FALSE; // the composite does not combine further 666 } 667 } else { 668 prevCC=cc; 669 ++combMarkIndex; 670 } 671 } 672 // TRUE if the final, forward-combining starter is at the end. 673 return prevCC==0; 674} 675 676// Requires p->hasMapping(). 677// Returns the offset of the "first unit" from the beginning of the extraData for c. 678// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word. 679int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) { 680 UnicodeString &m=*p->mapping; 681 int32_t length=m.length(); 682 if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 683 fprintf(stderr, 684 "gennorm2 error: " 685 "mapping for U+%04lX longer than maximum of %d\n", 686 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); 687 exit(U_INVALID_FORMAT_ERROR); 688 } 689 int32_t leadCC, trailCC; 690 if(length==0) { 691 leadCC=trailCC=0; 692 } else { 693 leadCC=getCC(m.char32At(0)); 694 trailCC=getCC(m.char32At(length-1)); 695 } 696 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) { 697 fprintf(stderr, 698 "gennorm2 error: " 699 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n", 700 (long)c); 701 exit(U_INVALID_FORMAT_ERROR); 702 } 703 // Write small-FCD data. 704 if((leadCC|trailCC)!=0) { 705 UChar32 lead= c<=0xffff ? c : U16_LEAD(c); 706 smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); 707 } 708 // Write the mapping & raw mapping extraData. 709 int32_t firstUnit=length|(trailCC<<8); 710 int32_t preMappingLength=0; 711 if(p->rawMapping!=NULL) { 712 UnicodeString &rm=*p->rawMapping; 713 int32_t rmLength=rm.length(); 714 if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) { 715 fprintf(stderr, 716 "gennorm2 error: " 717 "raw mapping for U+%04lX longer than maximum of %d\n", 718 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); 719 exit(U_INVALID_FORMAT_ERROR); 720 } 721 UChar rm0=rm.charAt(0); 722 if( rmLength==length-1 && 723 // 99: overlong substring lengths get pinned to remainder lengths anyway 724 0==rm.compare(1, 99, m, 2, 99) && 725 rm0>Normalizer2Impl::MAPPING_LENGTH_MASK 726 ) { 727 // Compression: 728 // rawMapping=rm0+mapping.substring(2) -> store only rm0 729 // 730 // The raw mapping is the same as the final mapping after replacing 731 // the final mapping's first two code units with the raw mapping's first one. 732 // In this case, we store only that first unit, rm0. 733 // This helps with a few hundred mappings. 734 dataString.append(rm0); 735 preMappingLength=1; 736 } else { 737 // Store the raw mapping with its length. 738 dataString.append(rm); 739 dataString.append((UChar)rmLength); 740 preMappingLength=rmLength+1; 741 } 742 firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING; 743 } 744 int32_t cccLccc=p->cc|(leadCC<<8); 745 if(cccLccc!=0) { 746 dataString.append((UChar)cccLccc); 747 ++preMappingLength; 748 firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; 749 } 750 if(p->hasNoCompBoundaryAfter) { 751 firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER; 752 } 753 dataString.append((UChar)firstUnit); 754 dataString.append(m); 755 return preMappingLength; 756} 757 758// Requires p->compositions!=NULL. 759void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) { 760 if(p->cc!=0) { 761 fprintf(stderr, 762 "gennorm2 error: " 763 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", 764 (long)c); 765 exit(U_INVALID_FORMAT_ERROR); 766 } 767 int32_t length; 768 const CompositionPair *pairs=p->getCompositionPairs(length); 769 for(int32_t i=0; i<length; ++i) { 770 const CompositionPair &pair=pairs[i]; 771 // 22 bits for the composite character and whether it combines forward. 772 UChar32 compositeAndFwd=pair.composite<<1; 773 if(getNormRef(pair.composite).compositions!=NULL) { 774 compositeAndFwd|=1; // The composite character also combines-forward. 775 } 776 // Encode most pairs in two units and some in three. 777 int32_t firstUnit, secondUnit, thirdUnit; 778 if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { 779 if(compositeAndFwd<=0xffff) { 780 firstUnit=pair.trail<<1; 781 secondUnit=compositeAndFwd; 782 thirdUnit=-1; 783 } else { 784 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; 785 secondUnit=compositeAndFwd>>16; 786 thirdUnit=compositeAndFwd; 787 } 788 } else { 789 firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ 790 (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| 791 Normalizer2Impl::COMP_1_TRIPLE; 792 secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| 793 (compositeAndFwd>>16); 794 thirdUnit=compositeAndFwd; 795 } 796 // Set the high bit of the first unit if this is the last composition pair. 797 if(i==(length-1)) { 798 firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; 799 } 800 dataString.append((UChar)firstUnit).append((UChar)secondUnit); 801 if(thirdUnit>=0) { 802 dataString.append((UChar)thirdUnit); 803 } 804 } 805} 806 807class ExtraDataWriter : public Normalizer2DBEnumerator { 808public: 809 ExtraDataWriter(Normalizer2DataBuilder &b) : 810 Normalizer2DBEnumerator(b), 811 yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions 812 yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data 813 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 814 if(value!=0) { 815 if(start!=end) { 816 fprintf(stderr, 817 "gennorm2 error: unexpected shared data for " 818 "multiple code points U+%04lX..U+%04lX\n", 819 (long)start, (long)end); 820 exit(U_INTERNAL_PROGRAM_ERROR); 821 } 822 builder.writeExtraData(start, value, *this); 823 } 824 return TRUE; 825 } 826 UnicodeString maybeYesCompositions; 827 UnicodeString yesYesCompositions; 828 UnicodeString yesNoMappingsAndCompositions; 829 UnicodeString yesNoMappingsOnly; 830 UnicodeString noNoMappings; 831 Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode. 832}; 833 834void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) { 835 Norm *p=norms+value; 836 if(!p->hasMapping()) { 837 // Write small-FCD data. 838 // There is similar code in writeMapping() for characters that do have a mapping. 839 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) { 840 fprintf(stderr, 841 "gennorm2 error: " 842 "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n", 843 (long)c); 844 exit(U_INVALID_FORMAT_ERROR); 845 } 846 if(p->cc!=0) { 847 UChar32 lead= c<=0xffff ? c : U16_LEAD(c); 848 smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); 849 } 850 } 851 if(p->combinesBack) { 852 if(p->hasMapping()) { 853 fprintf(stderr, 854 "gennorm2 error: " 855 "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n", 856 (long)c); 857 exit(U_INVALID_FORMAT_ERROR); 858 } 859 if(p->compositions!=NULL) { 860 p->offset= 861 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)| 862 Norm::OFFSET_MAYBE_YES; 863 writeCompositions(c, p, writer.maybeYesCompositions); 864 } 865 } else if(!p->hasMapping()) { 866 if(p->compositions!=NULL) { 867 p->offset= 868 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)| 869 Norm::OFFSET_YES_YES; 870 writeCompositions(c, p, writer.yesYesCompositions); 871 } 872 } else if(p->mappingType==Norm::ROUND_TRIP) { 873 if(p->compositions!=NULL) { 874 int32_t offset=writer.yesNoMappingsAndCompositions.length()+ 875 writeMapping(c, p, writer.yesNoMappingsAndCompositions); 876 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION; 877 writeCompositions(c, p, writer.yesNoMappingsAndCompositions); 878 } else { 879 int32_t offset=writer.yesNoMappingsOnly.length()+ 880 writeMapping(c, p, writer.yesNoMappingsOnly); 881 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY; 882 } 883 } else /* one-way */ { 884 if(p->compositions!=NULL) { 885 fprintf(stderr, 886 "gennorm2 error: " 887 "U+%04lX combines-forward and has a one-way mapping, " 888 "not possible in Unicode normalization\n", 889 (long)c); 890 exit(U_INVALID_FORMAT_ERROR); 891 } 892 if(p->cc==0 && optimization!=OPTIMIZE_FAST) { 893 // Try a compact, algorithmic encoding. 894 // Only for ccc=0, because we can't store additional information 895 // and we do not recursively follow an algorithmic encoding for access to the ccc. 896 // 897 // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding 898 // if the mappingCP decomposes further, to ensure that there is a place to store it. 899 // We want to see that the final mapping does not have exactly 1 code point, 900 // or else we would have to recursively ensure that the final mapping is stored 901 // in normal extraData. 902 if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) { 903 int32_t delta=p->mappingCP-c; 904 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { 905 p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA; 906 } 907 } 908 } 909 if(p->offset==0) { 910 int32_t oldNoNoLength=writer.noNoMappings.length(); 911 int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings); 912 UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength); 913 int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping); 914 if(previousOffset!=0) { 915 // Duplicate, remove the new units and point to the old ones. 916 writer.noNoMappings.truncate(oldNoNoLength); 917 p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO; 918 } else { 919 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found". 920 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()"); 921 writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode); 922 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO; 923 } 924 } 925 } 926} 927 928class Norm16Writer : public Normalizer2DBEnumerator { 929public: 930 Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 931 virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 932 builder.writeNorm16(start, end, value); 933 return TRUE; 934 } 935}; 936 937void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) { 938 if(value!=0) { 939 const Norm *p=norms+value; 940 int32_t offset=p->offset>>Norm::OFFSET_SHIFT; 941 int32_t norm16=0; 942 UBool isDecompNo=FALSE; 943 UBool isCompNoMaybe=FALSE; 944 switch(p->offset&Norm::OFFSET_MASK) { 945 case Norm::OFFSET_NONE: 946 // No mapping, no compositions list. 947 if(p->combinesBack) { 948 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc; 949 isDecompNo=(UBool)(p->cc!=0); 950 isCompNoMaybe=TRUE; 951 } else if(p->cc!=0) { 952 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc; 953 isDecompNo=isCompNoMaybe=TRUE; 954 } 955 break; 956 case Norm::OFFSET_MAYBE_YES: 957 norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset; 958 isCompNoMaybe=TRUE; 959 break; 960 case Norm::OFFSET_YES_YES: 961 norm16=offset; 962 break; 963 case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION: 964 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset; 965 isDecompNo=TRUE; 966 break; 967 case Norm::OFFSET_YES_NO_MAPPING_ONLY: 968 norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset; 969 isDecompNo=TRUE; 970 break; 971 case Norm::OFFSET_NO_NO: 972 norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset; 973 isDecompNo=isCompNoMaybe=TRUE; 974 break; 975 case Norm::OFFSET_DELTA: 976 norm16=getCenterNoNoDelta()+offset; 977 isDecompNo=isCompNoMaybe=TRUE; 978 break; 979 default: // Should not occur. 980 exit(U_INTERNAL_PROGRAM_ERROR); 981 } 982 IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); 983 utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode); 984 if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 985 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; 986 } 987 if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { 988 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; 989 } 990 } 991} 992 993void Normalizer2DataBuilder::setHangulData() { 994 HangulIterator hi; 995 const HangulIterator::Range *range; 996 // Check that none of the Hangul/Jamo code points have data. 997 while((range=hi.nextRange())!=NULL) { 998 for(UChar32 c=range->start; c<range->limit; ++c) { 999 if(utrie2_get32(norm16Trie, c)!=0) { 1000 fprintf(stderr, 1001 "gennorm2 error: " 1002 "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", 1003 (long)c); 1004 exit(U_INVALID_FORMAT_ERROR); 1005 } 1006 } 1007 } 1008 // Set data for algorithmic runtime handling. 1009 IcuToolErrorCode errorCode("gennorm2/setHangulData()"); 1010 hi.reset(); 1011 while((range=hi.nextRange())!=NULL) { 1012 uint16_t norm16=range->norm16; 1013 if(norm16==0) { 1014 norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul LV/LVT encoded as minYesNo 1015 if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 1016 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start; 1017 } 1018 } else { 1019 if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { // Jamo V/T are maybeYes 1020 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start; 1021 } 1022 } 1023 utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode); 1024 errorCode.assertSuccess(); 1025 } 1026} 1027 1028U_CDECL_BEGIN 1029 1030static UBool U_CALLCONV 1031enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { 1032 uint32_t *pMaxValue=(uint32_t *)context; 1033 if(value>*pMaxValue) { 1034 *pMaxValue=value; 1035 } 1036 return TRUE; 1037} 1038 1039U_CDECL_END 1040 1041void Normalizer2DataBuilder::processData() { 1042 IcuToolErrorCode errorCode("gennorm2/processData()"); 1043 norm16Trie=utrie2_open(0, 0, errorCode); 1044 errorCode.assertSuccess(); 1045 1046 utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr()); 1047 1048 Decomposer decomposer(*this); 1049 do { 1050 decomposer.didDecompose=FALSE; 1051 utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer); 1052 } while(decomposer.didDecompose); 1053 1054 BuilderReorderingBuffer buffer; 1055 int32_t normsLength=utm_countItems(normMem); 1056 for(int32_t i=1; i<normsLength; ++i) { 1057 // Set the hasNoCompBoundaryAfter flag for use by the last code branch 1058 // in Normalizer2Impl::hasCompBoundaryAfter(). 1059 // For details see the comments on hasNoCompBoundaryAfter(buffer). 1060 const Norm &norm=norms[i]; 1061 if(norm.hasMapping()) { 1062 if(norm.compositions!=NULL) { 1063 norms[i].hasNoCompBoundaryAfter=TRUE; 1064 } else { 1065 buffer.reset(); 1066 reorder(norms+i, buffer); 1067 norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer); 1068 } 1069 } 1070 } 1071 1072 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; 1073 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; 1074 1075 ExtraDataWriter extraDataWriter(*this); 1076 utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter); 1077 1078 extraData=extraDataWriter.maybeYesCompositions; 1079 extraData.append(extraDataWriter.yesYesCompositions). 1080 append(extraDataWriter.yesNoMappingsAndCompositions). 1081 append(extraDataWriter.yesNoMappingsOnly). 1082 append(extraDataWriter.noNoMappings); 1083 // Pad to even length for 4-byte alignment of following data. 1084 if(extraData.length()&1) { 1085 extraData.append((UChar)0); 1086 } 1087 1088 indexes[Normalizer2Impl::IX_MIN_YES_NO]= 1089 extraDataWriter.yesYesCompositions.length(); 1090 indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]= 1091 indexes[Normalizer2Impl::IX_MIN_YES_NO]+ 1092 extraDataWriter.yesNoMappingsAndCompositions.length(); 1093 indexes[Normalizer2Impl::IX_MIN_NO_NO]= 1094 indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+ 1095 extraDataWriter.yesNoMappingsOnly.length(); 1096 indexes[Normalizer2Impl::IX_LIMIT_NO_NO]= 1097 indexes[Normalizer2Impl::IX_MIN_NO_NO]+ 1098 extraDataWriter.noNoMappings.length(); 1099 indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= 1100 Normalizer2Impl::MIN_NORMAL_MAYBE_YES- 1101 extraDataWriter.maybeYesCompositions.length(); 1102 1103 int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA; 1104 if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { 1105 fprintf(stderr, 1106 "gennorm2 error: " 1107 "data structure overflow, too much mapping composition data\n"); 1108 exit(U_BUFFER_OVERFLOW_ERROR); 1109 } 1110 1111 utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr()); 1112 1113 setHangulData(); 1114 1115 // Look for the "worst" norm16 value of any supplementary code point 1116 // corresponding to a lead surrogate, and set it as that surrogate's value. 1117 // Enables quick check inner loops to look at only code units. 1118 // 1119 // We could be more sophisticated: 1120 // We could collect a bit set for whether there are values in the different 1121 // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) 1122 // and select the best value that only breaks the composition and/or decomposition 1123 // inner loops if necessary. 1124 // However, that seems like overkill for an optimization for supplementary characters. 1125 for(UChar lead=0xd800; lead<0xdc00; ++lead) { 1126 uint32_t maxValue=utrie2_get32(norm16Trie, lead); 1127 utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue); 1128 if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] && 1129 maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO] 1130 ) { 1131 // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. 1132 // Otherwise it might end up at something like JAMO_VT which stays in 1133 // the inner decomposition quick check loop. 1134 maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1; 1135 } 1136 utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode); 1137 } 1138 1139 // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. 1140 // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) 1141 // which is harmless. 1142 // As a result, the minimum code points are always BMP code points. 1143 int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; 1144 if(minCP>=0x10000) { 1145 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); 1146 } 1147 minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; 1148 if(minCP>=0x10000) { 1149 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); 1150 } 1151 1152 utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); 1153 norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); 1154 if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { 1155 fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n", 1156 errorCode.errorName()); 1157 exit(errorCode.reset()); 1158 } 1159 errorCode.reset(); 1160 1161 int32_t offset=(int32_t)sizeof(indexes); 1162 indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; 1163 offset+=norm16TrieLength; 1164 indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; 1165 offset+=extraData.length()*2; 1166 indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; 1167 offset+=sizeof(smallFCD); 1168 int32_t totalSize=offset; 1169 for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { 1170 indexes[i]=totalSize; 1171 } 1172 1173 if(beVerbose) { 1174 printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); 1175 printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); 1176 printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD)); 1177 printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); 1178 printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); 1179 printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); 1180 printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); 1181 printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]); 1182 printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); 1183 printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); 1184 printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); 1185 } 1186 1187 UVersionInfo nullVersion={ 0, 0, 0, 0 }; 1188 if(0==memcmp(nullVersion, unicodeVersion, 4)) { 1189 u_versionFromString(unicodeVersion, U_UNICODE_VERSION); 1190 } 1191 memcpy(dataInfo.dataVersion, unicodeVersion, 4); 1192} 1193 1194void Normalizer2DataBuilder::writeBinaryFile(const char *filename) { 1195 processData(); 1196 1197 IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); 1198 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); 1199 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); 1200 errorCode.assertSuccess(); 1201 1202 UNewDataMemory *pData= 1203 udata_create(NULL, NULL, filename, &dataInfo, 1204 haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); 1205 if(errorCode.isFailure()) { 1206 fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", 1207 filename, errorCode.errorName()); 1208 exit(errorCode.reset()); 1209 } 1210 udata_writeBlock(pData, indexes, sizeof(indexes)); 1211 udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); 1212 udata_writeUString(pData, extraData.getBuffer(), extraData.length()); 1213 udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); 1214 int32_t writtenSize=udata_finish(pData, errorCode); 1215 if(errorCode.isFailure()) { 1216 fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); 1217 exit(errorCode.reset()); 1218 } 1219 int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 1220 if(writtenSize!=totalSize) { 1221 fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", 1222 (long)writtenSize, (long)totalSize); 1223 exit(U_INTERNAL_PROGRAM_ERROR); 1224 } 1225} 1226 1227void 1228Normalizer2DataBuilder::writeCSourceFile(const char *filename) { 1229 processData(); 1230 1231 IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()"); 1232 const char *basename=findBasename(filename); 1233 CharString path(filename, (int32_t)(basename-filename), errorCode); 1234 CharString dataName(basename, errorCode); 1235 const char *extension=strrchr(basename, '.'); 1236 if(extension!=NULL) { 1237 dataName.truncate((int32_t)(extension-basename)); 1238 } 1239 errorCode.assertSuccess(); 1240 1241 LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); 1242 utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); 1243 errorCode.assertSuccess(); 1244 1245 FILE *f=usrc_create(path.data(), basename, "icu/source/tools/gennorm2/n2builder.cpp"); 1246 if(f==NULL) { 1247 fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n", 1248 filename); 1249 exit(U_FILE_ACCESS_ERROR); 1250 return; 1251 } 1252 fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f); 1253 char line[100]; 1254 sprintf(line, "static const UVersionInfo %s_formatVersion={", dataName.data()); 1255 usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n"); 1256 sprintf(line, "static const UVersionInfo %s_dataVersion={", dataName.data()); 1257 usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n"); 1258 sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", 1259 dataName.data()); 1260 usrc_writeArray(f, 1261 line, 1262 indexes, 32, Normalizer2Impl::IX_COUNT, 1263 "\n};\n\n"); 1264 sprintf(line, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName.data()); 1265 usrc_writeUTrie2Arrays(f, 1266 line, NULL, 1267 norm16Trie, 1268 "\n};\n\n"); 1269 sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", dataName.data()); 1270 usrc_writeArray(f, 1271 line, 1272 extraData.getBuffer(), 16, extraData.length(), 1273 "\n};\n\n"); 1274 sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName.data()); 1275 usrc_writeArray(f, 1276 line, 1277 smallFCD, 8, sizeof(smallFCD), 1278 "\n};\n\n"); 1279 /*fputs( // TODO 1280 "static const UCaseProps %s_singleton={\n" 1281 " NULL,\n" 1282 " %s_indexes,\n" 1283 " %s_extraData,\n" 1284 " %s_smallFCD,\n", 1285 f);*/ 1286 sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data()); 1287 char line2[100]; 1288 sprintf(line2, "%s_trieIndex", dataName.data()); 1289 usrc_writeUTrie2Struct(f, 1290 line, 1291 norm16Trie, line2, NULL, 1292 "};\n"); 1293 fputs("\n#endif // INCLUDED_FROM_NORMALIZER2_CPP\n", f); 1294 fclose(f); 1295} 1296 1297U_NAMESPACE_END 1298 1299#endif /* #if !UCONFIG_NO_NORMALIZATION */ 1300 1301/* 1302 * Hey, Emacs, please set the following: 1303 * 1304 * Local Variables: 1305 * indent-tabs-mode: nil 1306 * End: 1307 */ 1308