/*
*******************************************************************************
*
*   Copyright (C) 2009-2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  n2builder.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2009nov25
*   created by: Markus W. Scherer
*
* Builds Normalizer2 data and writes a binary .nrm file.
* For the file format see source/common/normalizer2impl.h.
1850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho*/ 1950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/utypes.h" 2127f654740f2a26ad62a5c155af9199af9e69b889claireho#include "n2builder.h" 2250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 2350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include <stdio.h> 2450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include <stdlib.h> 2550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include <string.h> 2650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if U_HAVE_STD_STRING 2750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include <vector> 2850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif 2950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/errorcode.h" 3050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/localpointer.h" 3150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/putil.h" 3250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/udata.h" 3350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/uniset.h" 3450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/unistr.h" 3550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unicode/ustring.h" 36f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "charstr.h" 3750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "hash.h" 3850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "normalizer2impl.h" 3950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "toolutil.h" 4050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "unewdata.h" 4150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "utrie2.h" 4227f654740f2a26ad62a5c155af9199af9e69b889claireho#include "uvectr32.h" 43f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius#include "writesrc.h" 4450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 4550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#if !UCONFIG_NO_NORMALIZATION 
4650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 4750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/* UDataInfo cf. udata.h */ 4850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UDataInfo dataInfo={ 4950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho sizeof(UDataInfo), 5050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 0, 5150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 5250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_IS_BIG_ENDIAN, 5350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_CHARSET_FAMILY, 5450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U_SIZEOF_UCHAR, 5550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 0, 5650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 5750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */ 5883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius { 2, 0, 0, 0 }, /* formatVersion */ 5950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { 5, 2, 0, 0 } /* dataVersion (Unicode version) */ 6050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 6150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 6250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_BEGIN 6350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 6450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass HangulIterator { 6550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic: 6650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho struct Range { 6750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 start, limit; 6850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16; 6950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho }; 7050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 7150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho HangulIterator() : rangeIndex(0) {} 7250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const Range *nextRange() { 73f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(rangeIndex<UPRV_LENGTHOF(ranges)) { 7450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return ranges+rangeIndex++; 
7550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 7650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return NULL; 7750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 7850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 7950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void reset() { rangeIndex=0; } 8050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoprivate: 8150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho static const Range ranges[4]; 8250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t rangeIndex; 8350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 8450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 8550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst HangulIterator::Range HangulIterator::ranges[4]={ 8650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 }, 8750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT }, 8850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // JAMO_T_BASE+1: not U+11A7 8950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT }, 9050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 }, // will become minYesNo 9150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 9250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 9350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostruct CompositionPair { 9450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} 9550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 trail, composite; 9650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 9750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 9850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostruct Norm { 9950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho enum MappingType { NONE, REMOVED, 
ROUND_TRIP, ONE_WAY }; 10050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 10150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool hasMapping() const { return mappingType>REMOVED; } 10250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 10350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Requires hasMapping() and well-formed mapping. 10450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void setMappingCP() { 10550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c; 10650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { 10750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho mappingCP=c; 10850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 10950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho mappingCP=U_SENTINEL; 11050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 11150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 11250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 11327f654740f2a26ad62a5c155af9199af9e69b889claireho const CompositionPair *getCompositionPairs(int32_t &length) const { 11427f654740f2a26ad62a5c155af9199af9e69b889claireho if(compositions==NULL) { 11527f654740f2a26ad62a5c155af9199af9e69b889claireho length=0; 11627f654740f2a26ad62a5c155af9199af9e69b889claireho return NULL; 11727f654740f2a26ad62a5c155af9199af9e69b889claireho } else { 11827f654740f2a26ad62a5c155af9199af9e69b889claireho length=compositions->size()/2; 11927f654740f2a26ad62a5c155af9199af9e69b889claireho return reinterpret_cast<const CompositionPair *>(compositions->getBuffer()); 12027f654740f2a26ad62a5c155af9199af9e69b889claireho } 12127f654740f2a26ad62a5c155af9199af9e69b889claireho } 12227f654740f2a26ad62a5c155af9199af9e69b889claireho 12350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString *mapping; 12483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UnicodeString *rawMapping; // non-NULL if the mapping is further decomposed 12550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 
mappingCP; // >=0 if mapping to 1 code point 12650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t mappingPhase; 12750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho MappingType mappingType; 12850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 12927f654740f2a26ad62a5c155af9199af9e69b889claireho UVector32 *compositions; // (trail, composite) pairs 13050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t cc; 13150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool combinesBack; 13250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool hasNoCompBoundaryAfter; 13350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 13450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho enum OffsetType { 13583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius OFFSET_NONE, 13683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Composition for back-combining character. Allowed, but not normally used. 13783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius OFFSET_MAYBE_YES, 13883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Composition for a starter that does not have a decomposition mapping. 13983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius OFFSET_YES_YES, 14083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Round-trip mapping & composition for a starter. 14183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius OFFSET_YES_NO_MAPPING_AND_COMPOSITION, 14283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Round-trip mapping for a starter that itself does not combine-forward. 14383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius OFFSET_YES_NO_MAPPING_ONLY, 14483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // One-way mapping. 14583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius OFFSET_NO_NO, 14683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Delta for an algorithmic one-way mapping. 
14750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho OFFSET_DELTA 14850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho }; 14950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 }; 15050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t offset; 15150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 15250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 15350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass Normalizer2DBEnumerator { 15450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic: 15550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {} 15650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho virtual ~Normalizer2DBEnumerator() {} 15750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0; 15850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Normalizer2DBEnumerator *ptr() { return this; } 15950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoprotected: 16050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Normalizer2DataBuilder &builder; 16150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 16250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 16350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CDECL_BEGIN 16450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 16550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UBool U_CALLCONV 16650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoenumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) { 16750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value); 16850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 16950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 17050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CDECL_END 17150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
17250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) : 173f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL), 174f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius norm16TrieLength(0) { 17550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho memset(unicodeVersion, 0, sizeof(unicodeVersion)); 17650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho normTrie=utrie2_open(0, 0, &errorCode); 17750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm)); 17850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norms=allocNorm(); // unused Norm struct at index 0 17950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho memset(indexes, 0, sizeof(indexes)); 18083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius memset(smallFCD, 0, sizeof(smallFCD)); 18150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 18250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 18350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::~Normalizer2DataBuilder() { 18450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_close(normTrie); 18550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t normsLength=utm_countItems(normMem); 18650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(int32_t i=1; i<normsLength; ++i) { 18750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho delete norms[i].mapping; 18883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius delete norms[i].rawMapping; 18950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho delete norms[i].compositions; 19050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 19150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utm_close(normMem); 19250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_close(norm16Trie); 19350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 19450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
19550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid 19650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::setUnicodeVersion(const char *v) { 19783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UVersionInfo nullVersion={ 0, 0, 0, 0 }; 19883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UVersionInfo version; 19983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius u_versionFromString(version, v); 20083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) && 20183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius 0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH) 20283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius ) { 20383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius char buffer[U_MAX_VERSION_STRING_LENGTH]; 20483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius u_versionToString(unicodeVersion, buffer); 20583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. 
%s\n", 20683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius buffer, v); 20783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius exit(U_ILLEGAL_ARGUMENT_ERROR); 20883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 20983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH); 21050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 21150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 21250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNorm *Normalizer2DataBuilder::allocNorm() { 21350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Norm *p=(Norm *)utm_alloc(normMem); 21450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norms=(Norm *)utm_getStart(normMem); // in case it got reallocated 21550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return p; 21650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 21750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 21850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/* get an existing Norm unit */ 21950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNorm *Normalizer2DataBuilder::getNorm(UChar32 c) { 22050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint32_t i=utrie2_get32(normTrie, c); 22150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(i==0) { 22250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return NULL; 22350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 22450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return norms+i; 22550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 22650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 22750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoconst Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const { 22850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return norms[utrie2_get32(normTrie, c)]; 22950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 23050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 23150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/* 23250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * get or create a Norm unit; 
23350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * get or create the intermediate trie entries for it as well 23450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 23550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNorm *Normalizer2DataBuilder::createNorm(UChar32 c) { 23650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint32_t i=utrie2_get32(normTrie, c); 23750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(i!=0) { 23850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return norms+i; 23950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 24050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho /* allocate Norm */ 24150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Norm *p=allocNorm(); 24250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IcuToolErrorCode errorCode("gennorm2/createNorm()"); 24350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode); 24450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return p; 24550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 24650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 24750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 24850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNorm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) { 24950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p!=NULL) { 25050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->mappingType!=Norm::NONE) { 25150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( overrideHandling==OVERRIDE_NONE || 25250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase) 25350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 25450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 25550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "error in gennorm2 phase %d: " 25650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "not permitted to override mapping for U+%04lX from phase %d\n", 25750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
(int)phase, (long)c, (int)p->mappingPhase); 25850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 25950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 26050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho delete p->mapping; 26150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->mapping=NULL; 26250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 26350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->mappingPhase=phase; 26450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 26550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return p; 26650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 26750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 26850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) { 26950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho overrideHandling=oh; 27050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++phase; 27150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 27250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 27350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) { 27450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho createNorm(c)->cc=cc; 27550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 27650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 27750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehouint8_t Normalizer2DataBuilder::getCC(UChar32 c) const { 27850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return getNormRef(c).cc; 27950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 28050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 28150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UBool isWellFormed(const UnicodeString &s) { 28250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UErrorCode errorCode=U_ZERO_ERROR; 28350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode); 28450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return 
U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR; 28550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 28650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 28750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) { 28850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!isWellFormed(m)) { 28950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 29050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "error in gennorm2 phase %d: " 29150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "illegal one-way mapping from U+%04lX to malformed string\n", 29250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (int)phase, (long)c); 29350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 29450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 29550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Norm *p=checkNormForMapping(createNorm(c), c); 29650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->mapping=new UnicodeString(m); 29750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->mappingType=Norm::ONE_WAY; 29850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->setMappingCP(); 29950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 30050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 30150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) { 30250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(U_IS_SURROGATE(c)) { 30350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 30450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "error in gennorm2 phase %d: " 30550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "illegal round-trip mapping from surrogate code point U+%04lX\n", 30650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (int)phase, (long)c); 30750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 30850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 
30950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(!isWellFormed(m)) { 31050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 31150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "error in gennorm2 phase %d: " 31250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "illegal round-trip mapping from U+%04lX to malformed string\n", 31350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (int)phase, (long)c); 31450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 31550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 31650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t numCP=u_countChar32(m.getBuffer(), m.length()); 31750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(numCP!=2) { 31850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 31950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "error in gennorm2 phase %d: " 32050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "illegal round-trip mapping from U+%04lX to %d!=2 code points\n", 32150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (int)phase, (long)c, (int)numCP); 32250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 32350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 32450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Norm *p=checkNormForMapping(createNorm(c), c); 32550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->mapping=new UnicodeString(m); 32650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->mappingType=Norm::ROUND_TRIP; 32750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->mappingCP=U_SENTINEL; 32850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 32950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 33050294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::removeMapping(UChar32 c) { 33150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Norm *p=checkNormForMapping(getNorm(c), c); 33250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p!=NULL) { 33350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
p->mappingType=Norm::REMOVED; 33450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 33550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 33650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 33750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass CompositionBuilder : public Normalizer2DBEnumerator { 33850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic: 33950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 34050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 34150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho builder.addComposition(start, end, value); 34250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 34350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 34450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 34550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 34650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid 34750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) { 34850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(norms[value].mappingType==Norm::ROUND_TRIP) { 34950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(start!=end) { 35050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 35150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: same round-trip mapping for " 35250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "more than 1 code point U+%04lX..U+%04lX\n", 35350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)start, (long)end); 35450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 35550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 35650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(norms[value].cc!=0) { 35750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 35850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: " 
35950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "U+%04lX has a round-trip mapping and ccc!=0, " 36050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "not possible in Unicode normalization\n", 36150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)start); 36250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 36350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 36450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // setRoundTripMapping() ensured that there are exactly two code points. 36550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UnicodeString &m=*norms[value].mapping; 36650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 lead=m.char32At(0); 36750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 trail=m.char32At(m.length()-1); 36850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(getCC(lead)!=0) { 36950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 37050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: " 37150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " 37250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "not possible in Unicode normalization\n", 37350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)start, (long)lead); 37450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 37550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 37650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Flag for trailing character. 37750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho createNorm(trail)->combinesBack=TRUE; 37850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Insert (trail, composite) pair into compositions list for the lead character. 
37927f654740f2a26ad62a5c155af9199af9e69b889claireho IcuToolErrorCode errorCode("gennorm2/addComposition()"); 38050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Norm *leadNorm=createNorm(lead); 38127f654740f2a26ad62a5c155af9199af9e69b889claireho UVector32 *compositions=leadNorm->compositions; 38227f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t i; 38350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(compositions==NULL) { 38427f654740f2a26ad62a5c155af9199af9e69b889claireho compositions=leadNorm->compositions=new UVector32(errorCode); 38527f654740f2a26ad62a5c155af9199af9e69b889claireho i=0; // "insert" the first pair at index 0 38650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 38750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Insertion sort, and check for duplicate trail characters. 38827f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t length; 38927f654740f2a26ad62a5c155af9199af9e69b889claireho const CompositionPair *pairs=leadNorm->getCompositionPairs(length); 39027f654740f2a26ad62a5c155af9199af9e69b889claireho for(i=0; i<length; ++i) { 39127f654740f2a26ad62a5c155af9199af9e69b889claireho if(trail==pairs[i].trail) { 39250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 39350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: same round-trip mapping for " 39450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", 39550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)start, (long)lead, (long)trail); 39650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 39750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 39827f654740f2a26ad62a5c155af9199af9e69b889claireho if(trail<pairs[i].trail) { 39950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 40050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 40150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 40250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 
40327f654740f2a26ad62a5c155af9199af9e69b889claireho compositions->insertElementAt(trail, 2*i, errorCode); 40427f654740f2a26ad62a5c155af9199af9e69b889claireho compositions->insertElementAt(start, 2*i+1, errorCode); 40550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 40650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 40750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 40850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm, 40950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t lowCC, uint8_t highCC) const { 41027f654740f2a26ad62a5c155af9199af9e69b889claireho if((highCC-lowCC)>=2) { 41127f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t length; 41227f654740f2a26ad62a5c155af9199af9e69b889claireho const CompositionPair *pairs=norm.getCompositionPairs(length); 41327f654740f2a26ad62a5c155af9199af9e69b889claireho for(int32_t i=0; i<length; ++i) { 41427f654740f2a26ad62a5c155af9199af9e69b889claireho uint8_t trailCC=getCC(pairs[i].trail); 41550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(lowCC<trailCC && trailCC<highCC) { 41650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 41750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 41850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 41950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 42050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; 42150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 42250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 42350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const { 42427f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t length; 42527f654740f2a26ad62a5c155af9199af9e69b889claireho const CompositionPair *pairs=norm.getCompositionPairs(length); 42627f654740f2a26ad62a5c155af9199af9e69b889claireho for(int32_t i=0; i<length; ++i) { 42727f654740f2a26ad62a5c155af9199af9e69b889claireho if(trail==pairs[i].trail) { 
42827f654740f2a26ad62a5c155af9199af9e69b889claireho return pairs[i].composite; 42927f654740f2a26ad62a5c155af9199af9e69b889claireho } 43027f654740f2a26ad62a5c155af9199af9e69b889claireho if(trail<pairs[i].trail) { 43127f654740f2a26ad62a5c155af9199af9e69b889claireho break; 43250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 43350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 43450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return U_SENTINEL; 43550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 43650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 43750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass Decomposer : public Normalizer2DBEnumerator { 43850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic: 43950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {} 44050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 44150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho didDecompose|=builder.decompose(start, end, value); 44250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 44350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 44450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool didDecompose; 44550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 44650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 44750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool 44850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) { 44950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(norms[value].hasMapping()) { 45083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius Norm &norm=norms[value]; 45183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius const UnicodeString &m=*norm.mapping; 45250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString *decomposed=NULL; 45350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar 
// Performs one pass of decomposition on the mapping of norms[value],
// which is shared by the code points start..end.
//
// Each code point of the mapping that itself has a mapping is replaced by that mapping,
// and each Hangul syllable is replaced by the output of Hangul::decompose().
// If anything was replaced, the new string becomes norm.mapping and, the first time
// this happens, the previous mapping is kept as norm.rawMapping.
//
// Returns TRUE if the mapping was replaced, FALSE if norms[value] has no mapping
// or nothing in it needed replacing.
// Prints a message to stderr and exits with U_INVALID_FORMAT_ERROR when the mapping
// contains a code point from start..end, or when a round-trip mapping contains
// a character that decomposes in one of the ways rejected below.
UBool
Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
    if(norms[value].hasMapping()) {
        Norm &norm=norms[value];
        const UnicodeString &m=*norm.mapping;
        // Allocated lazily, only once the first replaceable character is found.
        UnicodeString *decomposed=NULL;
        const UChar *s=m.getBuffer();
        int32_t length=m.length();
        // prev is the start index of the current code point, i the index after it.
        int32_t prev, i=0;
        UChar32 c;
        while(i<length) {
            prev=i;
            U16_NEXT(s, i, length, c);
            if(start<=c && c<=end) {
                fprintf(stderr,
                        "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
                        (long)c);
                exit(U_INVALID_FORMAT_ERROR);
            }
            const Norm &cNorm=getNormRef(c);
            if(cNorm.hasMapping()) {
                if(norm.mappingType==Norm::ROUND_TRIP) {
                    // In a round-trip mapping only the first character (prev==0) may decompose.
                    if(prev==0) {
                        if(cNorm.mappingType!=Norm::ROUND_TRIP) {
                            fprintf(stderr,
                                    "gennorm2 error: "
                                    "U+%04lX's round-trip mapping's starter "
                                    "U+%04lX one-way-decomposes, "
                                    "not possible in Unicode normalization\n",
                                    (long)start, (long)c);
                            exit(U_INVALID_FORMAT_ERROR);
                        }
                        // Compare the combining class of the character following c in this mapping
                        // with that of the last character of c's own mapping.
                        uint8_t myTrailCC=getCC(m.char32At(i));
                        UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
                        uint8_t cTrailCC=getCC(cTrailChar);
                        if(cTrailCC>myTrailCC) {
                            fprintf(stderr,
                                    "gennorm2 error: "
                                    "U+%04lX's round-trip mapping's starter "
                                    "U+%04lX decomposes and the "
                                    "inner/earlier tccc=%hu > outer/following tccc=%hu, "
                                    "not possible in Unicode normalization\n",
                                    (long)start, (long)c,
                                    (short)cTrailCC, (short)myTrailCC);
                            exit(U_INVALID_FORMAT_ERROR);
                        }
                    } else {
                        fprintf(stderr,
                                "gennorm2 error: "
                                "U+%04lX's round-trip mapping's non-starter "
                                "U+%04lX decomposes, "
                                "not possible in Unicode normalization\n",
                                (long)start, (long)c);
                        exit(U_INVALID_FORMAT_ERROR);
                    }
                }
                if(decomposed==NULL) {
                    // Start the new string with the part of m before c.
                    decomposed=new UnicodeString(m, 0, prev);
                }
                decomposed->append(*cNorm.mapping);
            } else if(Hangul::isHangul(c)) {
                UChar buffer[3];
                int32_t hangulLength=Hangul::decompose(c, buffer);
                if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
                    fprintf(stderr,
                            "gennorm2 error: "
                            "U+%04lX's round-trip mapping's non-starter "
                            "U+%04lX decomposes, "
                            "not possible in Unicode normalization\n",
                            (long)start, (long)c);
                    exit(U_INVALID_FORMAT_ERROR);
                }
                if(decomposed==NULL) {
                    // Start the new string with the part of m before c.
                    decomposed=new UnicodeString(m, 0, prev);
                }
                decomposed->append(buffer, hangulLength);
            } else if(decomposed!=NULL) {
                // c stays as is; copy it only if a new string is already being built.
                decomposed->append(m, prev, i-prev);
            }
        }
        if(decomposed!=NULL) {
            if(norm.rawMapping==NULL) {
                // Remember the original mapping when decomposing recursively.
                norm.rawMapping=norm.mapping;
            } else {
                // The original is already saved; the intermediate mapping is no longer needed.
                delete norm.mapping;
            }
            norm.mapping=decomposed;
            // Not  norm.setMappingCP();  because the original mapping
            // is most likely to be encodable as a delta.
            return TRUE;
        }
    }
    return FALSE;
}
55450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 charAt(int32_t i) const { return fArray[i]>>8; } 55550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } 55650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool didReorder() const { return fDidReorder; } 55750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void append(UChar32 c, uint8_t cc) { 55850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) { 55950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(cc==0) { 56050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fLastStarterIndex=fLength; 56150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 56250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fArray[fLength++]=(c<<8)|cc; 56350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return; 56450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 56550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Let this character bubble back to its canonical order. 56650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t i=fLength-1; 56750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(i>fLastStarterIndex && ccAt(i)>cc) { 56850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --i; 56950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 57050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++i; // after the last starter or prevCC<=cc 57150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Move this and the following characters forward one to make space. 
57250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(int32_t j=fLength; i<j; --j) { 57350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fArray[j]=fArray[j-1]; 57450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 57550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fArray[i]=(c<<8)|cc; 57650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++fLength; 57750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fDidReorder=TRUE; 57850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 57950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void toString(UnicodeString &dest) { 58050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dest.remove(); 58150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(int32_t i=0; i<fLength; ++i) { 58250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dest.append(charAt(i)); 58350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 58450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 58550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho void setComposite(UChar32 composite, int32_t combMarkIndex) { 58650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fArray[fLastStarterIndex]=composite<<8; 58750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Remove the combining mark that contributed to the composite. 
58850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho --fLength; 58950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(combMarkIndex<fLength) { 59050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fArray[combMarkIndex]=fArray[combMarkIndex+1]; 59150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++combMarkIndex; 59250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 59350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 59450294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoprivate: 59550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; 59650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t fLength; 59750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t fLastStarterIndex; 59850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool fDidReorder; 59950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 60050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 60150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid 60250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoNormalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) { 60350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString &m=*p->mapping; 60450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t length=m.length(); 60550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 60650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return; // writeMapping() will complain about it and print the code point. 
60750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 60850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const UChar *s=m.getBuffer(); 60950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t i=0; 61050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 c; 61150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while(i<length) { 61250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho U16_NEXT(s, i, length, c); 61350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.append(c, getCC(c)); 61450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 61550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(buffer.didReorder()) { 61650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.toString(m); 61750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 61850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 61950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 62083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius/* 62183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter(). 62283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * A starter character with a mapping does not have a composition boundary after it 62383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * if the character itself combines-forward (which is tested by the caller of this function), 62483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * or it is deleted (mapped to the empty string), 62583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * or its mapping contains no starter, 62683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius * or the last starter combines-forward. 
62783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius */ 62850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoUBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) { 62950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(buffer.isEmpty()) { 63083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return TRUE; // maps-to-empty-string is no boundary of any kind 63150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 63250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t lastStarterIndex=buffer.lastStarterIndex(); 63350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(lastStarterIndex<0) { 63450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; // no starter 63550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 63650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 starter=buffer.charAt(lastStarterIndex); 63750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( Hangul::isJamoL(starter) || 63850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (Hangul::isJamoV(starter) && 63950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1))) 64050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 64150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // A Jamo leading consonant or an LV pair combines-forward if it is at the end, 64250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // otherwise it is blocked. 64350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return lastStarterIndex==buffer.length()-1; 64450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 64583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Note: There can be no Hangul syllable in the fully decomposed mapping. 
64650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const Norm *starterNorm=&getNormRef(starter); 64750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(starterNorm->compositions==NULL) { 64850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; // the last starter does not combine forward 64950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 65050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Compose as far as possible, and see if further compositions are possible. 65150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t prevCC=0; 65250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) { 65350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter 65450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(combinesWithCCBetween(*starterNorm, prevCC, cc)) { 65550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 65650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 65750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( prevCC<cc && 65850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0 65950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 66050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho buffer.setComposite(starter, combMarkIndex); 66150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho starterNorm=&getNormRef(starter); 66250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(starterNorm->compositions==NULL) { 66350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return FALSE; // the composite does not combine further 66450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 66550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 66650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho prevCC=cc; 66750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ++combMarkIndex; 66850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 
66950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 67050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // TRUE if the final, forward-combining starter is at the end. 67150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return prevCC==0; 67250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 67350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 67450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Requires p->hasMapping(). 67583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius// Returns the offset of the "first unit" from the beginning of the extraData for c. 67683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word. 67783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Corneliusint32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) { 67850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString &m=*p->mapping; 67950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t length=m.length(); 68050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) { 68150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 68250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: " 68350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "mapping for U+%04lX longer than maximum of %d\n", 68450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); 68550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 68650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 68750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t leadCC, trailCC; 68850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(length==0) { 68950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho leadCC=trailCC=0; 69050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 69150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho leadCC=getCC(m.char32At(0)); 
69250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho trailCC=getCC(m.char32At(length-1)); 69350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 69450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) { 69550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 69650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: " 69750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n", 69850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)c); 69950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 70050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 70183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Write small-FCD data. 70283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if((leadCC|trailCC)!=0) { 70383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UChar32 lead= c<=0xffff ? c : U16_LEAD(c); 70483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); 70583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 70683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Write the mapping & raw mapping extraData. 
70750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t firstUnit=length|(trailCC<<8); 70883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius int32_t preMappingLength=0; 70983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(p->rawMapping!=NULL) { 71083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UnicodeString &rm=*p->rawMapping; 71183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius int32_t rmLength=rm.length(); 71283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) { 71383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius fprintf(stderr, 71483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius "gennorm2 error: " 71583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius "raw mapping for U+%04lX longer than maximum of %d\n", 71683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); 71783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius exit(U_INVALID_FORMAT_ERROR); 71883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 71983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UChar rm0=rm.charAt(0); 72083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if( rmLength==length-1 && 72183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // 99: overlong substring lengths get pinned to remainder lengths anyway 72283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius 0==rm.compare(1, 99, m, 2, 99) && 72383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius rm0>Normalizer2Impl::MAPPING_LENGTH_MASK 72483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius ) { 72583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Compression: 72683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // rawMapping=rm0+mapping.substring(2) -> store only rm0 72783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // 72883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // The raw mapping is the same as the final mapping 
after replacing 72983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // the final mapping's first two code units with the raw mapping's first one. 73083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // In this case, we store only that first unit, rm0. 73183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // This helps with a few hundred mappings. 73283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius dataString.append(rm0); 73383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius preMappingLength=1; 73483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } else { 73583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Store the raw mapping with its length. 73683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius dataString.append(rm); 73783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius dataString.append((UChar)rmLength); 73883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius preMappingLength=rmLength+1; 73983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 74083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING; 74150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 74283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius int32_t cccLccc=p->cc|(leadCC<<8); 74383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(cccLccc!=0) { 74483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius dataString.append((UChar)cccLccc); 74583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius ++preMappingLength; 74683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; 74750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 74850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->hasNoCompBoundaryAfter) { 74950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER; 75050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 
75150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dataString.append((UChar)firstUnit); 75250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dataString.append(m); 75383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius return preMappingLength; 75450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 75550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 75650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho// Requires p->compositions!=NULL. 75750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) { 75850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->cc!=0) { 75950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 76050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: " 76150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", 76250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)c); 76350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 76450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 76527f654740f2a26ad62a5c155af9199af9e69b889claireho int32_t length; 76627f654740f2a26ad62a5c155af9199af9e69b889claireho const CompositionPair *pairs=p->getCompositionPairs(length); 76750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(int32_t i=0; i<length; ++i) { 76827f654740f2a26ad62a5c155af9199af9e69b889claireho const CompositionPair &pair=pairs[i]; 76950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 22 bits for the composite character and whether it combines forward. 77050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UChar32 compositeAndFwd=pair.composite<<1; 77150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(getNormRef(pair.composite).compositions!=NULL) { 77250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho compositeAndFwd|=1; // The composite character also combines-forward. 
77350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 77450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Encode most pairs in two units and some in three. 77550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t firstUnit, secondUnit, thirdUnit; 77650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { 77750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(compositeAndFwd<=0xffff) { 77850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho firstUnit=pair.trail<<1; 77950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho secondUnit=compositeAndFwd; 78050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho thirdUnit=-1; 78150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 78250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; 78350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho secondUnit=compositeAndFwd>>16; 78450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho thirdUnit=compositeAndFwd; 78550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 78650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 78750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ 78850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| 78950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Normalizer2Impl::COMP_1_TRIPLE; 79050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| 79150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (compositeAndFwd>>16); 79250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho thirdUnit=compositeAndFwd; 79350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 79450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Set the high bit of the first unit if this is the last composition pair. 
79550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(i==(length-1)) { 79650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; 79750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 79850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dataString.append((UChar)firstUnit).append((UChar)secondUnit); 79950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(thirdUnit>=0) { 80050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho dataString.append((UChar)thirdUnit); 80150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 80250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 80350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 80450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 80550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass ExtraDataWriter : public Normalizer2DBEnumerator { 80650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic: 80750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ExtraDataWriter(Normalizer2DataBuilder &b) : 80850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Normalizer2DBEnumerator(b), 80950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions 81083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data 81150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 81250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(value!=0) { 81350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(start!=end) { 81450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 81550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: unexpected shared data for " 81650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "multiple code points U+%04lX..U+%04lX\n", 81750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)start, (long)end); 81850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
exit(U_INTERNAL_PROGRAM_ERROR); 81950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 82050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho builder.writeExtraData(start, value, *this); 82150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 82250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 82350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 82450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString maybeYesCompositions; 82550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString yesYesCompositions; 82683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UnicodeString yesNoMappingsAndCompositions; 82783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UnicodeString yesNoMappingsOnly; 82850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString noNoMappings; 82950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode. 83050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 83150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 83250294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) { 83350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Norm *p=norms+value; 83483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(!p->hasMapping()) { 83583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Write small-FCD data. 83683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // There is similar code in writeMapping() for characters that do have a mapping. 
83783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) { 83883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius fprintf(stderr, 83983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius "gennorm2 error: " 84083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n", 84183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius (long)c); 84283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius exit(U_INVALID_FORMAT_ERROR); 84383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 84483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(p->cc!=0) { 84583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UChar32 lead= c<=0xffff ? c : U16_LEAD(c); 84683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7); 84783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 84883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 84950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->combinesBack) { 85050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->hasMapping()) { 85150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 85250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: " 85350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n", 85450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)c); 85550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 85650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 85750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->compositions!=NULL) { 85850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->offset= 85950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)| 86050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Norm::OFFSET_MAYBE_YES; 
86150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho writeCompositions(c, p, writer.maybeYesCompositions); 86250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 86350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(!p->hasMapping()) { 86450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->compositions!=NULL) { 86550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->offset= 86650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)| 86750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Norm::OFFSET_YES_YES; 86850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho writeCompositions(c, p, writer.yesYesCompositions); 86950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 87050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(p->mappingType==Norm::ROUND_TRIP) { 87150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->compositions!=NULL) { 87283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius int32_t offset=writer.yesNoMappingsAndCompositions.length()+ 87383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius writeMapping(c, p, writer.yesNoMappingsAndCompositions); 87483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION; 87583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius writeCompositions(c, p, writer.yesNoMappingsAndCompositions); 87683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } else { 87783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius int32_t offset=writer.yesNoMappingsOnly.length()+ 87883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius writeMapping(c, p, writer.yesNoMappingsOnly); 87983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY; 88050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 88150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else /* one-way */ { 88250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
if(p->compositions!=NULL) { 88350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 88450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: " 88550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "U+%04lX combines-forward and has a one-way mapping, " 88650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "not possible in Unicode normalization\n", 88750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)c); 88850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 88950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 89050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->cc==0 && optimization!=OPTIMIZE_FAST) { 89150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Try a compact, algorithmic encoding. 89283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Only for ccc=0, because we can't store additional information 89383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // and we do not recursively follow an algorithmic encoding for access to the ccc. 89483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // 89583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding 89683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // if the mappingCP decomposes further, to ensure that there is a place to store it. 89783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // We want to see that the final mapping does not have exactly 1 code point, 89883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // or else we would have to recursively ensure that the final mapping is stored 89983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // in normal extraData. 
90083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) { 90150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t delta=p->mappingCP-c; 90250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { 90350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA; 90450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 90550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 90650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 90750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->offset==0) { 90850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t oldNoNoLength=writer.noNoMappings.length(); 90983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings); 91050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength); 91150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping); 91250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(previousOffset!=0) { 91350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Duplicate, remove the new units and point to the old ones. 91450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho writer.noNoMappings.truncate(oldNoNoLength); 91583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO; 91650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 91750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Enter this new mapping into the hashtable, avoiding value 0 which is "not found". 
91850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()"); 91983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode); 92083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO; 92150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 92250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 92350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 92450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 92550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 92650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoclass Norm16Writer : public Normalizer2DBEnumerator { 92750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehopublic: 92850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {} 92950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) { 93050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho builder.writeNorm16(start, end, value); 93150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 93250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 93350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho}; 93450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 93550294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) { 93650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(value!=0) { 93750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const Norm *p=norms+value; 93850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t offset=p->offset>>Norm::OFFSET_SHIFT; 93950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t norm16=0; 94050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isDecompNo=FALSE; 94150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UBool isCompNoMaybe=FALSE; 
94250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho switch(p->offset&Norm::OFFSET_MASK) { 94350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case Norm::OFFSET_NONE: 94450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // No mapping, no compositions list. 94550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(p->combinesBack) { 94650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc; 94750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isDecompNo=(UBool)(p->cc!=0); 94850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isCompNoMaybe=TRUE; 94950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else if(p->cc!=0) { 95050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc; 95150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isDecompNo=isCompNoMaybe=TRUE; 95250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 95350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 95450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case Norm::OFFSET_MAYBE_YES: 95550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset; 95650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isCompNoMaybe=TRUE; 95750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 95850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case Norm::OFFSET_YES_YES: 95950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=offset; 96050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 96183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION: 96250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset; 96350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isDecompNo=TRUE; 96450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 96583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius case Norm::OFFSET_YES_NO_MAPPING_ONLY: 96683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius 
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset; 96783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius isDecompNo=TRUE; 96883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius break; 96950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case Norm::OFFSET_NO_NO: 97050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset; 97150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isDecompNo=isCompNoMaybe=TRUE; 97250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 97350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho case Norm::OFFSET_DELTA: 97450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=getCenterNoNoDelta()+offset; 97550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho isDecompNo=isCompNoMaybe=TRUE; 97650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho break; 97750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho default: // Should not occur. 97850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INTERNAL_PROGRAM_ERROR); 97950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 98050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IcuToolErrorCode errorCode("gennorm2/writeNorm16()"); 98150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode); 98250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 98350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start; 98450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 98550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { 98650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start; 98750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 98850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 98950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 
99050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 99150294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::setHangulData() { 99250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho HangulIterator hi; 99350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho const HangulIterator::Range *range; 99450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Check that none of the Hangul/Jamo code points have data. 99550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while((range=hi.nextRange())!=NULL) { 99650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(UChar32 c=range->start; c<range->limit; ++c) { 99750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(utrie2_get32(norm16Trie, c)!=0) { 99850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 99950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: " 100050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n", 100150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)c); 100250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INVALID_FORMAT_ERROR); 100350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 100450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 100550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 100650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Set data for algorithmic runtime handling. 
100750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IcuToolErrorCode errorCode("gennorm2/setHangulData()"); 100850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho hi.reset(); 100950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho while((range=hi.nextRange())!=NULL) { 101050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint16_t norm16=range->norm16; 101150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(norm16==0) { 101250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul LV/LVT encoded as minYesNo 101350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) { 101450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start; 101550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 101650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } else { 101750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { // Jamo V/T are maybeYes 101850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start; 101950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 102050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 102150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode); 102250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho errorCode.assertSuccess(); 102350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 102450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 102550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 102650294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CDECL_BEGIN 102750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 102850294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehostatic UBool U_CALLCONV 102950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoenumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t 
value) { 103050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint32_t *pMaxValue=(uint32_t *)context; 103150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(value>*pMaxValue) { 103250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho *pMaxValue=value; 103350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 103450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho return TRUE; 103550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 103650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 103750294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_CDECL_END 103850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 103950294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehovoid Normalizer2DataBuilder::processData() { 104050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho IcuToolErrorCode errorCode("gennorm2/processData()"); 104150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho norm16Trie=utrie2_open(0, 0, errorCode); 104250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho errorCode.assertSuccess(); 104350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 104450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr()); 104550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 104650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Decomposer decomposer(*this); 104750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho do { 104850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho decomposer.didDecompose=FALSE; 104950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer); 105050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } while(decomposer.didDecompose); 105150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 105250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho BuilderReorderingBuffer buffer; 105350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t normsLength=utm_countItems(normMem); 105450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(int32_t i=1; i<normsLength; ++i) { 
105583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // Set the hasNoCompBoundaryAfter flag for use by the last code branch 105683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // in Normalizer2Impl::hasCompBoundaryAfter(). 105783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius // For details see the comments on hasNoCompBoundaryAfter(buffer). 105883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius const Norm &norm=norms[i]; 105983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(norm.hasMapping()) { 106083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(norm.compositions!=NULL) { 106183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius norms[i].hasNoCompBoundaryAfter=TRUE; 106283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } else { 106383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius buffer.reset(); 106483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius reorder(norms+i, buffer); 106583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer); 106683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 106750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 106850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 106950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 107050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000; 107150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000; 107250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 107350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ExtraDataWriter extraDataWriter(*this); 107450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter); 107550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 107650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho extraData=extraDataWriter.maybeYesCompositions; 107750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
extraData.append(extraDataWriter.yesYesCompositions). 107883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius append(extraDataWriter.yesNoMappingsAndCompositions). 107983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius append(extraDataWriter.yesNoMappingsOnly). 108050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho append(extraDataWriter.noNoMappings); 108150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Pad to even length for 4-byte alignment of following data. 108250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(extraData.length()&1) { 108350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho extraData.append((UChar)0); 108450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 108550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 108650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_YES_NO]= 108750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho extraDataWriter.yesYesCompositions.length(); 108883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]= 108950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_YES_NO]+ 109083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius extraDataWriter.yesNoMappingsAndCompositions.length(); 109183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius indexes[Normalizer2Impl::IX_MIN_NO_NO]= 109283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+ 109383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius extraDataWriter.yesNoMappingsOnly.length(); 109450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_LIMIT_NO_NO]= 109550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_NO_NO]+ 109650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho extraDataWriter.noNoMappings.length(); 109750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]= 
109850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho Normalizer2Impl::MIN_NORMAL_MAYBE_YES- 109950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho extraDataWriter.maybeYesCompositions.length(); 110050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 110150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA; 110250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) { 110350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, 110450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "gennorm2 error: " 110550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho "data structure overflow, too much mapping composition data\n"); 110650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_BUFFER_OVERFLOW_ERROR); 110750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 110850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 110950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr()); 111050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 111150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho setHangulData(); 111250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 111350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Look for the "worst" norm16 value of any supplementary code point 111450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // corresponding to a lead surrogate, and set it as that surrogate's value. 111550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Enables quick check inner loops to look at only code units. 111650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // 111750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // We could be more sophisticated: 111850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // We could collect a bit set for whether there are values in the different 111950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.) 
112050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // and select the best value that only breaks the composition and/or decomposition 112150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // inner loops if necessary. 112250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // However, that seems like overkill for an optimization for supplementary characters. 112350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho for(UChar lead=0xd800; lead<0xdc00; ++lead) { 112450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho uint32_t maxValue=utrie2_get32(norm16Trie, lead); 112550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue); 112650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] && 112750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO] 112850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho ) { 112950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0. 113050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Otherwise it might end up at something like JAMO_VT which stays in 113150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // the inner decomposition quick check loop. 113250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1; 113350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 113450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode); 113550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 113650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 113750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // Adjust supplementary minimum code points to break quick check loops at their lead surrogates. 
113850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate) 113950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // which is harmless. 114050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho // As a result, the minimum code points are always BMP code points. 114150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]; 114250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(minCP>=0x10000) { 114350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP); 114450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 114550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]; 114650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(minCP>=0x10000) { 114750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP); 114850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 114950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 115050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode); 1151f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode); 115250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) { 115350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n", 115450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho errorCode.errorName()); 115550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(errorCode.reset()); 115650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 115750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho errorCode.reset(); 115850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 115950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t 
offset=(int32_t)sizeof(indexes); 116050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset; 116150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho offset+=norm16TrieLength; 116250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset; 116383a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius offset+=extraData.length()*2; 116483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset; 116583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius offset+=sizeof(smallFCD); 116683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius int32_t totalSize=offset; 116783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) { 116850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho indexes[i]=totalSize; 116950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 117050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 117150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(beVerbose) { 117250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho printf("size of normalization trie: %5ld bytes\n", (long)norm16TrieLength); 117350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho printf("size of 16-bit extra data: %5ld uint16_t\n", (long)extraData.length()); 117483a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius printf("size of small-FCD data: %5ld bytes\n", (long)sizeof(smallFCD)); 117550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho printf("size of binary data file contents: %5ld bytes\n", (long)totalSize); 117650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]); 117750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]); 117850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 
printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]); 117983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]); 118050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]); 118150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]); 118250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]); 118350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 118450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 118583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius UVersionInfo nullVersion={ 0, 0, 0, 0 }; 118683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if(0==memcmp(nullVersion, unicodeVersion, 4)) { 118783a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius u_versionFromString(unicodeVersion, U_UNICODE_VERSION); 118883a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius } 118950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho memcpy(dataInfo.dataVersion, unicodeVersion, 4); 1190f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 1191f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 1192f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusvoid Normalizer2DataBuilder::writeBinaryFile(const char *filename) { 1193f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius processData(); 1194f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 1195f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()"); 1196f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); 1197f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); 
1198f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius errorCode.assertSuccess(); 1199f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 120050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho UNewDataMemory *pData= 120150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho udata_create(NULL, NULL, filename, &dataInfo, 120250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode); 120350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(errorCode.isFailure()) { 120450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n", 120550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho filename, errorCode.errorName()); 120650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(errorCode.reset()); 120750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 120850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho udata_writeBlock(pData, indexes, sizeof(indexes)); 120950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength); 121050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho udata_writeUString(pData, extraData.getBuffer(), extraData.length()); 121183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius udata_writeBlock(pData, smallFCD, sizeof(smallFCD)); 121250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho int32_t writtenSize=udata_finish(pData, errorCode); 121350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(errorCode.isFailure()) { 121450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName()); 121550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(errorCode.reset()); 121650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 1217f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE]; 121850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho if(writtenSize!=totalSize) { 
121950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n", 122050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho (long)writtenSize, (long)totalSize); 122150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho exit(U_INTERNAL_PROGRAM_ERROR); 122250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho } 122350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho} 122450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 1225f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusvoid 1226f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusNormalizer2DataBuilder::writeCSourceFile(const char *filename) { 1227f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius processData(); 1228f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 1229f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()"); 1230f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius const char *basename=findBasename(filename); 1231f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius CharString path(filename, (int32_t)(basename-filename), errorCode); 1232f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius CharString dataName(basename, errorCode); 1233f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius const char *extension=strrchr(basename, '.'); 1234f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(extension!=NULL) { 1235f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius dataName.truncate((int32_t)(extension-basename)); 1236f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 1237f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius errorCode.assertSuccess(); 1238f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 1239f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]); 1240f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode); 
1241f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius errorCode.assertSuccess(); 1242f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 1243f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius FILE *f=usrc_create(path.data(), basename, "icu/source/tools/gennorm2/n2builder.cpp"); 1244f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius if(f==NULL) { 1245f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n", 1246f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius filename); 1247f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius exit(U_FILE_ACCESS_ERROR); 1248f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius return; 1249f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius } 1250f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius char line[100]; 1251f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius sprintf(line, "static const UVersionInfo %s_formatVersion={", dataName.data()); 1252f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n"); 1253f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius sprintf(line, "static const UVersionInfo %s_dataVersion={", dataName.data()); 1254f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n"); 1255f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n", 1256f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius dataName.data()); 1257f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius usrc_writeArray(f, 1258f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius line, 1259f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius indexes, 32, Normalizer2Impl::IX_COUNT, 1260f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius "\n};\n\n"); 1261f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius sprintf(line, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName.data()); 
1262f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius usrc_writeUTrie2Arrays(f, 1263f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius line, NULL, 1264f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius norm16Trie, 1265f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius "\n};\n\n"); 1266f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", dataName.data()); 1267f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius usrc_writeArray(f, 1268f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius line, 1269f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius extraData.getBuffer(), 16, extraData.length(), 1270f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius "\n};\n\n"); 1271f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName.data()); 1272f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius usrc_writeArray(f, 1273f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius line, 1274f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius smallFCD, 8, sizeof(smallFCD), 1275f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius "\n};\n\n"); 1276f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius /*fputs( // TODO 1277f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius "static const UCaseProps %s_singleton={\n" 1278f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius " NULL,\n" 1279f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius " %s_indexes,\n" 1280f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius " %s_extraData,\n" 1281f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius " %s_smallFCD,\n", 1282f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius f);*/ 1283f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data()); 1284f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius char line2[100]; 1285f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius sprintf(line2, "%s_trieIndex", dataName.data()); 
1286f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius usrc_writeUTrie2Struct(f, 1287f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius line, 1288f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius norm16Trie, line2, NULL, 1289f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius "};\n"); 1290f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius fclose(f); 1291f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius} 1292f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius 129350294ead5e5d23f5bbfed76e00e6b510bd41eee1clairehoU_NAMESPACE_END 129450294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 129550294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#endif /* #if !UCONFIG_NO_NORMALIZATION */ 129650294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho 129750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho/* 129850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Hey, Emacs, please set the following: 129950294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * 130050294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * Local Variables: 130150294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * indent-tabs-mode: nil 130250294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho * End: 130350294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho */ 1304