10596faeddefbf198de137d5e893708495ab1584cFredrik Roubert// © 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/* 4fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 51b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert* Copyright (C) 2012-2015, International Business Machines 6fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Corporation and others. All Rights Reserved. 7fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 8fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* collationdatabuilder.cpp 9fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* 10fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* (replaced the former ucol_elm.cpp) 11fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* 12fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created on: 2012apr01 13fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created by: Markus W. Scherer 14fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*/ 15fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 16fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utypes.h" 17fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 18fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#if !UCONFIG_NO_COLLATION 19fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 20fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/localpointer.h" 21fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/uchar.h" 22fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/ucharstrie.h" 23fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/ucharstriebuilder.h" 24fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/uniset.h" 25fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/unistr.h" 26fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/usetiter.h" 27fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utf16.h" 28fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "cmemory.h" 29fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collation.h" 30fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationdata.h" 31fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationdatabuilder.h" 32fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationfastlatinbuilder.h" 33fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationiterator.h" 34fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "normalizer2impl.h" 35fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "utrie2.h" 36fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "uvectr32.h" 37fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "uvectr64.h" 38fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "uvector.h" 39fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 40fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_BEGIN 41fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 42fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::CEModifier::~CEModifier() {} 43fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 44fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/** 45fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Build-time context and CE32 for a code point. 46fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * If a code point has contextual mappings, then the default (no-context) mapping 47fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * and all conditional mappings are stored in a singly-linked list 48fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * of ConditionalCE32, sorted by context strings. 49fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 50fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Context strings sort by prefix length, then by prefix, then by contraction suffix. 51fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Context strings must be unique and in ascending order. 52fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 53fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstruct ConditionalCE32 : public UMemory { 54f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius ConditionalCE32() 55f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius : context(), 56f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius ce32(0), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), 57f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius next(-1) {} 58fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32(const UnicodeString &ct, uint32_t ce) 59fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : context(ct), 60fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32(ce), defaultCE32(Collation::NO_CE32), builtCE32(Collation::NO_CE32), 61fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius next(-1) {} 62fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 63fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius inline UBool hasContext() const { return context.length() > 1; } 64fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius inline int32_t prefixLength() const { return context.charAt(0); } 65fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 66fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 67fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * "\0" for the first entry for any code point, with its default CE32. 68fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 69fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Otherwise one unit with the length of the prefix string, 70fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * then the prefix string, then the contraction suffix. 71fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 72fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString context; 73fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 74fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * CE32 for the code point and its context. 75fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag). 76fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 77fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32; 78fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 79fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Default CE32 for all contexts with this same prefix. 80fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Initially NO_CE32. Set only while building runtime data structures, 81fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * and only on one of the nodes of a sub-list with the same prefix. 82fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 83fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t defaultCE32; 84fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 85fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * CE32 for the built contexts. 86fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * When fetching CEs from the builder, the contexts are built into their runtime form 87fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * so that the normal collation implementation can process them. 88fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * The result is cached in the list head. It is reset when the contexts are modified. 89fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 90fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t builtCE32; 91fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 92fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Index of the next ConditionalCE32. 93fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Negative for the end of the list. 94fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 95fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t next; 96fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 97fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 98fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CDECL_BEGIN 99fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CAPI void U_CALLCONV 101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuprv_deleteConditionalCE32(void *obj) { 102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete static_cast<ConditionalCE32 *>(obj); 103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CDECL_END 106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/** 108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Build-time collation element and character iterator. 109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Uses the runtime CollationIterator for fetching CEs for a string 110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * but reads from the builder's unfinished data structures. 111fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * In particular, this class reads from the unfinished trie 112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * and has to avoid CollationIterator::nextCE() and redirect other 113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * calls to data->getCE32() and data->getCE32FromSupplementary(). 114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * We do this so that we need not implement the collation algorithm 116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * again for the builder and make it behave exactly like the runtime code. 117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * That would be more difficult to test and maintain than this indirection. 118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, 120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * so the data accesses from those code paths need not be modified. 121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * This class iterates directly over whole code points 123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * so that the CollationIterator does not need the finished trie 124fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * for handling the LEAD_SURROGATE_TAG. 125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass DataBuilderCollationIterator : public CollationIterator { 127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius DataBuilderCollationIterator(CollationDataBuilder &b); 129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual ~DataBuilderCollationIterator(); 131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t fetchCEs(const UnicodeString &str, int32_t start, int64_t ces[], int32_t cesLength); 133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual void resetToOffset(int32_t newOffset); 135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual int32_t getOffset() const; 136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual UChar32 nextCodePoint(UErrorCode &errorCode); 138fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual UChar32 previousCodePoint(UErrorCode &errorCode); 139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprotected: 141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode); 142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode); 143fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 144fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual uint32_t getDataCE32(UChar32 c) const; 145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual uint32_t getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode); 146fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 147fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationDataBuilder &builder; 148fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationData builderData; 149fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; 150fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UnicodeString *s; 151fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t pos; 152fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 153fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 154fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::DataBuilderCollationIterator(CollationDataBuilder &b) 155fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : CollationIterator(&builderData, /*numeric=*/ FALSE), 156fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builder(b), builderData(b.nfcImpl), 157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s(NULL), pos(0) { 158fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builderData.base = builder.base; 159fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Set all of the jamoCE32s[] to indirection CE32s. 160fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. 161fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 jamo = CollationDataBuilder::jamoCpFromIndex(j); 162fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius jamoCE32s[j] = Collation::makeCE32FromTagAndIndex(Collation::BUILDER_DATA_TAG, jamo) | 163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationDataBuilder::IS_BUILDER_JAMO_CE32; 164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 165fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builderData.jamoCE32s = jamoCE32s; 166fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 167fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 168fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::~DataBuilderCollationIterator() {} 169fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 170fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::fetchCEs(const UnicodeString &str, int32_t start, 172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ces[], int32_t cesLength) { 173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Set the pointers each time, in case they changed due to reallocation. 174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builderData.ce32s = reinterpret_cast<const uint32_t *>(builder.ce32s.getBuffer()); 175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builderData.ces = builder.ce64s.getBuffer(); 176fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builderData.contexts = builder.contexts.getBuffer(); 177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Modified copy of CollationIterator::nextCE() and CollationIterator::nextCEFromCE32(). 178fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius reset(); 179fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = &str; 180fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = start; 181fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode errorCode = U_ZERO_ERROR; 182fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(U_SUCCESS(errorCode) && pos < s->length()) { 183fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No need to keep all CEs in the iterator buffer. 184fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius clearCEs(); 185fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = s->char32At(pos); 186fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos += U16_LENGTH(c); 187fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = utrie2_get32(builder.trie, c); 188fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const CollationData *d; 189fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 == Collation::FALLBACK_CE32) { 190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius d = builder.base; 191fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = builder.base->getCE32(c); 192fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 193fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius d = &builderData; 194fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 195fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius appendCEsFromCE32(d, c, ce32, /*forward=*/ TRUE, errorCode); 196fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(U_SUCCESS(errorCode)); 197fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < getCEsLength(); ++i) { 198fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce = getCE(i); 199fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce != 0) { 200fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength < Collation::MAX_EXPANSION_LENGTH) { 201fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ces[cesLength] = ce; 202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 203fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ++cesLength; 204fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 205fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 206fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 207fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return cesLength; 208fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 209fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 210fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 211fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::resetToOffset(int32_t newOffset) { 212fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius reset(); 213fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = newOffset; 214fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 215fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 216fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 217fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::getOffset() const { 218fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return pos; 219fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 220fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 221fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUChar32 222fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) { 223fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == s->length()) { 224fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SENTINEL; 225fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 226fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = s->char32At(pos); 227fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos += U16_LENGTH(c); 228fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 230fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 231fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUChar32 232fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) { 233fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(pos == 0) { 234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SENTINEL; 235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 236fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = s->char32At(pos - 1); 237fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos -= U16_LENGTH(c); 238fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return c; 239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 242fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = s->moveIndex32(pos, num); 244fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 245fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) { 248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pos = s->moveIndex32(pos, -num); 249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 250fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 251fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 252fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::getDataCE32(UChar32 c) const { 253fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return utrie2_get32(builder.trie, c); 254fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 256fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 257fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusDataBuilderCollationIterator::getCE32FromBuilderData(uint32_t ce32, UErrorCode &errorCode) { 258fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(Collation::hasCE32Tag(ce32, Collation::BUILDER_DATA_TAG)); 259fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((ce32 & CollationDataBuilder::IS_BUILDER_JAMO_CE32) != 0) { 260fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 jamo = Collation::indexFromCE32(ce32); 261fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return utrie2_get32(builder.trie, jamo); 262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *cond = builder.getConditionalCE32ForCE32(ce32); 264fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cond->builtCE32 == Collation::NO_CE32) { 265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Build the context-sensitive mappings into their runtime form and cache the result. 266fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->builtCE32 = builder.buildContext(cond, errorCode); 267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_ZERO_ERROR; 269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builder.clearContexts(); 270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->builtCE32 = builder.buildContext(cond, errorCode); 271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 272fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius builderData.contexts = builder.contexts.getBuffer(); 273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return cond->builtCE32; 275fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 276fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 277fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// ------------------------------------------------------------------------- *** 279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::CollationDataBuilder(UErrorCode &errorCode) 281fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), 282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius base(NULL), baseSettings(NULL), 283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius trie(NULL), 284fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32s(errorCode), ce64s(errorCode), conditionalCE32s(errorCode), 285fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modified(FALSE), 286fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fastLatinEnabled(FALSE), fastLatinBuilder(NULL), 287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius collIter(NULL) { 288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Reserve the first CE32 for U+0000. 289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32s.addElement(0, errorCode); 290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius conditionalCE32s.setDeleter(uprv_deleteConditionalCE32); 291fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::~CollationDataBuilder() { 294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_close(trie); 295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete fastLatinBuilder; 296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete collIter; 297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::initForTailoring(const CollationData *b, UErrorCode &errorCode) { 301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(trie != NULL) { 303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_INVALID_STATE_ERROR; 304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 305fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(b == NULL) { 307fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_ILLEGAL_ARGUMENT_ERROR; 308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 310fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius base = b; 311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // For a tailoring, the default is to fall back to the base. 313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius trie = utrie2_open(Collation::FALLBACK_CE32, Collation::FFFD_CE32, &errorCode); 314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Set the Latin-1 letters block so that it is allocated first in the data array, 316fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // to try to improve locality of reference when sorting Latin-1 text. 317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not use utrie2_setRange32() since that will not actually allocate blocks 318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // that are filled with the default value. 319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // ASCII (0..7F) is already preallocated anyway. 320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(UChar32 c = 0xc0; c <= 0xff; ++c) { 321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, c, Collation::FALLBACK_CE32, &errorCode); 322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 323fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Hangul syllables are not tailorable (except via tailoring Jamos). 325fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Always set the Hangul tag to help performance. 326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do this here, rather than in buildMappings(), 327fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // so that we see the HANGUL_TAG in various assertions. 328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); 329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode); 330fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Copy the set contents but don't copy/clone the set as a whole because 332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // that would copy the isFrozen state too. 333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius unsafeBackwardSet.addAll(*b->unsafeBackwardSet); 334fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 336fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 337fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 338fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 339fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::maybeSetPrimaryRange(UChar32 start, UChar32 end, 340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t primary, int32_t step, 341fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 342fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return FALSE; } 343fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(start <= end); 344fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // TODO: Do we need to check what values are currently set for start..end? 345fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // An offset range is worth it only if we can achieve an overlap between 346fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // adjacent UTrie2 blocks of 32 code points each. 347fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // An offset CE is also a little more expensive to look up and compute 348fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // than a simple CE. 349fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // If the range spans at least three UTrie2 block boundaries (> 64 code points), 350fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // then we take it. 351fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // If the range spans one or two block boundaries and there are 352fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // at least 4 code points on either side, then we take it. 353fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // (We could additionally require a minimum range length of, say, 16.) 354fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t blockDelta = (end >> 5) - (start >> 5); 355fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(2 <= step && step <= 0x7f && 356fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (blockDelta >= 3 || 357fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (blockDelta > 0 && (start & 0x1f) <= 0x1c && (end & 0x1f) >= 3))) { 358fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t dataCE = ((int64_t)primary << 32) | (start << 8) | step; 359fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isCompressiblePrimary(primary)) { dataCE |= 0x80; } 360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = addCE(dataCE, errorCode); 361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(index > Collation::MAX_INDEX) { 363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t offsetCE32 = Collation::makeCE32FromTagAndIndex(Collation::OFFSET_TAG, index); 367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_setRange32(trie, start, end, offsetCE32, TRUE, &errorCode); 368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modified = TRUE; 369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 372fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 373fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::setPrimaryRangeAndReturnNext(UChar32 start, UChar32 end, 377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t primary, int32_t step, 378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isCompressible = isCompressiblePrimary(primary); 381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(maybeSetPrimaryRange(start, end, primary, step, errorCode)) { 382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::incThreeBytePrimaryByOffset(primary, isCompressible, 383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (end - start + 1) * step); 384fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Short range: Set individual CE32s. 386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 387fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, start, Collation::makeLongPrimaryCE32(primary), &errorCode); 388fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ++start; 389fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius primary = Collation::incThreeBytePrimaryByOffset(primary, isCompressible, step); 390fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(start > end) { return primary; } 391fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 392fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modified = TRUE; 393fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 394fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 395fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 396fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 397fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::getCE32FromOffsetCE32(UBool fromBase, UChar32 c, uint32_t ce32) const { 398fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = Collation::indexFromCE32(ce32); 399fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t dataCE = fromBase ? base->ces[i] : ce64s.elementAti(i); 400fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE); 401fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::makeLongPrimaryCE32(p); 402fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 403fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 404fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 405fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::isCompressibleLeadByte(uint32_t b) const { 406fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return base->isCompressibleLeadByte(b); 407fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 408fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 409fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 410fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::isAssigned(UChar32 c) const { 411fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::isAssignedCE32(utrie2_get32(trie, c)); 412fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 413fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 414fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 415fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::getLongPrimaryIfSingleCE(UChar32 c) const { 416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = utrie2_get32(trie, c); 417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Collation::isLongPrimaryCE32(ce32)) { 418fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::primaryFromLongPrimaryCE32(ce32); 419fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 420fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 424fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint64_t 425fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::getSingleCE(UChar32 c, UErrorCode &errorCode) const { 426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 427f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius // Keep parallel with CollationData::getSingleCE(). 428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool fromBase = FALSE; 429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = utrie2_get32(trie, c); 430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 == Collation::FALLBACK_CE32) { 431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fromBase = TRUE; 432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = base->getCE32(c); 433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(Collation::isSpecialCE32(ce32)) { 435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius switch(Collation::tagFromCE32(ce32)) { 436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LATIN_EXPANSION_TAG: 437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::BUILDER_DATA_TAG: 438fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::PREFIX_TAG: 439fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::CONTRACTION_TAG: 440fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::HANGUL_TAG: 441fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LEAD_SURROGATE_TAG: 442fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 443fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 444fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::FALLBACK_TAG: 445fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::RESERVED_TAG_3: 446fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_INTERNAL_PROGRAM_ERROR; 447fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 448fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LONG_PRIMARY_TAG: 449fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::ceFromLongPrimaryCE32(ce32); 450fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LONG_SECONDARY_TAG: 451fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::ceFromLongSecondaryCE32(ce32); 452fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::EXPANSION32_TAG: 453fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Collation::lengthFromCE32(ce32) == 1) { 454fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = Collation::indexFromCE32(ce32); 455fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = fromBase ? base->ce32s[i] : ce32s.elementAti(i); 456fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 457fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 458fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 459fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 460fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 461fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::EXPANSION_TAG: { 462fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Collation::lengthFromCE32(ce32) == 1) { 463fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = Collation::indexFromCE32(ce32); 464fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return fromBase ? base->ces[i] : ce64s.elementAti(i); 465fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 466fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 467fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 468fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 469fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 470fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::DIGIT_TAG: 471fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Fetch the non-numeric-collation CE32 and continue. 472fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = ce32s.elementAti(Collation::indexFromCE32(ce32)); 473fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 474fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::U0000_TAG: 475fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(c == 0); 476fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Fetch the normal ce32 for U+0000 and continue. 477fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = fromBase ? base->ce32s[0] : ce32s.elementAti(0); 478fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 479fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::OFFSET_TAG: 480fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = getCE32FromOffsetCE32(fromBase, c, ce32); 481fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::IMPLICIT_TAG: 483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::unassignedCEFromCodePoint(c); 484fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 485fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 486fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::ceFromSimpleCE32(ce32); 487fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 488fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 489fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 490fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::addCE(int64_t ce, UErrorCode &errorCode) { 491fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t length = ce64s.size(); 492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < length; ++i) { 493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce == ce64s.elementAti(i)) { return i; } 494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce64s.addElement(ce, errorCode); 496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return length; 497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 499fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 500fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::addCE32(uint32_t ce32, UErrorCode &errorCode) { 501fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t length = ce32s.size(); 502fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < length; ++i) { 503fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 == (uint32_t)ce32s.elementAti(i)) { return i; } 504fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 505fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32s.addElement((int32_t)ce32, errorCode); 506fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return length; 507fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 508fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 509fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 510fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::addConditionalCE32(const UnicodeString &context, uint32_t ce32, 511fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 512fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return -1; } 513fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!context.isEmpty()); 514fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = conditionalCE32s.size(); 515fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(index > Collation::MAX_INDEX) { 516fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return -1; 518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 519fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *cond = new ConditionalCE32(context, ce32); 520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cond == NULL) { 521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_MEMORY_ALLOCATION_ERROR; 522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return -1; 523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 524fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius conditionalCE32s.addElement(cond, errorCode); 525fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 526fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 527fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 528fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 529fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::add(const UnicodeString &prefix, const UnicodeString &s, 530fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t ces[], int32_t cesLength, 531fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 532fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = encodeCEs(ces, cesLength, errorCode); 533fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius addCE32(prefix, s, ce32, errorCode); 534fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 535fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 536fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 537fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::addCE32(const UnicodeString &prefix, const UnicodeString &s, 538fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32, UErrorCode &errorCode) { 539fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 540fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(s.isEmpty()) { 541fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_ILLEGAL_ARGUMENT_ERROR; 542fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 543fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 544fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(trie == NULL || utrie2_isFrozen(trie)) { 545fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_INVALID_STATE_ERROR; 546fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 547fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 548fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = s.char32At(0); 549fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t cLength = U16_LENGTH(c); 550fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t oldCE32 = utrie2_get32(trie, c); 551fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool hasContext = !prefix.isEmpty() || s.length() > cLength; 552fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(oldCE32 == Collation::FALLBACK_CE32) { 553fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // First tailoring for c. 554fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // If c has contextual base mappings or if we add a contextual mapping, 555fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // then copy the base mappings. 556fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Otherwise we just override the base mapping. 557fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t baseCE32 = base->getFinalCE32(base->getCE32(c)); 558fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(hasContext || Collation::ce32HasContext(baseCE32)) { 559fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius oldCE32 = copyFromBaseCE32(c, baseCE32, TRUE, errorCode); 560fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, c, oldCE32, &errorCode); 561fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 562fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 563fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 564fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!hasContext) { 565fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No prefix, no contraction. 566fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!isBuilderContextCE32(oldCE32)) { 567fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, c, ce32, &errorCode); 568fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 569fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *cond = getConditionalCE32ForCE32(oldCE32); 570fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->builtCE32 = Collation::NO_CE32; 571fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->ce32 = ce32; 572fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 573fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 574fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *cond; 575fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!isBuilderContextCE32(oldCE32)) { 576fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Replace the simple oldCE32 with a builder context CE32 577fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // pointing to a new ConditionalCE32 list head. 578fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = addConditionalCE32(UnicodeString((UChar)0), oldCE32, errorCode); 579fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 580fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t contextCE32 = makeBuilderContextCE32(index); 581fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, c, contextCE32, &errorCode); 582fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius contextChars.add(c); 583fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = getConditionalCE32(index); 584fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 585fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = getConditionalCE32ForCE32(oldCE32); 586fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->builtCE32 = Collation::NO_CE32; 587fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 588fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString suffix(s, cLength); 589fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString context((UChar)prefix.length()); 590fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius context.append(prefix).append(suffix); 591fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius unsafeBackwardSet.addAll(suffix); 592fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 593fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // invariant: context > cond->context 594fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t next = cond->next; 595fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(next < 0) { 596fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Append a new ConditionalCE32 after cond. 597fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = addConditionalCE32(context, ce32, errorCode); 598fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 599fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->next = index; 600fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 601fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 602fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *nextCond = getConditionalCE32(next); 603fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int8_t cmp = context.compare(nextCond->context); 604fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cmp < 0) { 605fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert a new ConditionalCE32 between cond and nextCond. 606fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = addConditionalCE32(context, ce32, errorCode); 607fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 608fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->next = index; 609fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius getConditionalCE32(index)->next = next; 610fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 611fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(cmp == 0) { 612fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Same context as before, overwrite its ce32. 613fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nextCond->ce32 = ce32; 614fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 615fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 616fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = nextCond; 617fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 618fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 619fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modified = TRUE; 620fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 621fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 622fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 623fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::encodeOneCEAsCE32(int64_t ce) { 624fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p = (uint32_t)(ce >> 32); 625fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t lower32 = (uint32_t)ce; 626fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t t = (uint32_t)(ce & 0xffff); 627fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s. 628fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((ce & INT64_C(0xffff00ff00ff)) == 0) { 629fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // normal form ppppsstt 630fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return p | (lower32 >> 16) | (t >> 8); 631fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if((ce & INT64_C(0xffffffffff)) == Collation::COMMON_SEC_AND_TER_CE) { 632fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // long-primary form ppppppC1 633fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::makeLongPrimaryCE32(p); 634fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(p == 0 && (t & 0xff) == 0) { 635fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // long-secondary form ssssttC2 636fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::makeLongSecondaryCE32(lower32); 637fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 638fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::NO_CE32; 639fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 640fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 641fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 642fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::encodeOneCE(int64_t ce, UErrorCode &errorCode) { 643fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Try to encode one CE as one CE32. 644fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = encodeOneCEAsCE32(ce); 645fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 != Collation::NO_CE32) { return ce32; } 646fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = addCE(ce, errorCode); 647fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 648fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(index > Collation::MAX_INDEX) { 649fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 650fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 651fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 652fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, index, 1); 653fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 654fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 655fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 656fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, 657fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 658fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 659fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength < 0 || cesLength > Collation::MAX_EXPANSION_LENGTH) { 660fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_ILLEGAL_ARGUMENT_ERROR; 661fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 662fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 663fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(trie == NULL || utrie2_isFrozen(trie)) { 664fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_INVALID_STATE_ERROR; 665fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 666fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 667fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength == 0) { 668fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE. 669fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do this here so that callers need not do it. 670fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return encodeOneCEAsCE32(0); 671fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(cesLength == 1) { 672fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return encodeOneCE(ces[0], errorCode); 673fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(cesLength == 2) { 674fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Try to encode two CEs as one CE32. 675fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce0 = ces[0]; 676fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce1 = ces[1]; 677fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p0 = (uint32_t)(ce0 >> 32); 678fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((ce0 & INT64_C(0xffffffffff00ff)) == Collation::COMMON_SECONDARY_CE && 679fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (ce1 & INT64_C(0xffffffff00ffffff)) == Collation::COMMON_TERTIARY_CE && 680fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius p0 != 0) { 681fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Latin mini expansion 682fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 683fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius p0 | 684fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (((uint32_t)ce0 & 0xff00u) << 8) | 685fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (uint32_t)(ce1 >> 16) | 686fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius Collation::SPECIAL_CE32_LOW_BYTE | 687fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius Collation::LATIN_EXPANSION_TAG; 688fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 689fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 690fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Try to encode two or more CEs as CE32s. 691fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t newCE32s[Collation::MAX_EXPANSION_LENGTH]; 692fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0;; ++i) { 693fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(i == cesLength) { 694fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return encodeExpansion32(newCE32s, cesLength, errorCode); 695fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 696fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = encodeOneCEAsCE32(ces[i]); 697fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 == Collation::NO_CE32) { break; } 698fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newCE32s[i] = (int32_t)ce32; 699fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 700fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return encodeExpansion(ces, cesLength, errorCode); 701fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 702fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 703fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 704fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::encodeExpansion(const int64_t ces[], int32_t length, UErrorCode &errorCode) { 705fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 706fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // See if this sequence of CEs has already been stored. 707fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t first = ces[0]; 708fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t ce64sMax = ce64s.size() - length; 709fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i <= ce64sMax; ++i) { 710fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(first == ce64s.elementAti(i)) { 711fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(i > Collation::MAX_INDEX) { 712fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 713fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 714fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 715fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t j = 1;; ++j) { 716fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(j == length) { 717fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::makeCE32FromTagIndexAndLength( 718fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius Collation::EXPANSION_TAG, i, length); 719fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 720fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce64s.elementAti(i + j) != ces[j]) { break; } 721fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 722fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 723fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 724fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Store the new sequence. 725fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = ce64s.size(); 726fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(i > Collation::MAX_INDEX) { 727fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 728fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 729fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 730fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t j = 0; j < length; ++j) { 731fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce64s.addElement(ces[j], errorCode); 732fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 733fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION_TAG, i, length); 734fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 735fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 736fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 737fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::encodeExpansion32(const int32_t newCE32s[], int32_t length, 738fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 739fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 740fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // See if this sequence of CE32s has already been stored. 741fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t first = newCE32s[0]; 742fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t ce32sMax = ce32s.size() - length; 743fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i <= ce32sMax; ++i) { 744fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(first == ce32s.elementAti(i)) { 745fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(i > Collation::MAX_INDEX) { 746fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 747fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 748fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 749fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t j = 1;; ++j) { 750fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(j == length) { 751fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::makeCE32FromTagIndexAndLength( 752fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius Collation::EXPANSION32_TAG, i, length); 753fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 754fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32s.elementAti(i + j) != newCE32s[j]) { break; } 755fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 756fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 757fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 758fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Store the new sequence. 759fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = ce32s.size(); 760fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(i > Collation::MAX_INDEX) { 761fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 762fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 763fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 764fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t j = 0; j < length; ++j) { 765fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32s.addElement(newCE32s[j], errorCode); 766fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 767fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::makeCE32FromTagIndexAndLength(Collation::EXPANSION32_TAG, i, length); 768fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 769fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 770fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 771fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::copyFromBaseCE32(UChar32 c, uint32_t ce32, UBool withContext, 772fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 773fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 774fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!Collation::isSpecialCE32(ce32)) { return ce32; } 775fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius switch(Collation::tagFromCE32(ce32)) { 776fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LONG_PRIMARY_TAG: 777fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LONG_SECONDARY_TAG: 778fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LATIN_EXPANSION_TAG: 779fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // copy as is 780fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 781fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::EXPANSION32_TAG: { 782fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const uint32_t *baseCE32s = base->ce32s + Collation::indexFromCE32(ce32); 783fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t length = Collation::lengthFromCE32(ce32); 784fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = encodeExpansion32( 785fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius reinterpret_cast<const int32_t *>(baseCE32s), length, errorCode); 786fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 787fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 788fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::EXPANSION_TAG: { 789fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t *baseCEs = base->ces + Collation::indexFromCE32(ce32); 790fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t length = Collation::lengthFromCE32(ce32); 791fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = encodeExpansion(baseCEs, length, errorCode); 792fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 793fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 794fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::PREFIX_TAG: { 795fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Flatten prefixes and nested suffixes (contractions) 796fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // into a linear list of ConditionalCE32. 797fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UChar *p = base->contexts + Collation::indexFromCE32(ce32); 798fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = CollationData::readCE32(p); // Default if no prefix match. 799fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!withContext) { 800fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return copyFromBaseCE32(c, ce32, FALSE, errorCode); 801fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 802f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius ConditionalCE32 head; 803fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString context((UChar)0); 804fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index; 805fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Collation::isContractionCE32(ce32)) { 806fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode); 807fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 808fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); 809fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius head.next = index = addConditionalCE32(context, ce32, errorCode); 810fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 811fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 812fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *cond = getConditionalCE32(index); // the last ConditionalCE32 so far 813fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCharsTrie::Iterator prefixes(p + 2, 0, errorCode); 814fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(prefixes.next(errorCode)) { 815fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius context = prefixes.getString(); 816fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius context.reverse(); 817fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius context.insert(0, (UChar)context.length()); 818fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = (uint32_t)prefixes.getValue(); 819fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Collation::isContractionCE32(ce32)) { 820fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = copyContractionsFromBaseCE32(context, c, ce32, cond, errorCode); 821fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 822fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); 823fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->next = index = addConditionalCE32(context, ce32, errorCode); 824fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 825fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 826fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = getConditionalCE32(index); 827fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 828fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = makeBuilderContextCE32(head.next); 829fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius contextChars.add(c); 830fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 831fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 832fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::CONTRACTION_TAG: { 833fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!withContext) { 834fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UChar *p = base->contexts + Collation::indexFromCE32(ce32); 835fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = CollationData::readCE32(p); // Default if no suffix match. 836fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return copyFromBaseCE32(c, ce32, FALSE, errorCode); 837fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 838f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius ConditionalCE32 head; 839fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString context((UChar)0); 840fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius copyContractionsFromBaseCE32(context, c, ce32, &head, errorCode); 841fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = makeBuilderContextCE32(head.next); 842fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius contextChars.add(c); 843fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 844fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 845fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::HANGUL_TAG: 846fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; // We forbid tailoring of Hangul syllables. 847fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 848fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::OFFSET_TAG: 849fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = getCE32FromOffsetCE32(TRUE, c, ce32); 850fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 851fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::IMPLICIT_TAG: 852fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = encodeOneCE(Collation::unassignedCEFromCodePoint(c), errorCode); 853fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 854fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius default: 855fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(FALSE); // require ce32 == base->getFinalCE32(ce32) 856fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 857fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 858fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32; 859fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 860fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 861fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 862fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::copyContractionsFromBaseCE32(UnicodeString &context, UChar32 c, uint32_t ce32, 863fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *cond, UErrorCode &errorCode) { 864fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 865fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UChar *p = base->contexts + Collation::indexFromCE32(ce32); 866fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index; 867fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 868fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No match on the single code point. 869fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We are underneath a prefix, and the default mapping is just 870fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // a fallback to the mappings for a shorter prefix. 871fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(context.length() > 1); 872fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = -1; 873fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 874fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = CollationData::readCE32(p); // Default if no suffix match. 875fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!Collation::isContractionCE32(ce32)); 876fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); 877fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->next = index = addConditionalCE32(context, ce32, errorCode); 878fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 879fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = getConditionalCE32(index); 880fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 881fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 882fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t suffixStart = context.length(); 883fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); 884fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(suffixes.next(errorCode)) { 885fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius context.append(suffixes.getString()); 886fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = copyFromBaseCE32(c, (uint32_t)suffixes.getValue(), TRUE, errorCode); 887fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->next = index = addConditionalCE32(context, ce32, errorCode); 888fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 889fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No need to update the unsafeBackwardSet because the tailoring set 890fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // is already a copy of the base set. 891fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = getConditionalCE32(index); 892fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius context.truncate(suffixStart); 893fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 894fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(index >= 0); 895fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 896fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 897fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 898fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass CopyHelper { 899fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 900fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CopyHelper(const CollationDataBuilder &s, CollationDataBuilder &d, 901fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const CollationDataBuilder::CEModifier &m, UErrorCode &initialErrorCode) 902fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : src(s), dest(d), modifier(m), 903fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode(initialErrorCode) {} 904fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 905fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool copyRangeCE32(UChar32 start, UChar32 end, uint32_t ce32) { 906fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = copyCE32(ce32); 907fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_setRange32(dest.trie, start, end, ce32, TRUE, &errorCode); 908fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(CollationDataBuilder::isBuilderContextCE32(ce32)) { 909fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dest.contextChars.add(start, end); 910fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 911fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SUCCESS(errorCode); 912fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 913fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 914fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t copyCE32(uint32_t ce32) { 915fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!Collation::isSpecialCE32(ce32)) { 916fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce = modifier.modifyCE32(ce32); 917fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce != Collation::NO_CE) { 918fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = dest.encodeOneCE(ce, errorCode); 919fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 920fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 921fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t tag = Collation::tagFromCE32(ce32); 922fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(tag == Collation::EXPANSION32_TAG) { 923fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const uint32_t *srcCE32s = reinterpret_cast<uint32_t *>(src.ce32s.getBuffer()); 924fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius srcCE32s += Collation::indexFromCE32(ce32); 925fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t length = Collation::lengthFromCE32(ce32); 926fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Inspect the source CE32s. Just copy them if none are modified. 927fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Otherwise copy to modifiedCEs, with modifications. 928fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isModified = FALSE; 929fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < length; ++i) { 930fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = srcCE32s[i]; 931fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce; 932fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Collation::isSpecialCE32(ce32) || 933fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (ce = modifier.modifyCE32(ce32)) == Collation::NO_CE) { 934fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isModified) { 935fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modifiedCEs[i] = Collation::ceFromCE32(ce32); 936fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 937fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 938fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!isModified) { 939fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t j = 0; j < i; ++j) { 940fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modifiedCEs[j] = Collation::ceFromCE32(srcCE32s[j]); 941fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 942fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isModified = TRUE; 943fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 944fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modifiedCEs[i] = ce; 945fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 946fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 947fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isModified) { 948fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); 949fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 950fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = dest.encodeExpansion32( 951fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius reinterpret_cast<const int32_t *>(srcCE32s), length, errorCode); 952fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 953fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(tag == Collation::EXPANSION_TAG) { 954fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t *srcCEs = src.ce64s.getBuffer(); 955fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius srcCEs += Collation::indexFromCE32(ce32); 956fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t length = Collation::lengthFromCE32(ce32); 957fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Inspect the source CEs. Just copy them if none are modified. 958fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Otherwise copy to modifiedCEs, with modifications. 959fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isModified = FALSE; 960fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < length; ++i) { 961fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t srcCE = srcCEs[i]; 962fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce = modifier.modifyCE(srcCE); 963fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce == Collation::NO_CE) { 964fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isModified) { 965fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modifiedCEs[i] = srcCE; 966fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 967fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 968fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!isModified) { 969fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t j = 0; j < i; ++j) { 970fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modifiedCEs[j] = srcCEs[j]; 971fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 972fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isModified = TRUE; 973fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 974fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modifiedCEs[i] = ce; 975fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 976fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 977fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isModified) { 978fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = dest.encodeCEs(modifiedCEs, length, errorCode); 979fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 980fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = dest.encodeExpansion(srcCEs, length, errorCode); 981fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 982fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(tag == Collation::BUILDER_DATA_TAG) { 983fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Copy the list of ConditionalCE32. 984fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *cond = src.getConditionalCE32ForCE32(ce32); 985fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!cond->hasContext()); 986fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t destIndex = dest.addConditionalCE32( 987fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->context, copyCE32(cond->ce32), errorCode); 988fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = CollationDataBuilder::makeBuilderContextCE32(destIndex); 989fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(cond->next >= 0) { 990fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = src.getConditionalCE32(cond->next); 991fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *prevDestCond = dest.getConditionalCE32(destIndex); 992fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius destIndex = dest.addConditionalCE32( 993fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond->context, copyCE32(cond->ce32), errorCode); 994fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t suffixStart = cond->prefixLength() + 1; 995fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dest.unsafeBackwardSet.addAll(cond->context.tempSubString(suffixStart)); 996fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius prevDestCond->next = destIndex; 997fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 998fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 999fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Just copy long CEs and Latin mini expansions (and other expected values) as is, 1000fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // assuming that the modifier would not modify them. 1001fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(tag == Collation::LONG_PRIMARY_TAG || 1002fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tag == Collation::LONG_SECONDARY_TAG || 1003fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tag == Collation::LATIN_EXPANSION_TAG || 1004fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tag == Collation::HANGUL_TAG); 1005fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1006fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1007fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32; 1008fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1009fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1010fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const CollationDataBuilder &src; 1011fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationDataBuilder &dest; 1012fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const CollationDataBuilder::CEModifier &modifier; 1013fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t modifiedCEs[Collation::MAX_EXPANSION_LENGTH]; 1014fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode errorCode; 1015fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 1016fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1017fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CDECL_BEGIN 1018fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1019fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic UBool U_CALLCONV 1020fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusenumRangeForCopy(const void *context, UChar32 start, UChar32 end, uint32_t value) { 1021fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 1022fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius value == Collation::UNASSIGNED_CE32 || value == Collation::FALLBACK_CE32 || 1023fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ((CopyHelper *)context)->copyRangeCE32(start, end, value); 1024fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1025fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1026fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CDECL_END 1027fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1028fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1029fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::copyFrom(const CollationDataBuilder &src, const CEModifier &modifier, 1030fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1031fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1032fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(trie == NULL || utrie2_isFrozen(trie)) { 1033fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_INVALID_STATE_ERROR; 1034fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1035fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1036fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CopyHelper helper(src, *this, modifier, errorCode); 1037fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_enum(src.trie, NULL, enumRangeForCopy, &helper); 1038fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = helper.errorCode; 1039fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Update the contextChars and the unsafeBackwardSet while copying, 1040fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // in case a character had conditional mappings in the source builder 1041fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and they were removed later. 1042fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modified |= src.modified; 1043fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1044fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1045fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1046fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::optimize(const UnicodeSet &set, UErrorCode &errorCode) { 1047fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode) || set.isEmpty()) { return; } 1048fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSetIterator iter(set); 1049fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(iter.next() && !iter.isString()) { 1050fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = iter.getCodepoint(); 1051fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = utrie2_get32(trie, c); 1052fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 == Collation::FALLBACK_CE32) { 1053fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = base->getFinalCE32(base->getCE32(c)); 1054fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = copyFromBaseCE32(c, ce32, TRUE, errorCode); 1055fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, c, ce32, &errorCode); 1056fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1057fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1058fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modified = TRUE; 1059fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1060fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1061fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1062fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::suppressContractions(const UnicodeSet &set, UErrorCode &errorCode) { 1063fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode) || set.isEmpty()) { return; } 1064fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSetIterator iter(set); 1065fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(iter.next() && !iter.isString()) { 1066fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = iter.getCodepoint(); 1067fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = utrie2_get32(trie, c); 1068fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 == Collation::FALLBACK_CE32) { 1069fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = base->getFinalCE32(base->getCE32(c)); 1070fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Collation::ce32HasContext(ce32)) { 1071fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = copyFromBaseCE32(c, ce32, FALSE /* without context */, errorCode); 1072fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, c, ce32, &errorCode); 1073fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1074fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(isBuilderContextCE32(ce32)) { 1075fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = getConditionalCE32ForCE32(ce32)->ce32; 1076fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Simply abandon the list of ConditionalCE32. 1077fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The caller will copy this builder in the end, 1078fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // eliminating unreachable data. 1079fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, c, ce32, &errorCode); 1080fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius contextChars.remove(c); 1081fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1082fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1083fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius modified = TRUE; 1084fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1085fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1086fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1087fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::getJamoCE32s(uint32_t jamoCE32s[], UErrorCode &errorCode) { 1088fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return FALSE; } 1089fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool anyJamoAssigned = base == NULL; // always set jamoCE32s in the base data 1090fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool needToCopyFromBase = FALSE; 1091fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. 1092fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 jamo = jamoCpFromIndex(j); 1093fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool fromBase = FALSE; 1094fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = utrie2_get32(trie, jamo); 1095fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius anyJamoAssigned |= Collation::isAssignedCE32(ce32); 1096fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned. 1097fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.) 1098fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 == Collation::FALLBACK_CE32) { 1099fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fromBase = TRUE; 1100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = base->getCE32(jamo); 1101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Collation::isSpecialCE32(ce32)) { 1103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius switch(Collation::tagFromCE32(ce32)) { 1104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LONG_PRIMARY_TAG: 1105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LONG_SECONDARY_TAG: 1106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LATIN_EXPANSION_TAG: 1107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Copy the ce32 as-is. 1108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 1109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::EXPANSION32_TAG: 1110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::EXPANSION_TAG: 1111fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::PREFIX_TAG: 1112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::CONTRACTION_TAG: 1113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fromBase) { 1114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Defer copying until we know if anyJamoAssigned. 1115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = Collation::FALLBACK_CE32; 1116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius needToCopyFromBase = TRUE; 1117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 1119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::IMPLICIT_TAG: 1120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // An unassigned Jamo should only occur in tests with incomplete bases. 1121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(fromBase); 1122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = Collation::FALLBACK_CE32; 1123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius needToCopyFromBase = TRUE; 1124fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 1125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::OFFSET_TAG: 1126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32); 1127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 1128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::FALLBACK_TAG: 1129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::RESERVED_TAG_3: 1130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::BUILDER_DATA_TAG: 1131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::DIGIT_TAG: 1132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::U0000_TAG: 1133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::HANGUL_TAG: 1134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case Collation::LEAD_SURROGATE_TAG: 1135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_INTERNAL_PROGRAM_ERROR; 1136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1138fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius jamoCE32s[j] = ce32; 1140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(anyJamoAssigned && needToCopyFromBase) { 1142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t j = 0; j < CollationData::JAMO_CE32S_LENGTH; ++j) { 1143fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(jamoCE32s[j] == Collation::FALLBACK_CE32) { 1144fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 jamo = jamoCpFromIndex(j); 1145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius jamoCE32s[j] = copyFromBaseCE32(jamo, base->getCE32(jamo), 1146fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /*withContext=*/ TRUE, errorCode); 1147fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1148fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1149fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1150fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return anyJamoAssigned && U_SUCCESS(errorCode); 1151fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1152fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1153fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1154fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::setDigitTags(UErrorCode &errorCode) { 1155fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet digits(UNICODE_STRING_SIMPLE("[:Nd:]"), errorCode); 1156fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSetIterator iter(digits); 1158fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(iter.next()) { 1159fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!iter.isString()); 1160fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = iter.getCodepoint(); 1161fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = utrie2_get32(trie, c); 1162fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 != Collation::FALLBACK_CE32 && ce32 != Collation::UNASSIGNED_CE32) { 1163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = addCE32(ce32, errorCode); 1164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1165fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(index > Collation::MAX_INDEX) { 1166fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 1167fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1168fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1169fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = Collation::makeCE32FromTagIndexAndLength( 1170fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius Collation::DIGIT_TAG, index, u_charDigitValue(c)); 1171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, c, ce32, &errorCode); 1172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1176fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CDECL_BEGIN 1177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1178fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic UBool U_CALLCONV 1179fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusenumRangeLeadValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) { 1180fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t *pValue = (int32_t *)context; 1181fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(value == Collation::UNASSIGNED_CE32) { 1182fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius value = Collation::LEAD_ALL_UNASSIGNED; 1183fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(value == Collation::FALLBACK_CE32) { 1184fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius value = Collation::LEAD_ALL_FALLBACK; 1185fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1186fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *pValue = Collation::LEAD_MIXED; 1187fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1188fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1189fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(*pValue < 0) { 1190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *pValue = (int32_t)value; 1191fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(*pValue != (int32_t)value) { 1192fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *pValue = Collation::LEAD_MIXED; 1193fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1194fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1195fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 1196fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1197fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1198fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CDECL_END 1199fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1200fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1201fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::setLeadSurrogates(UErrorCode &errorCode) { 1202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(UChar lead = 0xd800; lead < 0xdc00; ++lead) { 1203fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t value = -1; 1204fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_enumForLeadSurrogate(trie, lead, NULL, enumRangeLeadValue, &value); 1205fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32ForLeadSurrogateCodeUnit( 1206fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius trie, lead, 1207fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius Collation::makeCE32FromTagAndIndex(Collation::LEAD_SURROGATE_TAG, 0) | (uint32_t)value, 1208fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius &errorCode); 1209fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1210fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1211fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1212fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1213fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::build(CollationData &data, UErrorCode &errorCode) { 1214fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius buildMappings(data, errorCode); 1215fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(base != NULL) { 1216fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.numericPrimary = base->numericPrimary; 1217fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.compressibleBytes = base->compressibleBytes; 12181b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert data.numScripts = base->numScripts; 12191b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert data.scriptsIndex = base->scriptsIndex; 12201b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert data.scriptStarts = base->scriptStarts; 12211b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert data.scriptStartsLength = base->scriptStartsLength; 1222fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1223fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius buildFastLatinTable(data, errorCode); 1224fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1225fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1226fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1227fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::buildMappings(CollationData &data, UErrorCode &errorCode) { 1228fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(trie == NULL || utrie2_isFrozen(trie)) { 1230fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_INVALID_STATE_ERROR; 1231fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1232fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1233fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius buildContexts(errorCode); 1235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1236fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t jamoCE32s[CollationData::JAMO_CE32S_LENGTH]; 1237fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t jamoIndex = -1; 1238fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(getJamoCE32s(jamoCE32s, errorCode)) { 1239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius jamoIndex = ce32s.size(); 1240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < CollationData::JAMO_CE32S_LENGTH; ++i) { 1241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32s.addElement((int32_t)jamoCE32s[i], errorCode); 1242fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Small optimization: Use a bit in the Hangul ce32 1244fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // to indicate that none of the Jamo CE32s are isSpecialCE32() 1245fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // (as it should be in the root collator). 1246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. 1247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // In order to still have good trie compression and keep this code simple, 1248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // we only set this flag if a whole block of 588 Hangul syllables starting with 1249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // a common leading consonant (Jamo L) has this property. 1250fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isAnyJamoVTSpecial = FALSE; 1251fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = Hangul::JAMO_L_COUNT; i < CollationData::JAMO_CE32S_LENGTH; ++i) { 1252fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Collation::isSpecialCE32(jamoCE32s[i])) { 1253fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isAnyJamoVTSpecial = TRUE; 1254fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 1255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1256fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1257fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); 1258fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = Hangul::HANGUL_BASE; 1259fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < Hangul::JAMO_L_COUNT; ++i) { // iterate over the Jamo L 1260fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = hangulCE32; 1261fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!isAnyJamoVTSpecial && !Collation::isSpecialCE32(jamoCE32s[i])) { 1262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 |= Collation::HANGUL_NO_SPECIAL_JAMO; 1263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1264fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 limit = c + Hangul::JAMO_VT_COUNT; 1265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode); 1266fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = limit; 1267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Copy the Hangul CE32s from the base in blocks per Jamo L, 1270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. 1271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(UChar32 c = Hangul::HANGUL_BASE; c < Hangul::HANGUL_LIMIT;) { 1272fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = base->getCE32(c); 1273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(Collation::hasCE32Tag(ce32, Collation::HANGUL_TAG)); 1274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 limit = c + Hangul::JAMO_VT_COUNT; 1275fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_setRange32(trie, c, limit - 1, ce32, TRUE, &errorCode); 1276fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = limit; 1277fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius setDigitTags(errorCode); 1281fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius setLeadSurrogates(errorCode); 1282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. 1284fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32s.setElementAt((int32_t)utrie2_get32(trie, 0), 0); 1285fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, 0, Collation::makeCE32FromTagAndIndex(Collation::U0000_TAG, 0), &errorCode); 1286fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_freeze(trie, UTRIE2_32_VALUE_BITS, &errorCode); 1288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Mark each lead surrogate as "unsafe" 1291fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // if any of its 1024 associated supplementary code points is "unsafe". 1292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = 0x10000; 1293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { 1294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) { 1295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius unsafeBackwardSet.add(lead); 1296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius unsafeBackwardSet.freeze(); 1299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.trie = trie; 1301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.ce32s = reinterpret_cast<const uint32_t *>(ce32s.getBuffer()); 1302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.ces = ce64s.getBuffer(); 1303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.contexts = contexts.getBuffer(); 1304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1305fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.ce32sLength = ce32s.size(); 1306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.cesLength = ce64s.size(); 1307fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.contextsLength = contexts.length(); 1308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.base = base; 1310fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(jamoIndex >= 0) { 1311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.jamoCE32s = data.ce32s + jamoIndex; 1312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.jamoCE32s = base->jamoCE32s; 1314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.unsafeBackwardSet = &unsafeBackwardSet; 1316fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::clearContexts() { 1320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius contexts.remove(); 1321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSetIterator iter(contextChars); 1322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(iter.next()) { 1323fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!iter.isString()); 1324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = utrie2_get32(trie, iter.getCodepoint()); 1325fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isBuilderContextCE32(ce32)); 1326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius getConditionalCE32ForCE32(ce32)->builtCE32 = Collation::NO_CE32; 1327fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1330fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::buildContexts(UErrorCode &errorCode) { 1332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Ignore abandoned lists and the cached builtCE32, 1334fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and build all contexts from scratch. 1335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius contexts.remove(); 1336fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSetIterator iter(contextChars); 1337fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(U_SUCCESS(errorCode) && iter.next()) { 1338fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!iter.isString()); 1339fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = iter.getCodepoint(); 1340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = utrie2_get32(trie, c); 1341fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!isBuilderContextCE32(ce32)) { 1342fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Impossible: No context data for c in contextChars. 1343fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_INTERNAL_PROGRAM_ERROR; 1344fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1345fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1346fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *cond = getConditionalCE32ForCE32(ce32); 1347fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = buildContext(cond, errorCode); 1348fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius utrie2_set32(trie, c, ce32, &errorCode); 1349fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1350fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1351fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1352fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 1353fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::buildContext(ConditionalCE32 *head, UErrorCode &errorCode) { 1354fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 1355fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The list head must have no context. 1356fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!head->hasContext()); 1357fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The list head must be followed by one or more nodes that all do have context. 1358fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(head->next >= 0); 1359fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCharsTrieBuilder prefixBuilder(errorCode); 1360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCharsTrieBuilder contractionBuilder(errorCode); 1361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(ConditionalCE32 *cond = head;; cond = getConditionalCE32(cond->next)) { 1362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // After the list head, the prefix or suffix can be empty, but not both. 1363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(cond == head || cond->hasContext()); 1364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t prefixLength = cond->prefixLength(); 1365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString prefix(cond->context, 0, prefixLength + 1); 1366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Collect all contraction suffixes for one prefix. 1367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *firstCond = cond; 1368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ConditionalCE32 *lastCond = cond; 1369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(cond->next >= 0 && 1370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (cond = getConditionalCE32(cond->next))->context.startsWith(prefix)) { 1371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius lastCond = cond; 1372fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1373fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32; 1374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t suffixStart = prefixLength + 1; // == prefix.length() 1375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(lastCond->context.length() == suffixStart) { 1376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // One prefix without contraction suffix. 1377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(firstCond == lastCond); 1378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = lastCond->ce32; 1379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = lastCond; 1380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Build the contractions trie. 1382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius contractionBuilder.clear(); 1383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Entry for an empty suffix, to be stored before the trie. 13841b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert uint32_t emptySuffixCE32 = 0; 1385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t flags = 0; 1386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(firstCond->context.length() == suffixStart) { 1387fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // There is a mapping for the prefix and the single character c. (p|c) 1388fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // If no other suffix matches, then we return this value. 1389fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius emptySuffixCE32 = firstCond->ce32; 1390fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = getConditionalCE32(firstCond->next); 1391fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1392fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // There is no mapping for the prefix and just the single character. 1393fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // (There is no p|c, only p|cd, p|ce etc.) 1394fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius flags |= Collation::CONTRACT_SINGLE_CP_NO_MATCH; 1395fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // When the prefix matches but none of the prefix-specific suffixes, 1396fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // then we fall back to the mappings with the next-longest prefix, 1397fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and ultimately to mappings with no prefix. 1398fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Each fallback might be another set of contractions. 1399fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c, 1400fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // then in text "pch" we find the ch contraction. 1401fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(cond = head;; cond = getConditionalCE32(cond->next)) { 1402fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t length = cond->prefixLength(); 1403fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(length == prefixLength) { break; } 1404fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cond->defaultCE32 != Collation::NO_CE32 && 1405fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (length==0 || prefix.endsWith(cond->context, 1, length))) { 1406fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius emptySuffixCE32 = cond->defaultCE32; 1407fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1408fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1409fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = firstCond; 1410fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1411fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Optimization: Set a flag when 1412fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // the first character of every contraction suffix has lccc!=0. 1413fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Short-circuits contraction matching when a normal letter follows. 1414fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius flags |= Collation::CONTRACT_NEXT_CCC; 1415fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Add all of the non-empty suffixes into the contraction trie. 1416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString suffix(cond->context, suffixStart); 1418fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint16_t fcd16 = nfcImpl.getFCD16(suffix.char32At(0)); 1419fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fcd16 <= 0xff) { 1420fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius flags &= ~Collation::CONTRACT_NEXT_CCC; 1421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fcd16 = nfcImpl.getFCD16(suffix.char32At(suffix.length() - 1)); 1423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fcd16 > 0xff) { 1424fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The last suffix character has lccc!=0, allowing for discontiguous contractions. 1425fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius flags |= Collation::CONTRACT_TRAILING_CCC; 1426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1427fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius contractionBuilder.add(suffix, (int32_t)cond->ce32, errorCode); 1428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cond == lastCond) { break; } 1429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cond = getConditionalCE32(cond->next); 1430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = addContextTrie(emptySuffixCE32, contractionBuilder, errorCode); 1432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 1433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(index > Collation::MAX_INDEX) { 1434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 1435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 1436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = Collation::makeCE32FromTagAndIndex(Collation::CONTRACTION_TAG, index) | flags; 1438fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1439fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(cond == lastCond); 1440fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius firstCond->defaultCE32 = ce32; 1441fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(prefixLength == 0) { 1442fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cond->next < 0) { 1443fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No non-empty prefixes, only contractions. 1444fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32; 1445fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1446fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1447fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius prefix.remove(0, 1); // Remove the length unit. 1448fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius prefix.reverse(); 1449fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius prefixBuilder.add(prefix, (int32_t)ce32, errorCode); 1450fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cond->next < 0) { break; } 1451fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1452fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1453fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(head->defaultCE32 != Collation::NO_CE32); 1454fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = addContextTrie(head->defaultCE32, prefixBuilder, errorCode); 1455fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 1456fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(index > Collation::MAX_INDEX) { 1457fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 1458fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 1459fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1460fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::makeCE32FromTagAndIndex(Collation::PREFIX_TAG, index); 1461fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1462fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1463fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 1464fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::addContextTrie(uint32_t defaultCE32, UCharsTrieBuilder &trieBuilder, 1465fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1466fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString context; 1467fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius context.append((UChar)(defaultCE32 >> 16)).append((UChar)defaultCE32); 1468fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString trieString; 1469fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius context.append(trieBuilder.buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieString, errorCode)); 1470fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return -1; } 1471fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = contexts.indexOf(context); 1472fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(index < 0) { 1473fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = contexts.length(); 1474fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius contexts.append(context); 1475fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1476fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 1477fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1478fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1479fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1480fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::buildFastLatinTable(CollationData &data, UErrorCode &errorCode) { 1481fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode) || !fastLatinEnabled) { return; } 1482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete fastLatinBuilder; 1484fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fastLatinBuilder = new CollationFastLatinBuilder(errorCode); 1485fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fastLatinBuilder == NULL) { 1486fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_MEMORY_ALLOCATION_ERROR; 1487fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1488fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1489fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fastLatinBuilder->forData(data, errorCode)) { 1490fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const uint16_t *table = fastLatinBuilder->getTable(); 1491fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t length = fastLatinBuilder->lengthOfTable(); 1492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(base != NULL && length == base->fastLatinTableLength && 1493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uprv_memcmp(table, base->fastLatinTable, length * 2) == 0) { 1494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Same fast Latin table as in the base, use that one instead. 1495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete fastLatinBuilder; 1496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fastLatinBuilder = NULL; 1497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius table = base->fastLatinTable; 1498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1499fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.fastLatinTable = table; 1500fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius data.fastLatinTableLength = length; 1501fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1502fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete fastLatinBuilder; 1503fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fastLatinBuilder = NULL; 1504fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1505fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1506fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1507fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 1508fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::getCEs(const UnicodeString &s, int64_t ces[], int32_t cesLength) { 1509fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return getCEs(s, 0, ces, cesLength); 1510fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1511fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1512fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 1513fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::getCEs(const UnicodeString &prefix, const UnicodeString &s, 1514fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ces[], int32_t cesLength) { 1515fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t prefixLength = prefix.length(); 1516fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(prefixLength == 0) { 1517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return getCEs(s, 0, ces, cesLength); 1518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1519fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return getCEs(prefix + s, prefixLength, ces, cesLength); 1520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 1524fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationDataBuilder::getCEs(const UnicodeString &s, int32_t start, 1525fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ces[], int32_t cesLength) { 1526fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(collIter == NULL) { 1527fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius collIter = new DataBuilderCollationIterator(*this); 1528fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(collIter == NULL) { return 0; } 1529fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1530fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return collIter->fetchCEs(s, start, ces, cesLength); 1531fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1532fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1533fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_END 1534fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1535fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif // !UCONFIG_NO_COLLATION 1536