1fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/* 2fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 3fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Copyright (C) 2013-2014, International Business Machines 4fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Corporation and others. All Rights Reserved. 5fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 6fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* collationbuilder.cpp 7fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* 8fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* (replaced the former ucol_bld.cpp) 9fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* 10fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created on: 2013may06 11fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created by: Markus W. Scherer 12fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*/ 13fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 14fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 15fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include <stdio.h> 16fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 17fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 18fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utypes.h" 19fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 20fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#if !UCONFIG_NO_COLLATION 21fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 22fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/caniter.h" 23fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/normalizer2.h" 24fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/tblcoll.h" 25fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/parseerr.h" 26fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/uchar.h" 27fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/ucol.h" 28fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/unistr.h" 29fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/usetiter.h" 30fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utf16.h" 31fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/uversion.h" 32fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "cmemory.h" 33fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collation.h" 34fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationbuilder.h" 35fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationdata.h" 36fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationdatabuilder.h" 37fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationfastlatin.h" 38fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationroot.h" 39fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationrootelements.h" 40fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationruleparser.h" 41fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationsettings.h" 42fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationtailoring.h" 43fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationweights.h" 44fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "normalizer2impl.h" 45fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "uassert.h" 46fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "ucol_imp.h" 47fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "utf16collationiterator.h" 48fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 49fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 50fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 51fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_BEGIN 52fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 53fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusnamespace { 54fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 55fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass BundleImporter : public CollationRuleParser::Importer { 56fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 57fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius BundleImporter() : rules(NULL) {} 58fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual ~BundleImporter(); 59fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual const UnicodeString *getRules( 60fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *localeID, const char *collationType, 61fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *&errorReason, UErrorCode &errorCode); 62fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 63fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprivate: 64fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString *rules; 65fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 66fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 67fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusBundleImporter::~BundleImporter() { 68fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete rules; 69fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 70fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 71fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusconst UnicodeString * 72fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusBundleImporter::getRules( 73fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *localeID, const char *collationType, 74fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *& /*errorReason*/, UErrorCode &errorCode) { 75fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete rules; 76fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return rules = CollationLoader::loadRules(localeID, collationType, errorCode); 77fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 78fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 79fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} // namespace 80fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 81fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// RuleBasedCollator implementation ---------------------------------------- *** 82fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 83fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// These methods are here, rather than in rulebasedcollator.cpp, 84fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// for modularization: 85fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// Most code using Collator does not need to build a Collator from rules. 86fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// By moving these constructors and helper methods to a separate file, 87fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// most code will not have a static dependency on the builder code. 88fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 89fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator() 90fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 91fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 92fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 93fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 94fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 95fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 96fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 97fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 98fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode) 99fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode); 106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength, 109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) 110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 111fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode); 117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UColAttributeValue decompositionMode, 121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) 122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 124fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode); 129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ECollationStrength strength, 133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UColAttributeValue decompositionMode, 134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) 135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 138fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode); 142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 143fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 144fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UParseError &parseError, UnicodeString &reason, 146fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) 147fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 148fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 149fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 150fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 151fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 152fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 153fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode); 154fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 155fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 156fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::internalBuildTailoring(const UnicodeString &rules, 158fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t strength, 159fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UColAttributeValue decompositionMode, 160fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UParseError *outParseError, UnicodeString *outReason, 161fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 162fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const CollationTailoring *base = CollationRoot::getRoot(errorCode); 163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(outReason != NULL) { outReason->remove(); } 165fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationBuilder builder(base, errorCode); 166fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UVersionInfo noVersion = { 0, 0, 0, 0 }; 167fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius BundleImporter importer; 168fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion, 169fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius &importer, 170fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius outParseError, errorCode)); 171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *reason = builder.getErrorReason(); 173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(reason != NULL && outReason != NULL) { 174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *outReason = UnicodeString(reason, -1, US_INV); 175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 176fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 178fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius t->actualLocale.setToBogus(); 179fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius adoptTailoring(t.orphan()); 180dbc22bd174be483711cea006f3189d8289835830ccornelius // Set attributes after building the collator, 181dbc22bd174be483711cea006f3189d8289835830ccornelius // to keep the default settings consistent with the rule string. 182dbc22bd174be483711cea006f3189d8289835830ccornelius if(strength != UCOL_DEFAULT) { 183dbc22bd174be483711cea006f3189d8289835830ccornelius setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode); 184dbc22bd174be483711cea006f3189d8289835830ccornelius } 185dbc22bd174be483711cea006f3189d8289835830ccornelius if(decompositionMode != UCOL_DEFAULT) { 186dbc22bd174be483711cea006f3189d8289835830ccornelius setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode); 187dbc22bd174be483711cea006f3189d8289835830ccornelius } 188fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 189fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// CollationBuilder implementation ----------------------------------------- *** 191fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 192fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode) 193fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : nfd(*Normalizer2::getNFDInstance(errorCode)), 194fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fcd(*Normalizer2Factory::getFCDInstance(errorCode)), 195fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), 196fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius base(b), 197fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius baseData(b->data), 198fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius rootElements(b->data->rootElements, b->data->rootElementsLength), 199fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius variableTop(0), 200fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE), 201fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason(NULL), 202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength(0), 203fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius rootPrimaryIndexes(errorCode), nodes(errorCode) { 204fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfcImpl.ensureCanonIterData(errorCode); 205fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 206fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "CollationBuilder fields initialization failed"; 207fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 208fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 209fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(dataBuilder == NULL) { 210fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_MEMORY_ALLOCATION_ERROR; 211fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 212fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 213fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder->initForTailoring(baseData, errorCode); 214fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 215fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "CollationBuilder initialization failed"; 216fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 217fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 218fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 219fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::~CollationBuilder() { 220fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete dataBuilder; 221fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 222fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 223fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationTailoring * 224fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::parseAndBuild(const UnicodeString &ruleString, 225fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UVersionInfo rulesVersion, 226fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationRuleParser::Importer *importer, 227fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UParseError *outParseError, 228fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return NULL; } 230fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(baseData->rootElements == NULL) { 231fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_MISSING_RESOURCE_ERROR; 232fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "missing root elements data, tailoring not supported"; 233fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings)); 236fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(tailoring.isNull() || tailoring->isBogus()) { 237fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_MEMORY_ALLOCATION_ERROR; 238fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationRuleParser parser(baseData, errorCode); 241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return NULL; } 242fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: This always bases &[last variable] and &[first regular] 243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // on the root collator's maxVariable/variableTop. 244fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // If we wanted this to change after [maxVariable x], then we would keep 245fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // the tailoring.settings pointer here and read its variableTop when we need it. 246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // See http://unicode.org/cldr/trac/ticket/6070 247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius variableTop = base->settings->variableTop; 248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parser.setSink(this); 249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parser.setImporter(importer); 250dbc22bd174be483711cea006f3189d8289835830ccornelius CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings); 251dbc22bd174be483711cea006f3189d8289835830ccornelius parser.parse(ruleString, ownedSettings, outParseError, errorCode); 252fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = parser.getErrorReason(); 253fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return NULL; } 254fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(dataBuilder->hasMappings()) { 255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius makeTailoredCEs(errorCode); 256fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius closeOverComposites(errorCode); 257fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius finalizeCEs(errorCode); 258fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Copy all of ASCII, and Latin-1 letters, into each tailoring. 259fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius optimizeSet.add(0, 0x7f); 260fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius optimizeSet.add(0xc0, 0xff); 261fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Hangul is decomposed on the fly during collation, 262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and the tailoring data is always built with HANGUL_TAG specials. 263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); 264fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder->optimize(optimizeSet, errorCode); 265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->ensureOwnedData(errorCode); 266fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return NULL; } 267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fastLatinEnabled) { dataBuilder->enableFastLatin(); } 268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder->build(*tailoring->ownedData, errorCode); 269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->builder = dataBuilder; 270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder = NULL; 271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 272fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->data = baseData; 273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return NULL; } 275dbc22bd174be483711cea006f3189d8289835830ccornelius ownedSettings.fastLatinOptions = CollationFastLatin::getOptions( 276dbc22bd174be483711cea006f3189d8289835830ccornelius tailoring->data, ownedSettings, 277dbc22bd174be483711cea006f3189d8289835830ccornelius ownedSettings.fastLatinPrimaries, LENGTHOF(ownedSettings.fastLatinPrimaries)); 278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->rules = ruleString; 279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->rules.getTerminatedBuffer(); // ensure NUL-termination 280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->setVersion(base->version, rulesVersion); 281fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return tailoring.orphan(); 282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 284fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 285fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addReset(int32_t strength, const UnicodeString &str, 286fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *&parserErrorReason, UErrorCode &errorCode) { 287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!str.isEmpty()); 289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(str.charAt(0) == CollationRuleParser::POS_LEAD) { 290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode); 291fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = 1; 292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0); 294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // normal reset to a character or string 296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString nfdString = nfd.normalize(str, errorCode); 297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "normalizing the reset position"; 299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = dataBuilder->getCEs(nfdString, ces, 0); 302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_ILLEGAL_ARGUMENT_ERROR; 304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset position maps to too many collation elements (more than 31)"; 305fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 307fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_IDENTICAL) { return; } // simple reset-at-position 309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 310fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // &[before strength]position 311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY); 312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); 313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 316fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // If the index is for a "weaker" tailored node, 317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // then skip backwards over this and further "weaker" nodes. 318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(strengthFromNode(node) > strength) { 319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = previousIndexFromNode(node); 320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 323fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find or insert a node whose index we will put into a temporary CE. 324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) == strength && isTailoredNode(node)) { 325fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Reset to just before this same-strength tailored node. 326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = previousIndexFromNode(node); 327fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(strength == UCOL_PRIMARY) { 328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // root primary node (has no previous index) 329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p = weight32FromNode(node); 330fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(p == 0) { 331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset primary-before ignorable not possible"; 333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 334fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(p <= rootElements.getFirstPrimary()) { 336fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // There is no primary gap between ignorables and the space-first-primary. 337fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 338fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset primary-before first non-ignorable not supported"; 339fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 341fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(p == Collation::FIRST_TRAILING_PRIMARY) { 342fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We do not support tailoring to an unassigned-implicit CE. 343fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 344fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset primary-before [first trailing] not supported"; 345fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 346fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 347fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p)); 348fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findOrInsertNodeForPrimary(p, errorCode); 349fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Go to the last node in this list: 350fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Tailor after the last node between adjacent root nodes. 351fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 352fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 353fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex = nextIndexFromNode(node); 354fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextIndex == 0) { break; } 355fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndex; 356fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 357fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 358fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // &[before 2] or &[before 3] 359fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findCommonNode(index, UCOL_SECONDARY); 360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength >= UCOL_TERTIARY) { 361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findCommonNode(index, UCOL_TERTIARY); 362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) == strength) { 365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Found a same-strength node with an explicit weight. 366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t weight16 = weight16FromNode(node); 367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(weight16 == 0) { 368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_SECONDARY) { 370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset secondary-before secondary ignorable not possible"; 371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 372fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset tertiary-before completely ignorable not possible"; 373fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16); 377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t previousIndex = previousIndexFromNode(node); 378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(weight16 == Collation::COMMON_WEIGHT16) { 379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Reset to just before this same-strength common-weight node. 380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = previousIndex; 381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // A non-common weight is only possible from a root CE. 383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the higher-level weights, which must all be explicit, 384fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and then find the preceding weight for this level. 385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t previousWeight16 = 0; 386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t previousWeightIndex = -1; 387fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = index; 388fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_SECONDARY) { 389fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p; 390fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius do { 391fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius i = previousIndexFromNode(node); 392fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(i); 393fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) == UCOL_SECONDARY && !isTailoredNode(node) && 394fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius previousWeightIndex < 0) { 395fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius previousWeightIndex = i; 396fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius previousWeight16 = weight16FromNode(node); 397fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 398fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } while(strengthFromNode(node) > UCOL_PRIMARY); 399fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!isTailoredNode(node)); 400fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius p = weight32FromNode(node); 401fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius weight16 = rootElements.getSecondaryBefore(p, weight16); 402fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 403fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p, s; 404fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius do { 405fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius i = previousIndexFromNode(node); 406fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(i); 407fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) == UCOL_TERTIARY && !isTailoredNode(node) && 408fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius previousWeightIndex < 0) { 409fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius previousWeightIndex = i; 410fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius previousWeight16 = weight16FromNode(node); 411fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 412fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } while(strengthFromNode(node) > UCOL_SECONDARY); 413fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!isTailoredNode(node)); 414fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) == UCOL_SECONDARY) { 415fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = weight16FromNode(node); 416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius do { 417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius i = previousIndexFromNode(node); 418fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(i); 419fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } while(strengthFromNode(node) > UCOL_PRIMARY); 420fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!isTailoredNode(node)); 421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!nodeHasBefore2(node)); 423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = Collation::COMMON_WEIGHT16; 424fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 425fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius p = weight32FromNode(node); 426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius weight16 = rootElements.getTertiaryBefore(p, s, weight16); 427fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0); 428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find or insert the new explicit weight before the current one. 430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(previousWeightIndex >= 0 && weight16 == previousWeight16) { 431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Tailor after the last node between adjacent root nodes. 432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = previousIndex; 433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodeFromWeight16(weight16) | nodeFromStrength(strength); 435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = insertNodeBetween(previousIndex, index, node, errorCode); 436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 438fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 439fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Found a stronger node with implied strength-common weight. 440fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t hasBefore3 = 0; 441fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_SECONDARY) { 442fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!nodeHasBefore2(node)); 443fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Move the HAS_BEFORE3 flag from the parent node 444fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // to the new secondary common node. 445fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius hasBefore3 = node & HAS_BEFORE3; 446fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = (node & ~(int64_t)HAS_BEFORE3) | HAS_BEFORE2; 447fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 448fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!nodeHasBefore3(node)); 449fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node |= HAS_BEFORE3; 450fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 451fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodes.setElementAt(node, index); 452fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex = nextIndexFromNode(node); 453fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert default nodes with weights 02 and 05, reset to the 02 node. 454fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength); 455fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = insertNodeBetween(index, nextIndex, node, errorCode); 456fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 | 457fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodeFromStrength(strength); 458fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius insertNodeBetween(index, nextIndex, node, errorCode); 459fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 460fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Strength of the temporary CE = strength of its reset position. 461fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Code above raises an error if the before-strength is stronger. 462fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strength = ceStrength(ces[cesLength - 1]); 463fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 464fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 465fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "inserting reset position for &[before n]"; 466fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 467fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 468fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); 469fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 470fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 471fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint64_t 472fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::getSpecialResetPosition(const UnicodeString &str, 473fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *&parserErrorReason, UErrorCode &errorCode) { 474fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(str.length() == 2); 475fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce; 476fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t strength = UCOL_PRIMARY; 477fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isBoundary = FALSE; 478fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE; 479fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING); 480fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius switch(pos) { 481fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_TERTIARY_IGNORABLE: 482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Quaternary CEs are not supported. 483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Non-zero quaternary weights are possible only on tertiary or stronger CEs. 484fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 485fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_TERTIARY_IGNORABLE: 486fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 487fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: { 488fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Look for a tailored tertiary node after [0, 0, 0]. 489fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode); 490fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 491fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((index = nextIndexFromNode(node)) != 0) { 493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY); 495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) { 496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return tempCEFromIndexAndStrength(index, UCOL_TERTIARY); 497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 499fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return rootElements.getFirstTertiaryCE(); 500fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No need to look for nodeHasAnyBefore() on a tertiary node. 501fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 502fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_SECONDARY_IGNORABLE: 503fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.getLastTertiaryCE(); 504fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strength = UCOL_TERTIARY; 505fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 506fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: { 507fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Look for a tailored secondary node after [0, 0, *]. 508fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode); 509fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 510fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 511fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while((index = nextIndexFromNode(node)) != 0) { 512fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 513fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strength = strengthFromNode(node); 514fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength < UCOL_SECONDARY) { break; } 515fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_SECONDARY) { 516fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nodeHasBefore3(node)) { 518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 519fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isTailoredNode(nodes.elementAti(index))); 520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return tempCEFromIndexAndStrength(index, UCOL_SECONDARY); 522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 524fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 525fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 526fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 527fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.getFirstSecondaryCE(); 528fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strength = UCOL_SECONDARY; 529fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 530fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 531fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_PRIMARY_IGNORABLE: 532fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.getLastSecondaryCE(); 533fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strength = UCOL_SECONDARY; 534fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 535fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_VARIABLE: 536fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.getFirstPrimaryCE(); 537fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isBoundary = TRUE; // FractionalUCA.txt: FDD1 00A0, SPACE first primary 538fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 539fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_VARIABLE: 540fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); 541fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 542fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_REGULAR: 543fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); 544fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isBoundary = TRUE; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary 545fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 546fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_REGULAR: 547fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Use the Hani-first-primary rather than the actual last "regular" CE before it, 548fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // for backward compatibility with behavior before the introduction of 549fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // script-first-primary CEs in the root collator. 550fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.firstCEWithPrimaryAtLeast( 551fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius baseData->getFirstPrimaryForGroup(USCRIPT_HAN)); 552fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 553fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_IMPLICIT: { 554fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = baseData->getCE32(0x4e00); 555fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(Collation::hasCE32Tag(ce32, Collation::OFFSET_TAG)); 556fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = baseData->getCEFromOffsetCE32(0x4e00, ce32); 557fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 558fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 559fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_IMPLICIT: 560fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We do not support tailoring to an unassigned-implicit CE. 561fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 562fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset to [last implicit] not supported"; 563fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 564fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_TRAILING: 565fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY); 566fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isBoundary = TRUE; // trailing first primary (there is no mapping for it) 567fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 568fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_TRAILING: 569fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_ILLEGAL_ARGUMENT_ERROR; 570fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "LDML forbids tailoring to U+FFFF"; 571fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 572fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius default: 573fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(FALSE); 574fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 575fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 576fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 577fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode); 578fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 579fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 580fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((pos & 1) == 0) { 581fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // even pos = [first xyz] 582fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!nodeHasAnyBefore(node) && isBoundary) { 583fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // A <group> first primary boundary is artificially added to FractionalUCA.txt. 584fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // It is reachable via its special contraction, but is not normally used. 585fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the first character tailored after the boundary CE, 586fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // or the first real root CE after it. 587fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((index = nextIndexFromNode(node)) != 0) { 588fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // If there is a following node, then it must be tailored 589fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // because there are no root CEs with a boundary primary 590fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and non-common secondary/tertiary weights. 591fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 592fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isTailoredNode(node)); 593fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = tempCEFromIndexAndStrength(index, strength); 594fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 595fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(strength == UCOL_PRIMARY); 596fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p = (uint32_t)(ce >> 32); 597fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t pIndex = rootElements.findPrimary(p); 598fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isCompressible = baseData->isCompressiblePrimary(p); 599fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); 600fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = Collation::makeCE(p); 601fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode); 602fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 603fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 604fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 605fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 606fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nodeHasAnyBefore(node)) { 607fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Get the first node that was tailored before this one at a weaker strength. 608fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nodeHasBefore2(node)) { 609fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 610fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 611fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 612fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nodeHasBefore3(node)) { 613fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 614fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 615fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isTailoredNode(nodes.elementAti(index))); 616fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = tempCEFromIndexAndStrength(index, strength); 617fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 618fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 619fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // odd pos = [last xyz] 620fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the last node that was tailored after the [last xyz] 621fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // at a strength no greater than the position's strength. 622fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 623fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex = nextIndexFromNode(node); 624fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextIndex == 0) { break; } 625fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t nextNode = nodes.elementAti(nextIndex); 626fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(nextNode) < strength) { break; } 627fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndex; 628fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nextNode; 629fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 630fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not make a temporary CE for a root node. 631fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // This last node might be the node for the root CE itself, 632fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // or a node with a common secondary or tertiary weight. 633fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 634fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = tempCEFromIndexAndStrength(index, strength); 635fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 636fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 637fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce; 638fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 639fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 640fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 641fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix, 642fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UnicodeString &str, const UnicodeString &extension, 643fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *&parserErrorReason, UErrorCode &errorCode) { 644fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 645fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString nfdPrefix; 646fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!prefix.isEmpty()) { 647fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfd.normalize(prefix, nfdPrefix, errorCode); 648fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 649fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "normalizing the relation prefix"; 650fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 651fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 652fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 653fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString nfdString = nfd.normalize(str, errorCode); 654fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 655fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "normalizing the relation string"; 656fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 657fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 658fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 659fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The runtime code decomposes Hangul syllables on the fly, 660fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // with recursive processing but without making the Jamo pieces visible for matching. 661fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // It does not work with certain types of contextual mappings. 662fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nfdLength = nfdString.length(); 663fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nfdLength >= 2) { 664fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar c = nfdString.charAt(0); 665fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) { 666fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // While handling a Hangul syllable, contractions starting with Jamo L or V 667fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // would not see the following Jamo of that syllable. 668fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 669fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "contractions starting with conjoining Jamo L or V not supported"; 670fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 671fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 672fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = nfdString.charAt(nfdLength - 1); 673fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Hangul::isJamoL(c) || 674fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) { 675fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // A contraction ending with Jamo L or L+V would require 676fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // generating Hangul syllables in addTailComposites() (588 for a Jamo L), 677fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // or decomposing a following Hangul syllable on the fly, during contraction matching. 678fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 679fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported"; 680fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 681fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 682fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // A Hangul syllable completely inside a contraction is ok. 683fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 684fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: If there is a prefix, then the parser checked that 685fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // both the prefix and the string beging with NFC boundaries (not Jamo V or T). 686fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) 687fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // (While handling a Hangul syllable, prefixes on Jamo V or T 688fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // would not see the previous Jamo of that syllable.) 689fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 690fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength != UCOL_IDENTICAL) { 691fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the node index after which we insert the new tailored node. 692fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); 693fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(cesLength > 0); 694fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce = ces[cesLength - 1]; 695fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) { 696fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // There is no primary gap between ignorables and the space-first-primary. 697fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 698fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "tailoring primary after ignorables not supported"; 699fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 700fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 701fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_QUATERNARY && ce == 0) { 702fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The CE data structure does not support non-zero quaternary weights 703fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // on tertiary ignorables. 704fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 705fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "tailoring quaternary after tertiary ignorables not supported"; 706fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 707fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 708fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert the new tailored node. 709fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = insertTailoredNodeAfter(index, strength, errorCode); 710fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 711fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "modifying collation elements"; 712fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 713fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 714fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Strength of the temporary CE: 715fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The new relation may yield a stronger CE but not a weaker one. 716fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t tempStrength = ceStrength(ce); 717fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength < tempStrength) { tempStrength = strength; } 718fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); 719fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 720fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 721fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius setCaseBits(nfdString, parserErrorReason, errorCode); 722fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 723fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 724fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t cesLengthBeforeExtension = cesLength; 725fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!extension.isEmpty()) { 726fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString nfdExtension = nfd.normalize(extension, errorCode); 727fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 728fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "normalizing the relation extension"; 729fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 730fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 731fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength); 732fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 733fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_ILLEGAL_ARGUMENT_ERROR; 734fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = 735fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius "extension string adds too many collation elements (more than 31 total)"; 736fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 737fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 738fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 739fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = Collation::UNASSIGNED_CE32; 740fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((prefix != nfdPrefix || str != nfdString) && 741fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) { 742fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Map from the original input to the CEs. 743fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We do this in case the canonical closure is incomplete, 744fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // so that it is possible to explicitly provide the missing mappings. 745fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode); 746fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 747fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode); 748fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 749fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "writing collation elements"; 750fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 751fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 752fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = cesLengthBeforeExtension; 753fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 754fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 755fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 756fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason, 757fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 758fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 759fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY); 760fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 761fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the last CE that is at least as "strong" as the requested difference. 762fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: Stronger is smaller (UCOL_PRIMARY=0). 763fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce; 764fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;; --cesLength) { 765fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength == 0) { 766fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = ces[0] = 0; 767fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = 1; 768fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 769fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 770fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = ces[cesLength - 1]; 771fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 772fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ceStrength(ce) <= strength) { break; } 773fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 774fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 775fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTempCE(ce)) { 776fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No need to findCommonNode() here for lower levels 777fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // because insertTailoredNodeAfter() will do that anyway. 778fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return indexFromTempCE(ce); 779fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 780fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 781fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // root CE 782fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) { 783fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 784fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "tailoring relative to an unassigned code point not supported"; 785fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 786fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 787fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return findOrInsertNodeForRootCE(ce, strength, errorCode); 788fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 789fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 790fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 791fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) { 792fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 793fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE); 794fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 795fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find or insert the node for each of the root CE's weights, 796fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // down to the requested level/strength. 797fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). 798fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT((ce & 0xc0) == 0); 799fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32) , errorCode); 800fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength >= UCOL_SECONDARY) { 801fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t lower32 = (uint32_t)ce; 802fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode); 803fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength >= UCOL_TERTIARY) { 804fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK, 805fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCOL_TERTIARY, errorCode); 806fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 807fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 808fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 809fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 810fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 811fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusnamespace { 812fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 813fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/** 814fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Like Java Collections.binarySearch(List, key, Comparator). 815fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 816fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * @return the index>=0 where the item was found, 817fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * or the index<0 for inserting the string at ~index in sorted order 818fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * (index into rootPrimaryIndexes) 819fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 820fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 821fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusbinarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length, 822fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t *nodes, uint32_t p) { 823fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(length == 0) { return ~0; } 824fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t start = 0; 825fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t limit = length; 826fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for (;;) { 827fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = (start + limit) / 2; 828fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes[rootPrimaryIndexes[i]]; 829fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t nodePrimary = (uint32_t)(node >> 32); // weight32FromNode(node) 830fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (p == nodePrimary) { 831fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return i; 832fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if (p < nodePrimary) { 833fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (i == start) { 834fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ~start; // insert s before i 835fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 836fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius limit = i; 837fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 838fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (i == start) { 839fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ~(start + 1); // insert s after i 840fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 841fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius start = i; 842fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 843fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 844fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 845fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 846fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} // namespace 847fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 848fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 849fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) { 850fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 851fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 852fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t rootIndex = binarySearchForRootPrimaryNode( 853fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); 854fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(rootIndex >= 0) { 855fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return rootPrimaryIndexes.elementAti(rootIndex); 856fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 857fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Start a new list of nodes with this primary. 858fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = nodes.size(); 859fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodes.addElement(nodeFromWeight32(p), errorCode); 860fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode); 861fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 862fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 863fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 864fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 865fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 866fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) { 867fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 868fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(0 <= index && index < nodes.size()); 869fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 870fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(weight16 == 0 || weight16 >= Collation::COMMON_WEIGHT16); 871fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Only reset-before inserts common weights. 872fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(weight16 == Collation::COMMON_WEIGHT16) { 873fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return findCommonNode(index, level); 874fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 875fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the root CE's weight for this level. 876fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Postpone insertion if not found: 877fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert the new root node before the next stronger node, 878fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // or before the next root node with the same strength and a larger weight. 879fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 880fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex; 881fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while((nextIndex = nextIndexFromNode(node)) != 0) { 882fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(nextIndex); 883fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextStrength = strengthFromNode(node); 884fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextStrength <= level) { 885fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert before a stronger node. 886fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextStrength < level) { break; } 887fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // nextStrength == level 888fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!isTailoredNode(node)) { 889fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t nextWeight16 = weight16FromNode(node); 890fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextWeight16 == weight16) { 891fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Found the node for the root CE up to this level. 892fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return nextIndex; 893fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 894fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert before a node with a larger same-strength weight. 895fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextWeight16 > weight16) { break; } 896fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 897fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 898fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Skip the next node. 899fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndex; 900fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 901fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodeFromWeight16(weight16) | nodeFromStrength(level); 902fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return insertNodeBetween(index, nextIndex, node, errorCode); 903fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 904fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 905fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 906fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) { 907fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 908fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(0 <= index && index < nodes.size()); 909fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength >= UCOL_SECONDARY) { 910fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findCommonNode(index, UCOL_SECONDARY); 911fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength >= UCOL_TERTIARY) { 912fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findCommonNode(index, UCOL_TERTIARY); 913fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 914fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 915fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Postpone insertion: 916fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert the new node before the next one with a strength at least as strong. 917fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 918fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex; 919fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while((nextIndex = nextIndexFromNode(node)) != 0) { 920fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(nextIndex); 921fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) <= strength) { break; } 922fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Skip the next node which has a weaker (larger) strength than the new one. 923fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndex; 924fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 925fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = IS_TAILORED | nodeFromStrength(strength); 926fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return insertNodeBetween(index, nextIndex, node, errorCode); 927fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 928fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 929fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 930fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node, 931fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 932fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 933fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(previousIndexFromNode(node) == 0); 934fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(nextIndexFromNode(node) == 0); 935fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); 936fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Append the new node and link it to the existing nodes. 937fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t newIndex = nodes.size(); 938fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); 939fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodes.addElement(node, errorCode); 940fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 941fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // nodes[index].nextIndex = newIndex 942fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 943fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); 944fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // nodes[nextIndex].previousIndex = newIndex 945fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextIndex != 0) { 946fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(nextIndex); 947fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); 948fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 949fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return newIndex; 950fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 951fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 952fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 953fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findCommonNode(int32_t index, int32_t strength) const { 954fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY); 955fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 956fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) >= strength) { 957fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The current node is no stronger. 958fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 959fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 960fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { 961fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The current node implies the strength-common weight. 962fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 963fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 964fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndexFromNode(node); 965fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 966fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength && 967fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius weight16FromNode(node) == BEFORE_WEIGHT16); 968fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Skip to the explicit common node. 969fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius do { 970fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndexFromNode(node); 971fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 972fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(strengthFromNode(node) >= strength); 973fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } while(isTailoredNode(node) || strengthFromNode(node) > strength); 974fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16); 975fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 976fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 977fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 978fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 979fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::setCaseBits(const UnicodeString &nfdString, 980fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *&parserErrorReason, UErrorCode &errorCode) { 981fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 982fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t numTailoredPrimaries = 0; 983fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < cesLength; ++i) { 984fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; } 985fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 986fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We should not be able to get too many case bits because 987fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // cesLength<=31==MAX_EXPANSION_LENGTH. 988fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // 31 pairs of case bits fit into an int64_t without setting its sign bit. 989fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(numTailoredPrimaries <= 31); 990fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 991fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t cases = 0; 992fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(numTailoredPrimaries > 0) { 993fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UChar *s = nfdString.getBuffer(); 994fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length()); 995fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1; 996fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 997fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "fetching root CEs for tailored string"; 998fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 999fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1000fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE); 1001fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1002fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t lastCase = 0; 1003fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t numBasePrimaries = 0; 1004fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < baseCEsLength; ++i) { 1005fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce = baseCEs.getCE(i); 1006fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((ce >> 32) != 0) { 1007fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ++numBasePrimaries; 1008fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t c = ((uint32_t)ce >> 14) & 3; 1009fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE 1010fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(numBasePrimaries < numTailoredPrimaries) { 1011fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cases |= (int64_t)c << ((numBasePrimaries - 1) * 2); 1012fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(numBasePrimaries == numTailoredPrimaries) { 1013fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius lastCase = c; 1014fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(c != lastCase) { 1015fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // There are more base primary CEs than tailored primaries. 1016fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Set mixed case if the case bits of the remainder differ. 1017fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius lastCase = 1; 1018fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Nothing more can change. 1019fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 1020fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1021fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1022fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1023fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(numBasePrimaries >= numTailoredPrimaries) { 1024fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2); 1025fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1026fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1027fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1028fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < cesLength; ++i) { 1029fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff); // clear old case bits 1030fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t strength = ceStrength(ce); 1031fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_PRIMARY) { 1032fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce |= (cases & 3) << 14; 1033fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cases >>= 2; 1034fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(strength == UCOL_TERTIARY) { 1035fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Tertiary CEs must have uppercase bits. 1036fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // See the LDML spec, and comments in class CollationCompare. 1037fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce |= 0x8000; 1038fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1039fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Tertiary ignorable CEs must have 0 case bits. 1040fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We set 0 case bits for secondary CEs too 1041fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // since currently only U+0345 is cased and maps to a secondary CE, 1042fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and it is lowercase. Other secondaries are uncased. 1043fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. 1044fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ces[i] = ce; 1045fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1046fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1047fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1048fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1049fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason, 1050fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1051fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1052fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder->suppressContractions(set, errorCode); 1053fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 1054fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "application of [suppressContractions [set]] failed"; 1055fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1056fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1057fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1058fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1059fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */, 1060fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1061fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1062fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius optimizeSet.addAll(set); 1063fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1064fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1065fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 1066fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1067fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1068fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1069fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Map from the NFD input to the CEs. 1070fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); 1071fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); 1072fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius addTailComposites(nfdPrefix, nfdString, errorCode); 1073fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32; 1074fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1075fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1076fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 1077fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1078fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1079fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1080fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1081fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1082fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.) 1083fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nfdPrefix.isEmpty()) { 1084fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CanonicalIterator stringIter(nfdString, errorCode); 1085fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1086fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString prefix; 1087fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1088fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString str = stringIter.next(); 1089fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(str.isBogus()) { break; } 1090fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ignoreString(str, errorCode) || str == nfdString) { continue; } 1091fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); 1092fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1093fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1094fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1095fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CanonicalIterator prefixIter(nfdPrefix, errorCode); 1096fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CanonicalIterator stringIter(nfdString, errorCode); 1097fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1098fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1099fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString prefix = prefixIter.next(); 1100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(prefix.isBogus()) { break; } 1101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ignorePrefix(prefix, errorCode)) { continue; } 1102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool samePrefix = prefix == nfdPrefix; 1103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString str = stringIter.next(); 1105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(str.isBogus()) { break; } 1106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; } 1107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); 1108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius stringIter.reset(); 1111fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32; 1114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Look for the last starter in the NFD string. 1122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 lastStarter; 1123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t indexAfterLastStarter = nfdString.length(); 1124fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(indexAfterLastStarter == 0) { return; } // no starter at all 1126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius lastStarter = nfdString.char32At(indexAfterLastStarter - 1); 1127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nfd.getCombiningClass(lastStarter) == 0) { break; } 1128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius indexAfterLastStarter -= U16_LENGTH(lastStarter); 1129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No closure to Hangul syllables since we decompose them on the fly. 1131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Hangul::isJamoL(lastStarter)) { return; } 1132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Are there any composites whose decomposition starts with the lastStarter? 1134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. 1135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We might find some more equivalent mappings here if it did. 1136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet composites; 1137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } 1138fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString decomp; 1140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString newNFDString, newString; 1141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t newCEs[Collation::MAX_EXPANSION_LENGTH]; 1142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSetIterator iter(composites); 1143fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(iter.next()) { 1144fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!iter.isString()); 1145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 composite = iter.getCodepoint(); 1146fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfd.getDecomposition(composite, decomp); 1147fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp, 1148fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString, newString, errorCode)) { 1149fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius continue; 1150fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1151fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0); 1152fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) { 1153fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Ignore mappings that we cannot store. 1154fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius continue; 1155fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1156fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: It is possible that the newCEs do not make use of the mapping 1157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // for which we are adding the tail composites, in which case we might be adding 1158fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // unnecessary mappings. 1159fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // For example, when we add tail composites for ae^ (^=combining circumflex), 1160fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // UCA discontiguous-contraction matching does not find any matches 1161fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // for ae_^ (_=any combining diacritic below) *unless* there is also 1162fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // a contraction mapping for ae. 1163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Thus, if there is no ae contraction, then the ae^ mapping is ignored 1164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // while fetching the newCEs for ae_^. 1165fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // TODO: Try to detect this effectively. 1166fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // (Alternatively, print a warning when prefix contractions are missing.) 1167fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1168fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We do not need an explicit mapping for the NFD strings. 1169fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // It is fine if the NFD input collates like this via a sequence of mappings. 1170fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // It also saves a little bit of space, and may reduce the set of characters with contractions. 1171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = addIfDifferent(nfdPrefix, newString, 1172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode); 1173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 != Collation::UNASSIGNED_CE32) { 1174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // was different, was added 1175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode); 1176fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1178fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1179fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1180fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1181fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString, 1182fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t indexAfterLastStarter, 1183fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 composite, const UnicodeString &decomp, 1184fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString &newNFDString, UnicodeString &newString, 1185fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) const { 1186fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return FALSE; } 1187fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0)); 1188fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t lastStarterLength = decomp.moveIndex32(0, 1); 1189fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(lastStarterLength == decomp.length()) { 1190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Singleton decompositions should be found by addWithClosure() 1191fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and the CanonicalIterator, so we can ignore them here. 1192fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1193fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1194fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nfdString.compare(indexAfterLastStarter, 0x7fffffff, 1195fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius decomp, lastStarterLength, 0x7fffffff) == 0) { 1196fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // same strings, nothing new to be found here 1197fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1198fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1199fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1200fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Make new FCD strings that combine a composite, or its decomposition, 1201fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // into the nfdString's last starter and the combining marks following it. 1202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Make an NFD version, and a version with the composite. 1203fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString.setTo(nfdString, 0, indexAfterLastStarter); 1204fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite); 1205fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1206fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The following is related to discontiguous contraction matching, 1207fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // but builds only FCD strings (or else returns FALSE). 1208fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t sourceIndex = indexAfterLastStarter; 1209fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t decompIndex = lastStarterLength; 1210fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Small optimization: We keep the source character across loop iterations 1211fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // because we do not always consume it, 1212fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and then need not fetch it again nor look up its combining class again. 1213fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 sourceChar = U_SENTINEL; 1214fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The cc variables need to be declared before the loop so that at the end 1215fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // they are set to the last combining classes seen. 1216fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint8_t sourceCC = 0; 1217fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint8_t decompCC = 0; 1218fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1219fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(sourceChar < 0) { 1220fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(sourceIndex >= nfdString.length()) { break; } 1221fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sourceChar = nfdString.char32At(sourceIndex); 1222fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sourceCC = nfd.getCombiningClass(sourceChar); 1223fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(sourceCC != 0); 1224fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1225fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We consume a decomposition character in each iteration. 1226fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(decompIndex >= decomp.length()) { break; } 1227fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 decompChar = decomp.char32At(decompIndex); 1228fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius decompCC = nfd.getCombiningClass(decompChar); 1229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Compare the two characters and their combining classes. 1230fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(decompCC == 0) { 1231fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Unable to merge because the source contains a non-zero combining mark 1232fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // but the composite's decomposition contains another starter. 1233fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The strings would not be equivalent. 1234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(sourceCC < decompCC) { 1236fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Composite + sourceChar would not be FCD. 1237fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1238fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(decompCC < sourceCC) { 1239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString.append(decompChar); 1240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius decompIndex += U16_LENGTH(decompChar); 1241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(decompChar != sourceChar) { 1242fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Blocked because same combining class. 1243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1244fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { // match: decompChar == sourceChar 1245fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString.append(decompChar); 1246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius decompIndex += U16_LENGTH(decompChar); 1247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sourceIndex += U16_LENGTH(decompChar); 1248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sourceChar = U_SENTINEL; 1249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1250fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1251fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We are at the end of at least one of the two inputs. 1252fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(sourceChar >= 0) { // more characters from nfdString but not from decomp 1253fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(sourceCC < decompCC) { 1254fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Appending the next source character to the composite would not be FCD. 1255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1256fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1257fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString.append(nfdString, sourceIndex, 0x7fffffff); 1258fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newString.append(nfdString, sourceIndex, 0x7fffffff); 1259fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString 1260fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString.append(decomp, decompIndex, 0x7fffffff); 1261fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(nfd.isNormalized(newNFDString, errorCode)); 1263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(fcd.isNormalized(newString, errorCode)); 1264fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString); // canonically equivalent 1265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 1266fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const { 1270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not map non-FCD prefixes. 1271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return !isFCD(s, errorCode); 1272fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1275fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const { 1276fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not map non-FCD strings. 1277fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not map strings that start with Hangul syllables: We decompose those on the fly. 1278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0)); 1279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1281fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const { 1283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode); 1284fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1285fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1286fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::closeOverComposites(UErrorCode &errorCode) { 1288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode); // Java: static final 1289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Hangul is decomposed on the fly during collation. 1291fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); 1292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString prefix; // empty 1293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString nfdString; 1294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSetIterator iter(composites); 1295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(iter.next()) { 1296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!iter.isString()); 1297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfd.getDecomposition(iter.getCodepoint(), nfdString); 1298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = dataBuilder->getCEs(nfdString, ces, 0); 1299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 1300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Too many CEs from the decomposition (unusual), ignore this composite. 1301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We could add a capacity parameter to getCEs() and reallocate if necessary. 1302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // However, this can only really happen in contrived cases. 1303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius continue; 1304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1305fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UnicodeString &composite(iter.getString()); 1306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode); 1307fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1310fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 1311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str, 1312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH]; 1316fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0); 1317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { 1318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 == Collation::UNASSIGNED_CE32) { 1319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode); 1320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder->addCE32(prefix, str, ce32, errorCode); 1322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1323fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32; 1324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1325fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1327fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length, 1328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t ces2[], int32_t ces2Length) { 1329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ces1Length != ces2Length) { 1330fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH); 1333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < ces1Length; ++i) { 1334fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ces1[i] != ces2[i]) { return FALSE; } 1335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1336fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 1337fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1338fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1339fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1341fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 1342fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusalignWeightRight(uint32_t w) { 1343fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(w != 0) { 1344fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while((w & 0xff) == 0) { w >>= 8; } 1345fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1346fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return w; 1347fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1348fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1349fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1350fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1351fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1352fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::makeTailoredCEs(UErrorCode &errorCode) { 1353fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1354fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1355fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationWeights primaries, secondaries, tertiaries; 1356fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t *nodesArray = nodes.getBuffer(); 1357fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1358fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { 1359fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = rootPrimaryIndexes.elementAti(rpi); 1360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodesArray[i]; 1361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p = weight32FromNode(node); 1362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16; 1363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t t = s; 1364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t q = 0; 1365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool pIsTailored = FALSE; 1366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool sIsTailored = FALSE; 1367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool tIsTailored = FALSE; 1368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf("\nprimary %lx\n", (long)alignWeightRight(p)); 1370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p); 1372fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex = nextIndexFromNode(node); 1373fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(nextIndex != 0) { 1374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius i = nextIndex; 1375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodesArray[i]; 1376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nextIndex = nextIndexFromNode(node); 1377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t strength = strengthFromNode(node); 1378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_QUATERNARY) { 1379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isTailoredNode(node)); 1380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf(" quat+ "); 1382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(q == 3) { 1384fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 1385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "quaternary tailoring gap too small"; 1386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1387fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1388fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ++q; 1389fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1390fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_TERTIARY) { 1391fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 1392fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1393fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf(" ter+ "); 1394fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1395fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!tIsTailored) { 1396fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // First tailored tertiary node for [p, s]. 1397fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t tCount = countTailoredNodes(nodesArray, nextIndex, 1398fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCOL_TERTIARY) + 1; 1399fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t tLimit; 1400fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(t == 0) { 1401fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Gap at the beginning of the tertiary CE range. 1402fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius t = rootElements.getTertiaryBoundary() - 0x100; 1403fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK; 1404fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(t == BEFORE_WEIGHT16) { 1405fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tLimit = Collation::COMMON_WEIGHT16; 1406fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(!pIsTailored && !sIsTailored) { 1407fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // p and s are root weights. 1408fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tLimit = rootElements.getTertiaryAfter(pIndex, s, t); 1409fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1410fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // [p, s] is tailored. 1411fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(t == Collation::COMMON_WEIGHT16); 1412fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tLimit = rootElements.getTertiaryBoundary(); 1413fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1414fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0); 1415fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tertiaries.initForTertiary(); 1416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!tertiaries.allocWeights(t, tLimit, tCount)) { 1417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 1418fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "tertiary tailoring gap too small"; 1419fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1420fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tIsTailored = TRUE; 1422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius t = tertiaries.nextWeight(); 1424fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(t != 0xffffffff); 1425fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius t = weight16FromNode(node); 1427fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tIsTailored = FALSE; 1428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf(" ter %lx\n", (long)alignWeightRight(t)); 1430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_SECONDARY) { 1434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 1435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf(" sec+ "); 1437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1438fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!sIsTailored) { 1439fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // First tailored secondary node for p. 1440fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t sCount = countTailoredNodes(nodesArray, nextIndex, 1441fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCOL_SECONDARY) + 1; 1442fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t sLimit; 1443fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(s == 0) { 1444fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Gap at the beginning of the secondary CE range. 1445fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = rootElements.getSecondaryBoundary() - 0x100; 1446fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sLimit = rootElements.getFirstSecondaryCE() >> 16; 1447fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(s == BEFORE_WEIGHT16) { 1448fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sLimit = Collation::COMMON_WEIGHT16; 1449fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(!pIsTailored) { 1450fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // p is a root primary. 1451fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sLimit = rootElements.getSecondaryAfter(pIndex, s); 1452fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1453fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // p is a tailored primary. 1454fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(s == Collation::COMMON_WEIGHT16); 1455fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sLimit = rootElements.getSecondaryBoundary(); 1456fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1457fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(s == Collation::COMMON_WEIGHT16) { 1458fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not tailor into the getSortKey() range of 1459fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // compressed common secondaries. 1460fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = rootElements.getLastCommonSecondary(); 1461fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1462fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius secondaries.initForSecondary(); 1463fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!secondaries.allocWeights(s, sLimit, sCount)) { 1464fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 1465fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "secondary tailoring gap too small"; 1466fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1467fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1468fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sIsTailored = TRUE; 1469fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1470fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = secondaries.nextWeight(); 1471fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(s != 0xffffffff); 1472fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1473fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = weight16FromNode(node); 1474fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sIsTailored = FALSE; 1475fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1476fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf(" sec %lx\n", (long)alignWeightRight(s)); 1477fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1478fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1479fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else /* UCOL_PRIMARY */ { 1480fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isTailoredNode(node)); 1481fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf("pri+ "); 1483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1484fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!pIsTailored) { 1485fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // First tailored primary node in this list. 1486fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t pCount = countTailoredNodes(nodesArray, nextIndex, 1487fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCOL_PRIMARY) + 1; 1488fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isCompressible = baseData->isCompressiblePrimary(p); 1489fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t pLimit = 1490fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius rootElements.getPrimaryAfter(p, pIndex, isCompressible); 1491fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius primaries.initForPrimary(isCompressible); 1492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!primaries.allocWeights(p, pLimit, pCount)) { 1493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; // TODO: introduce a more specific UErrorCode? 1494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "primary tailoring gap too small"; 1495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pIsTailored = TRUE; 1498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1499fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius p = primaries.nextWeight(); 1500fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(p != 0xffffffff); 1501fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = Collation::COMMON_WEIGHT16; 1502fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sIsTailored = FALSE; 1503fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1504fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius t = s == 0 ? 0 : Collation::COMMON_WEIGHT16; 1505fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tIsTailored = FALSE; 1506fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1507fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius q = 0; 1508fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1509fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 1510fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodesArray[i] = Collation::makeCE(p, s, t, q); 1511fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1512fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf("%016llx\n", (long long)nodesArray[i]); 1513fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1514fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1515fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1516fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1519fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 1520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) { 1521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t count = 0; 1522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(i == 0) { break; } 1524fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodesArray[i]; 1525fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) < strength) { break; } 1526fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) == strength) { 1527fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 1528fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ++count; 1529fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1530fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 1531fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1532fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1533fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius i = nextIndexFromNode(node); 1534fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1535fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return count; 1536fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1537fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1538fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass CEFinalizer : public CollationDataBuilder::CEModifier { 1539fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 1540fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CEFinalizer(const int64_t *ces) : finalCEs(ces) {} 1541fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual ~CEFinalizer(); 1542fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual int64_t modifyCE32(uint32_t ce32) const { 1543fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!Collation::isSpecialCE32(ce32)); 1544fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(CollationBuilder::isTempCE32(ce32)) { 1545fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // retain case bits 1546fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8); 1547fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1548fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::NO_CE; 1549fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1550fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1551fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual int64_t modifyCE(int64_t ce) const { 1552fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(CollationBuilder::isTempCE(ce)) { 1553fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // retain case bits 1554fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000); 1555fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1556fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::NO_CE; 1557fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1558fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1559fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1560fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprivate: 1561fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t *finalCEs; 1562fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 1563fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1564fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCEFinalizer::~CEFinalizer() {} 1565fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1566fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1567fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::finalizeCEs(UErrorCode &errorCode) { 1568fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1569fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode)); 1570fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(newBuilder.isNull()) { 1571fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_MEMORY_ALLOCATION_ERROR; 1572fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1573fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1574fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newBuilder->initForTailoring(baseData, errorCode); 1575fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CEFinalizer finalizer(nodes.getBuffer()); 1576fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newBuilder->copyFrom(*dataBuilder, finalizer, errorCode); 1577fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1578fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete dataBuilder; 1579fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder = newBuilder.orphan(); 1580fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1581fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1582fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 1583fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::ceStrength(int64_t ce) { 1584fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 1585fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isTempCE(ce) ? strengthFromTempCE(ce) : 1586fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY : 1587fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY : 1588fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce != 0 ? UCOL_TERTIARY : 1589fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCOL_IDENTICAL; 1590fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1591fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1592fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_END 1593fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1594fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_USE 1595fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1596fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CAPI UCollator * U_EXPORT2 1597fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusucol_openRules(const UChar *rules, int32_t rulesLength, 1598fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UColAttributeValue normalizationMode, UCollationStrength strength, 1599fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UParseError *parseError, UErrorCode *pErrorCode) { 1600fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(*pErrorCode)) { return NULL; } 1601fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(rules == NULL && rulesLength != 0) { 1602fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1603fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 1604fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1605fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius RuleBasedCollator *coll = new RuleBasedCollator(); 1606fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(coll == NULL) { 1607fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 1608fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 1609fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1610fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength); 1611fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode); 1612fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(*pErrorCode)) { 1613fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete coll; 1614fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 1615fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1616fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return coll->toUCollator(); 1617fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1618fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1619fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic const int32_t internalBufferSize = 512; 1620fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1621fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp 1622fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// because it calls UnicodeSet "builder" code that depends on all Unicode properties, 1623fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// and the rest of the collation "runtime" code only depends on normalization. 1624fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// This function is not related to the collation builder, 1625fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// but it did not seem worth moving it into its own .cpp file, 1626fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods. 1627fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CAPI int32_t U_EXPORT2 1628fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusucol_getUnsafeSet( const UCollator *coll, 1629fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius USet *unsafe, 1630fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode *status) 1631fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius{ 1632fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar buffer[internalBufferSize]; 1633fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t len = 0; 1634fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1635fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uset_clear(unsafe); 1636fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1637fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant 1638fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 1639fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; 1640fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1641fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // add chars that fail the fcd check 1642fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); 1643fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1644fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // add lead/trail surrogates 1645fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*, 1646fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // not when testing code *points*) 1647fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uset_addRange(unsafe, 0xd800, 0xdfff); 1648fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1649fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius USet *contractions = uset_open(0,0); 1650fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1651fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = 0, j = 0; 1652fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); 1653fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t contsSize = uset_size(contractions); 1654fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = 0; 1655fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Contraction set consists only of strings 1656fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // to get unsafe code points, we need to 1657fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // break the strings apart and add them to the unsafe set 1658fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(i = 0; i < contsSize; i++) { 1659fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); 1660fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(len > 0) { 1661fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius j = 0; 1662fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(j < len) { 1663fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U16_NEXT(buffer, j, len, c); 1664fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(j < len) { 1665fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uset_add(unsafe, c); 1666fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1667fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1668fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1669fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1670fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1671fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uset_close(contractions); 1672fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1673fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return uset_size(unsafe); 1674fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1675fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1676fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif // !UCONFIG_NO_COLLATION 1677