164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// Copyright (C) 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/* 4fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 5fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Copyright (C) 2013-2014, International Business Machines 6fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Corporation and others. All Rights Reserved. 7fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 8fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* collationbuilder.cpp 9fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* 10fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* (replaced the former ucol_bld.cpp) 11fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* 12fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created on: 2013may06 13fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created by: Markus W. Scherer 14fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*/ 15fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 16fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 17fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include <stdio.h> 18fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 19fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 20fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utypes.h" 21fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 22fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#if !UCONFIG_NO_COLLATION 23fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 24fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/caniter.h" 25fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/normalizer2.h" 26fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/tblcoll.h" 27fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/parseerr.h" 28fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/uchar.h" 29fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/ucol.h" 30fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/unistr.h" 31fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/usetiter.h" 32fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utf16.h" 33fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/uversion.h" 34fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "cmemory.h" 35fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collation.h" 36fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationbuilder.h" 37fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationdata.h" 38fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationdatabuilder.h" 39fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationfastlatin.h" 40fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationroot.h" 41fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationrootelements.h" 42fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationruleparser.h" 43fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationsettings.h" 44fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationtailoring.h" 45fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationweights.h" 46fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "normalizer2impl.h" 47fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "uassert.h" 48fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "ucol_imp.h" 49fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "utf16collationiterator.h" 50fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 51fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_BEGIN 52fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 53fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusnamespace { 54fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 55fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass BundleImporter : public CollationRuleParser::Importer { 56fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 57f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius BundleImporter() {} 58fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual ~BundleImporter(); 59f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius virtual void getRules( 60fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *localeID, const char *collationType, 61f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeString &rules, 62fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *&errorReason, UErrorCode &errorCode); 63fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 64fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 65f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusBundleImporter::~BundleImporter() {} 66fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 67f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusvoid 68fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusBundleImporter::getRules( 69fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *localeID, const char *collationType, 70f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius UnicodeString &rules, 71fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *& /*errorReason*/, UErrorCode &errorCode) { 72f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius CollationLoader::loadRules(localeID, collationType, rules, errorCode); 73fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 74fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 75fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} // namespace 76fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 77fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// RuleBasedCollator implementation ---------------------------------------- *** 78fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 79fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// These methods are here, rather than in rulebasedcollator.cpp, 80fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// for modularization: 81fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// Most code using Collator does not need to build a Collator from rules. 82fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// By moving these constructors and helper methods to a separate file, 83fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// most code will not have a static dependency on the builder code. 84fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 85fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator() 86fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 87fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 88fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 89f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius cacheEntry(NULL), 90fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 91fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 92fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 93fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 94fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 95fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode) 96fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 97fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 98fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 99f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius cacheEntry(NULL), 100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode); 104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength, 107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) 108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 111f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius cacheEntry(NULL), 112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode); 116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UColAttributeValue decompositionMode, 120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) 121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 124f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius cacheEntry(NULL), 125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode); 129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ECollationStrength strength, 133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UColAttributeValue decompositionMode, 134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) 135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 138f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius cacheEntry(NULL), 139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode); 143fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 144fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 146fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UParseError &parseError, UnicodeString &reason, 147fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) 148fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 149fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius settings(NULL), 150fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring(NULL), 151f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius cacheEntry(NULL), 152fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius validLocale(""), 153fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius explicitlySetAttributes(0), 154fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius actualLocaleIsSameAsValid(FALSE) { 155fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode); 156fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 158fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 159fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::internalBuildTailoring(const UnicodeString &rules, 160fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t strength, 161fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UColAttributeValue decompositionMode, 162fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UParseError *outParseError, UnicodeString *outReason, 163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const CollationTailoring *base = CollationRoot::getRoot(errorCode); 165fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 166fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(outReason != NULL) { outReason->remove(); } 167fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationBuilder builder(base, errorCode); 168fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UVersionInfo noVersion = { 0, 0, 0, 0 }; 169fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius BundleImporter importer; 170fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion, 171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius &importer, 172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius outParseError, errorCode)); 173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *reason = builder.getErrorReason(); 175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(reason != NULL && outReason != NULL) { 176fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *outReason = UnicodeString(reason, -1, US_INV); 177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 178fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 179fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 180fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius t->actualLocale.setToBogus(); 181f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius adoptTailoring(t.orphan(), errorCode); 182dbc22bd174be483711cea006f3189d8289835830ccornelius // Set attributes after building the collator, 183dbc22bd174be483711cea006f3189d8289835830ccornelius // to keep the default settings consistent with the rule string. 184dbc22bd174be483711cea006f3189d8289835830ccornelius if(strength != UCOL_DEFAULT) { 185dbc22bd174be483711cea006f3189d8289835830ccornelius setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode); 186dbc22bd174be483711cea006f3189d8289835830ccornelius } 187dbc22bd174be483711cea006f3189d8289835830ccornelius if(decompositionMode != UCOL_DEFAULT) { 188dbc22bd174be483711cea006f3189d8289835830ccornelius setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode); 189dbc22bd174be483711cea006f3189d8289835830ccornelius } 190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 191fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 192fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// CollationBuilder implementation ----------------------------------------- *** 193fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// Some compilers don't care if constants are defined in the .cpp file. 1951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// MS Visual C++ does not like it, but gcc requires it. clang does not care. 1961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#ifndef _MSC_VER 1971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertconst int32_t CollationBuilder::HAS_BEFORE2; 1981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertconst int32_t CollationBuilder::HAS_BEFORE3; 1991b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif 2001b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 201fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode) 202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : nfd(*Normalizer2::getNFDInstance(errorCode)), 203fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius fcd(*Normalizer2Factory::getFCDInstance(errorCode)), 204fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), 205fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius base(b), 206fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius baseData(b->data), 207fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius rootElements(b->data->rootElements, b->data->rootElementsLength), 208fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius variableTop(0), 209fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE), 210fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason(NULL), 211fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength(0), 212fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius rootPrimaryIndexes(errorCode), nodes(errorCode) { 213fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfcImpl.ensureCanonIterData(errorCode); 214fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 215fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "CollationBuilder fields initialization failed"; 216fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 217fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 218fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(dataBuilder == NULL) { 219fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_MEMORY_ALLOCATION_ERROR; 220fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 221fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 222fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder->initForTailoring(baseData, errorCode); 223fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 224fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "CollationBuilder initialization failed"; 225fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 226fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 227fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 228fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::~CollationBuilder() { 229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete dataBuilder; 230fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 231fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 232fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationTailoring * 233fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::parseAndBuild(const UnicodeString &ruleString, 234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UVersionInfo rulesVersion, 235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationRuleParser::Importer *importer, 236fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UParseError *outParseError, 237fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 238fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return NULL; } 239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(baseData->rootElements == NULL) { 240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_MISSING_RESOURCE_ERROR; 241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "missing root elements data, tailoring not supported"; 242fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 244fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings)); 245fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(tailoring.isNull() || tailoring->isBogus()) { 246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_MEMORY_ALLOCATION_ERROR; 247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationRuleParser parser(baseData, errorCode); 250fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return NULL; } 251fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: This always bases &[last variable] and &[first regular] 252fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // on the root collator's maxVariable/variableTop. 253fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // If we wanted this to change after [maxVariable x], then we would keep 254fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // the tailoring.settings pointer here and read its variableTop when we need it. 255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // See http://unicode.org/cldr/trac/ticket/6070 256fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius variableTop = base->settings->variableTop; 257fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parser.setSink(this); 258fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parser.setImporter(importer); 259dbc22bd174be483711cea006f3189d8289835830ccornelius CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings); 260dbc22bd174be483711cea006f3189d8289835830ccornelius parser.parse(ruleString, ownedSettings, outParseError, errorCode); 261fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = parser.getErrorReason(); 262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return NULL; } 263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(dataBuilder->hasMappings()) { 264fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius makeTailoredCEs(errorCode); 265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius closeOverComposites(errorCode); 266fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius finalizeCEs(errorCode); 267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Copy all of ASCII, and Latin-1 letters, into each tailoring. 268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius optimizeSet.add(0, 0x7f); 269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius optimizeSet.add(0xc0, 0xff); 270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Hangul is decomposed on the fly during collation, 271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and the tailoring data is always built with HANGUL_TAG specials. 272fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); 273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder->optimize(optimizeSet, errorCode); 274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->ensureOwnedData(errorCode); 275fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return NULL; } 276fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(fastLatinEnabled) { dataBuilder->enableFastLatin(); } 277fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder->build(*tailoring->ownedData, errorCode); 278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->builder = dataBuilder; 279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder = NULL; 280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 281fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->data = baseData; 282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return NULL; } 284dbc22bd174be483711cea006f3189d8289835830ccornelius ownedSettings.fastLatinOptions = CollationFastLatin::getOptions( 285dbc22bd174be483711cea006f3189d8289835830ccornelius tailoring->data, ownedSettings, 286f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries)); 287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->rules = ruleString; 288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->rules.getTerminatedBuffer(); // ensure NUL-termination 289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailoring->setVersion(base->version, rulesVersion); 290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return tailoring.orphan(); 291fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addReset(int32_t strength, const UnicodeString &str, 295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *&parserErrorReason, UErrorCode &errorCode) { 296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!str.isEmpty()); 298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(str.charAt(0) == CollationRuleParser::POS_LEAD) { 299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode); 300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = 1; 301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0); 303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // normal reset to a character or string 305fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString nfdString = nfd.normalize(str, errorCode); 306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 307fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "normalizing the reset position"; 308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 310fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = dataBuilder->getCEs(nfdString, ces, 0); 311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_ILLEGAL_ARGUMENT_ERROR; 313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset position maps to too many collation elements (more than 31)"; 314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 316fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_IDENTICAL) { return; } // simple reset-at-position 318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // &[before strength]position 320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY); 321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); 322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 323fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 3251b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // If the index is for a "weaker" node, 326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // then skip backwards over this and further "weaker" nodes. 327fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(strengthFromNode(node) > strength) { 328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = previousIndexFromNode(node); 329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 330fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find or insert a node whose index we will put into a temporary CE. 333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) == strength && isTailoredNode(node)) { 334fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Reset to just before this same-strength tailored node. 335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = previousIndexFromNode(node); 336fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(strength == UCOL_PRIMARY) { 337fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // root primary node (has no previous index) 338fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p = weight32FromNode(node); 339fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(p == 0) { 340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 341fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset primary-before ignorable not possible"; 342fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 343fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 344fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(p <= rootElements.getFirstPrimary()) { 345fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // There is no primary gap between ignorables and the space-first-primary. 346fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 347fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset primary-before first non-ignorable not supported"; 348fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 349fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 350fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(p == Collation::FIRST_TRAILING_PRIMARY) { 351fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We do not support tailoring to an unassigned-implicit CE. 352fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 353fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset primary-before [first trailing] not supported"; 354fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 355fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 356fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p)); 357fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findOrInsertNodeForPrimary(p, errorCode); 358fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Go to the last node in this list: 359fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Tailor after the last node between adjacent root nodes. 360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex = nextIndexFromNode(node); 363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextIndex == 0) { break; } 364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndex; 365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // &[before 2] or &[before 3] 368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findCommonNode(index, UCOL_SECONDARY); 369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength >= UCOL_TERTIARY) { 370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findCommonNode(index, UCOL_TERTIARY); 371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 3721b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // findCommonNode() stayed on the stronger node or moved to 3731b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // an explicit common-weight node of the reset-before strength. 374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) == strength) { 376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Found a same-strength node with an explicit weight. 377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t weight16 = weight16FromNode(node); 378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(weight16 == 0) { 379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_SECONDARY) { 381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset secondary-before secondary ignorable not possible"; 382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset tertiary-before completely ignorable not possible"; 384fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 3871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16); 3881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Reset to just before this node. 3891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Insert the preceding same-level explicit weight if it is not there already. 3901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Which explicit weight immediately precedes this one? 3911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert weight16 = getWeight16Before(index, node, strength); 3921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Does this preceding weight have a node? 3931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert uint32_t previousWeight16; 394fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t previousIndex = previousIndexFromNode(node); 3951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) { 3961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert node = nodes.elementAti(i); 3971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t previousStrength = strengthFromNode(node); 3981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if(previousStrength < strength) { 3991b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex); 4001b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Either the reset element has an above-common weight and 4011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // the parent node provides the implied common weight, 4021b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // or the reset element has a weight<=common in the node 4031b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // right after the parent, and we need to insert the preceding weight. 4041b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert previousWeight16 = Collation::COMMON_WEIGHT16; 4051b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 4061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } else if(previousStrength == strength && !isTailoredNode(node)) { 4071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert previousWeight16 = weight16FromNode(node); 4081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert break; 4091b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4101b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Skip weaker nodes and same-level tailored nodes. 4111b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4121b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if(previousWeight16 == weight16) { 4131b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // The preceding weight has a node, 4141b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // maybe with following weaker or tailored nodes. 4151b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Reset to the last of them. 416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = previousIndex; 417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 4181b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Insert a node with the preceding weight, reset to that. 4191b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert node = nodeFromWeight16(weight16) | nodeFromStrength(strength); 4201b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert index = insertNodeBetween(previousIndex, index, node, errorCode); 421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Found a stronger node with implied strength-common weight. 4241b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert uint32_t weight16 = getWeight16Before(index, node, strength); 4251b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert index = findOrInsertWeakNode(index, weight16, strength, errorCode); 426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 427fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Strength of the temporary CE = strength of its reset position. 428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Code above raises an error if the before-strength is stronger. 429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strength = ceStrength(ces[cesLength - 1]); 430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "inserting reset position for &[before n]"; 433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); 436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 4381b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertuint32_t 4391b7d32f919554dda9c193b32188251337bc756f1Fredrik RoubertCollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) { 4401b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node)); 4411b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Collect the root CE weights if this node is for a root CE. 4421b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // If it is not, then return the low non-primary boundary for a tailored CE. 4431b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert uint32_t t; 4441b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if(strengthFromNode(node) == UCOL_TERTIARY) { 4451b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert t = weight16FromNode(node); 4461b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } else { 4471b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert t = Collation::COMMON_WEIGHT16; // Stronger node with implied common weight. 4481b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4491b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert while(strengthFromNode(node) > UCOL_SECONDARY) { 4501b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert index = previousIndexFromNode(node); 4511b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert node = nodes.elementAti(index); 4521b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4531b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if(isTailoredNode(node)) { 4541b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return Collation::BEFORE_WEIGHT16; 4551b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4561b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert uint32_t s; 4571b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if(strengthFromNode(node) == UCOL_SECONDARY) { 4581b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert s = weight16FromNode(node); 4591b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } else { 4601b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert s = Collation::COMMON_WEIGHT16; // Stronger node with implied common weight. 4611b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4621b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert while(strengthFromNode(node) > UCOL_PRIMARY) { 4631b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert index = previousIndexFromNode(node); 4641b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert node = nodes.elementAti(index); 4651b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4661b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if(isTailoredNode(node)) { 4671b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return Collation::BEFORE_WEIGHT16; 4681b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4691b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // [p, s, t] is a root CE. Return the preceding weight for the requested level. 4701b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert uint32_t p = weight32FromNode(node); 4711b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert uint32_t weight16; 4721b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if(level == UCOL_SECONDARY) { 4731b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert weight16 = rootElements.getSecondaryBefore(p, s); 4741b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } else { 4751b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert weight16 = rootElements.getTertiaryBefore(p, s, t); 4761b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0); 4771b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 4781b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return weight16; 4791b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert} 4801b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 481fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint64_t 482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::getSpecialResetPosition(const UnicodeString &str, 483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *&parserErrorReason, UErrorCode &errorCode) { 484fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(str.length() == 2); 485fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce; 486fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t strength = UCOL_PRIMARY; 487fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isBoundary = FALSE; 488fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE; 489fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING); 490fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius switch(pos) { 491fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_TERTIARY_IGNORABLE: 492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Quaternary CEs are not supported. 493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Non-zero quaternary weights are possible only on tertiary or stronger CEs. 494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_TERTIARY_IGNORABLE: 496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: { 498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Look for a tailored tertiary node after [0, 0, 0]. 499fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode); 500fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 501fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 502fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((index = nextIndexFromNode(node)) != 0) { 503fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 504fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY); 505fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) { 506fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return tempCEFromIndexAndStrength(index, UCOL_TERTIARY); 507fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 508fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 509fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return rootElements.getFirstTertiaryCE(); 510fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No need to look for nodeHasAnyBefore() on a tertiary node. 511fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 512fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_SECONDARY_IGNORABLE: 513fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.getLastTertiaryCE(); 514fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strength = UCOL_TERTIARY; 515fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 516fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: { 517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Look for a tailored secondary node after [0, 0, *]. 518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode); 519fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while((index = nextIndexFromNode(node)) != 0) { 522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strength = strengthFromNode(node); 524fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength < UCOL_SECONDARY) { break; } 525fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_SECONDARY) { 526fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 527fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nodeHasBefore3(node)) { 528fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 529fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isTailoredNode(nodes.elementAti(index))); 530fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 531fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return tempCEFromIndexAndStrength(index, UCOL_SECONDARY); 532fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 533fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 534fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 535fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 536fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 537fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.getFirstSecondaryCE(); 538fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strength = UCOL_SECONDARY; 539fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 540fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 541fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_PRIMARY_IGNORABLE: 542fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.getLastSecondaryCE(); 543fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius strength = UCOL_SECONDARY; 544fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 545fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_VARIABLE: 546fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.getFirstPrimaryCE(); 547fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isBoundary = TRUE; // FractionalUCA.txt: FDD1 00A0, SPACE first primary 548fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 549fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_VARIABLE: 550fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); 551fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 552fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_REGULAR: 553fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); 554fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isBoundary = TRUE; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary 555fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 556fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_REGULAR: 557fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Use the Hani-first-primary rather than the actual last "regular" CE before it, 558fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // for backward compatibility with behavior before the introduction of 559fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // script-first-primary CEs in the root collator. 560fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = rootElements.firstCEWithPrimaryAtLeast( 561fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius baseData->getFirstPrimaryForGroup(USCRIPT_HAN)); 562fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 563f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius case CollationRuleParser::FIRST_IMPLICIT: 564f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius ce = baseData->getSingleCE(0x4e00, errorCode); 565fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 566fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_IMPLICIT: 567fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We do not support tailoring to an unassigned-implicit CE. 568fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 569fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "reset to [last implicit] not supported"; 570fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 571fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::FIRST_TRAILING: 572fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY); 573fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isBoundary = TRUE; // trailing first primary (there is no mapping for it) 574fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 575fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius case CollationRuleParser::LAST_TRAILING: 576fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_ILLEGAL_ARGUMENT_ERROR; 577fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "LDML forbids tailoring to U+FFFF"; 578fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 579fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius default: 580fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(FALSE); 581fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 582fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 583fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 584fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode); 585fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 586fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 587fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((pos & 1) == 0) { 588fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // even pos = [first xyz] 589fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!nodeHasAnyBefore(node) && isBoundary) { 590fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // A <group> first primary boundary is artificially added to FractionalUCA.txt. 591fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // It is reachable via its special contraction, but is not normally used. 592fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the first character tailored after the boundary CE, 593fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // or the first real root CE after it. 594fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((index = nextIndexFromNode(node)) != 0) { 595fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // If there is a following node, then it must be tailored 596fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // because there are no root CEs with a boundary primary 597fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and non-common secondary/tertiary weights. 598fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 599fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isTailoredNode(node)); 600fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = tempCEFromIndexAndStrength(index, strength); 601fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 602fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(strength == UCOL_PRIMARY); 603fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p = (uint32_t)(ce >> 32); 604fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t pIndex = rootElements.findPrimary(p); 605fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isCompressible = baseData->isCompressiblePrimary(p); 606fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); 607fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = Collation::makeCE(p); 608fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode); 609fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 610fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 611fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 612fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 613fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nodeHasAnyBefore(node)) { 614fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Get the first node that was tailored before this one at a weaker strength. 615fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nodeHasBefore2(node)) { 616fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 617fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 618fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 619fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nodeHasBefore3(node)) { 620fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 621fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 622fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isTailoredNode(nodes.elementAti(index))); 623fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = tempCEFromIndexAndStrength(index, strength); 624fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 625fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 626fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // odd pos = [last xyz] 627fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the last node that was tailored after the [last xyz] 628fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // at a strength no greater than the position's strength. 629fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 630fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex = nextIndexFromNode(node); 631fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextIndex == 0) { break; } 632fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t nextNode = nodes.elementAti(nextIndex); 633fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(nextNode) < strength) { break; } 634fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndex; 635fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nextNode; 636fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 637fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not make a temporary CE for a root node. 638fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // This last node might be the node for the root CE itself, 639fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // or a node with a common secondary or tertiary weight. 640fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 641fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = tempCEFromIndexAndStrength(index, strength); 642fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 643fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 644fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce; 645fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 646fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 647fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 648fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix, 649fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UnicodeString &str, const UnicodeString &extension, 650fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *&parserErrorReason, UErrorCode &errorCode) { 651fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 652fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString nfdPrefix; 653fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!prefix.isEmpty()) { 654fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfd.normalize(prefix, nfdPrefix, errorCode); 655fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 656fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "normalizing the relation prefix"; 657fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 658fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 659fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 660fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString nfdString = nfd.normalize(str, errorCode); 661fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 662fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "normalizing the relation string"; 663fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 664fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 665fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 666fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The runtime code decomposes Hangul syllables on the fly, 667fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // with recursive processing but without making the Jamo pieces visible for matching. 668fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // It does not work with certain types of contextual mappings. 669fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nfdLength = nfdString.length(); 670fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nfdLength >= 2) { 671fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar c = nfdString.charAt(0); 672fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) { 673fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // While handling a Hangul syllable, contractions starting with Jamo L or V 674fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // would not see the following Jamo of that syllable. 675fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 676fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "contractions starting with conjoining Jamo L or V not supported"; 677fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 678fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 679fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius c = nfdString.charAt(nfdLength - 1); 680fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Hangul::isJamoL(c) || 681fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) { 682fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // A contraction ending with Jamo L or L+V would require 683fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // generating Hangul syllables in addTailComposites() (588 for a Jamo L), 684fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // or decomposing a following Hangul syllable on the fly, during contraction matching. 685fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 686fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported"; 687fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 688fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 689fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // A Hangul syllable completely inside a contraction is ok. 690fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 691fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: If there is a prefix, then the parser checked that 692fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // both the prefix and the string beging with NFC boundaries (not Jamo V or T). 693fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) 694fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // (While handling a Hangul syllable, prefixes on Jamo V or T 695fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // would not see the previous Jamo of that syllable.) 696fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 697fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength != UCOL_IDENTICAL) { 698fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the node index after which we insert the new tailored node. 699fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); 700fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(cesLength > 0); 701fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce = ces[cesLength - 1]; 702fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) { 703fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // There is no primary gap between ignorables and the space-first-primary. 704fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 705fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "tailoring primary after ignorables not supported"; 706fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 707fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 708fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_QUATERNARY && ce == 0) { 709fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The CE data structure does not support non-zero quaternary weights 710fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // on tertiary ignorables. 711fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 712fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "tailoring quaternary after tertiary ignorables not supported"; 713fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 714fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 715fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert the new tailored node. 716fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = insertTailoredNodeAfter(index, strength, errorCode); 717fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 718fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "modifying collation elements"; 719fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 720fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 721fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Strength of the temporary CE: 722fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The new relation may yield a stronger CE but not a weaker one. 723fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t tempStrength = ceStrength(ce); 724fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength < tempStrength) { tempStrength = strength; } 725fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); 726fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 727fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 728fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius setCaseBits(nfdString, parserErrorReason, errorCode); 729fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 730fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 731fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t cesLengthBeforeExtension = cesLength; 732fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!extension.isEmpty()) { 733fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString nfdExtension = nfd.normalize(extension, errorCode); 734fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 735fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "normalizing the relation extension"; 736fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 737fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 738fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength); 739fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 740fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_ILLEGAL_ARGUMENT_ERROR; 741fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = 742fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius "extension string adds too many collation elements (more than 31 total)"; 743fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 744fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 745fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 746fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = Collation::UNASSIGNED_CE32; 747fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((prefix != nfdPrefix || str != nfdString) && 748fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) { 749fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Map from the original input to the CEs. 750fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We do this in case the canonical closure is incomplete, 751fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // so that it is possible to explicitly provide the missing mappings. 752fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode); 753fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 754fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode); 755fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 756fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "writing collation elements"; 757fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 758fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 759fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = cesLengthBeforeExtension; 760fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 761fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 762fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 763fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason, 764fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 765fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 766fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY); 767fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 768fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the last CE that is at least as "strong" as the requested difference. 769fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: Stronger is smaller (UCOL_PRIMARY=0). 770fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce; 771fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;; --cesLength) { 772fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength == 0) { 773fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = ces[0] = 0; 774fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = 1; 775fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 776fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 777fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce = ces[cesLength - 1]; 778fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 779fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ceStrength(ce) <= strength) { break; } 780fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 781fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 782fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTempCE(ce)) { 783fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No need to findCommonNode() here for lower levels 784fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // because insertTailoredNodeAfter() will do that anyway. 785fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return indexFromTempCE(ce); 786fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 787fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 788fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // root CE 789fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) { 790fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_UNSUPPORTED_ERROR; 791fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "tailoring relative to an unassigned code point not supported"; 792fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 0; 793fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 794fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return findOrInsertNodeForRootCE(ce, strength, errorCode); 795fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 796fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 797fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 798fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) { 799fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 800fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE); 801fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 802fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find or insert the node for each of the root CE's weights, 803fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // down to the requested level/strength. 804fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). 805fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT((ce & 0xc0) == 0); 8061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32), errorCode); 807fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength >= UCOL_SECONDARY) { 808fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t lower32 = (uint32_t)ce; 809fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode); 810fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength >= UCOL_TERTIARY) { 811fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK, 812fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCOL_TERTIARY, errorCode); 813fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 814fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 815fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 816fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 817fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 818fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusnamespace { 819fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 820fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/** 821fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Like Java Collections.binarySearch(List, key, Comparator). 822fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 823fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * @return the index>=0 where the item was found, 824fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * or the index<0 for inserting the string at ~index in sorted order 825fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * (index into rootPrimaryIndexes) 826fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 827fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 828fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusbinarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length, 829fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t *nodes, uint32_t p) { 830fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(length == 0) { return ~0; } 831fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t start = 0; 832fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t limit = length; 833fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for (;;) { 834fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = (start + limit) / 2; 835fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes[rootPrimaryIndexes[i]]; 836fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t nodePrimary = (uint32_t)(node >> 32); // weight32FromNode(node) 837fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (p == nodePrimary) { 838fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return i; 839fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if (p < nodePrimary) { 840fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (i == start) { 841fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ~start; // insert s before i 842fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 843fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius limit = i; 844fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 845fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if (i == start) { 846fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ~(start + 1); // insert s after i 847fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 848fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius start = i; 849fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 850fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 851fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 852fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 853fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} // namespace 854fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 855fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 856fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) { 857fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 858fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 859fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t rootIndex = binarySearchForRootPrimaryNode( 860fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); 861fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(rootIndex >= 0) { 862fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return rootPrimaryIndexes.elementAti(rootIndex); 863fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 864fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Start a new list of nodes with this primary. 865fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t index = nodes.size(); 866fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodes.addElement(nodeFromWeight32(p), errorCode); 867fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode); 868fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 869fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 870fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 871fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 872fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 873fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) { 874fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 875fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(0 <= index && index < nodes.size()); 8761b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY); 877fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 878fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(weight16 == Collation::COMMON_WEIGHT16) { 879fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return findCommonNode(index, level); 880fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 8811b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 8821b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // If this will be the first below-common weight for the parent node, 8831b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // then we will also need to insert a common weight after it. 8841b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int64_t node = nodes.elementAti(index); 8851b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert U_ASSERT(strengthFromNode(node) < level); // parent node is stronger 8861b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) { 8871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3; 8881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if((node & hasThisLevelBefore) == 0) { 8891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // The parent node has an implied level-common weight. 8901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int64_t commonNode = 8911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level); 8921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if(level == UCOL_SECONDARY) { 8931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Move the HAS_BEFORE3 flag from the parent node 8941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // to the new secondary common node. 8951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert commonNode |= node & HAS_BEFORE3; 8961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert node &= ~(int64_t)HAS_BEFORE3; 8971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 8981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert nodes.setElementAt(node | hasThisLevelBefore, index); 8991b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Insert below-common-weight node. 9001b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert int32_t nextIndex = nextIndexFromNode(node); 9011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert node = nodeFromWeight16(weight16) | nodeFromStrength(level); 9021b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert index = insertNodeBetween(index, nextIndex, node, errorCode); 9031b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Insert common-weight node. 9041b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert insertNodeBetween(index, nextIndex, commonNode, errorCode); 9051b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert // Return index of below-common-weight node. 9061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert return index; 9071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 9081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } 9091b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 910fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Find the root CE's weight for this level. 911fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Postpone insertion if not found: 912fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert the new root node before the next stronger node, 913fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // or before the next root node with the same strength and a larger weight. 914fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex; 915fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while((nextIndex = nextIndexFromNode(node)) != 0) { 916fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(nextIndex); 917fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextStrength = strengthFromNode(node); 918fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextStrength <= level) { 919fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert before a stronger node. 920fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextStrength < level) { break; } 921fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // nextStrength == level 922fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!isTailoredNode(node)) { 923fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t nextWeight16 = weight16FromNode(node); 924fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextWeight16 == weight16) { 925fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Found the node for the root CE up to this level. 926fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return nextIndex; 927fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 928fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert before a node with a larger same-strength weight. 929fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextWeight16 > weight16) { break; } 930fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 931fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 932fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Skip the next node. 933fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndex; 934fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 935fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodeFromWeight16(weight16) | nodeFromStrength(level); 936fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return insertNodeBetween(index, nextIndex, node, errorCode); 937fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 938fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 939fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 940fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) { 941fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 942fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(0 <= index && index < nodes.size()); 943fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength >= UCOL_SECONDARY) { 944fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findCommonNode(index, UCOL_SECONDARY); 945fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength >= UCOL_TERTIARY) { 946fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = findCommonNode(index, UCOL_TERTIARY); 947fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 948fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 949fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Postpone insertion: 950fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Insert the new node before the next one with a strength at least as strong. 951fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 952fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex; 953fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while((nextIndex = nextIndexFromNode(node)) != 0) { 954fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(nextIndex); 955fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) <= strength) { break; } 956fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Skip the next node which has a weaker (larger) strength than the new one. 957fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndex; 958fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 959fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = IS_TAILORED | nodeFromStrength(strength); 960fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return insertNodeBetween(index, nextIndex, node, errorCode); 961fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 962fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 963fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 964fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node, 965fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 966fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 967fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(previousIndexFromNode(node) == 0); 968fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(nextIndexFromNode(node) == 0); 969fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); 970fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Append the new node and link it to the existing nodes. 971fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t newIndex = nodes.size(); 972fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); 973fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodes.addElement(node, errorCode); 974fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return 0; } 975fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // nodes[index].nextIndex = newIndex 976fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 977fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); 978fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // nodes[nextIndex].previousIndex = newIndex 979fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nextIndex != 0) { 980fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(nextIndex); 981fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); 982fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 983fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return newIndex; 984fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 985fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 986fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 987fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findCommonNode(int32_t index, int32_t strength) const { 988fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY); 989fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodes.elementAti(index); 990fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) >= strength) { 991fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The current node is no stronger. 992fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 993fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 994fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { 995fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The current node implies the strength-common weight. 996fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 997fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 998fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndexFromNode(node); 999fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 1000fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength && 10011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert weight16FromNode(node) < Collation::COMMON_WEIGHT16); 1002fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Skip to the explicit common node. 1003fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius do { 1004fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius index = nextIndexFromNode(node); 1005fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodes.elementAti(index); 1006fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(strengthFromNode(node) >= strength); 10071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } while(isTailoredNode(node) || strengthFromNode(node) > strength || 10081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert weight16FromNode(node) < Collation::COMMON_WEIGHT16); 1009fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16); 1010fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return index; 1011fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1012fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1013fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1014fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::setCaseBits(const UnicodeString &nfdString, 1015fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const char *&parserErrorReason, UErrorCode &errorCode) { 1016fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1017fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t numTailoredPrimaries = 0; 1018fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < cesLength; ++i) { 1019fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; } 1020fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1021fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We should not be able to get too many case bits because 1022fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // cesLength<=31==MAX_EXPANSION_LENGTH. 1023fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // 31 pairs of case bits fit into an int64_t without setting its sign bit. 1024fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(numTailoredPrimaries <= 31); 1025fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1026fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t cases = 0; 1027fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(numTailoredPrimaries > 0) { 1028fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UChar *s = nfdString.getBuffer(); 1029fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length()); 1030fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1; 1031fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 1032fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "fetching root CEs for tailored string"; 1033fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1034fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1035fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE); 1036fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1037fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t lastCase = 0; 1038fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t numBasePrimaries = 0; 1039fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < baseCEsLength; ++i) { 1040fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce = baseCEs.getCE(i); 1041fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((ce >> 32) != 0) { 1042fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ++numBasePrimaries; 1043fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t c = ((uint32_t)ce >> 14) & 3; 1044fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE 1045fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(numBasePrimaries < numTailoredPrimaries) { 1046fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cases |= (int64_t)c << ((numBasePrimaries - 1) * 2); 1047fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(numBasePrimaries == numTailoredPrimaries) { 1048fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius lastCase = c; 1049fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(c != lastCase) { 1050fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // There are more base primary CEs than tailored primaries. 1051fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Set mixed case if the case bits of the remainder differ. 1052fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius lastCase = 1; 1053fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Nothing more can change. 1054fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 1055fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1056fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1057fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1058fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(numBasePrimaries >= numTailoredPrimaries) { 1059fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2); 1060fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1061fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1062fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1063fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < cesLength; ++i) { 1064fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff); // clear old case bits 1065fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t strength = ceStrength(ce); 1066fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_PRIMARY) { 1067fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce |= (cases & 3) << 14; 1068fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cases >>= 2; 1069fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(strength == UCOL_TERTIARY) { 1070fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Tertiary CEs must have uppercase bits. 1071fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // See the LDML spec, and comments in class CollationCompare. 1072fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce |= 0x8000; 1073fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1074fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Tertiary ignorable CEs must have 0 case bits. 1075fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We set 0 case bits for secondary CEs too 1076fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // since currently only U+0345 is cased and maps to a secondary CE, 1077fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and it is lowercase. Other secondaries are uncased. 1078fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. 1079fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ces[i] = ce; 1080fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1081fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1082fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1083fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1084fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason, 1085fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1086fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1087fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder->suppressContractions(set, errorCode); 1088fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { 1089fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius parserErrorReason = "application of [suppressContractions [set]] failed"; 1090fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1091fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1092fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1093fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1094fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */, 1095fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1096fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1097fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius optimizeSet.addAll(set); 1098fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1099fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 1101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Map from the NFD input to the CEs. 1105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); 1106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); 1107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius addTailComposites(nfdPrefix, nfdString, errorCode); 1108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32; 1109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1111fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 1112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.) 1118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nfdPrefix.isEmpty()) { 1119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CanonicalIterator stringIter(nfdString, errorCode); 1120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString prefix; 1122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString str = stringIter.next(); 1124fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(str.isBogus()) { break; } 1125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ignoreString(str, errorCode) || str == nfdString) { continue; } 1126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); 1127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CanonicalIterator prefixIter(nfdPrefix, errorCode); 1131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CanonicalIterator stringIter(nfdString, errorCode); 1132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString prefix = prefixIter.next(); 1135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(prefix.isBogus()) { break; } 1136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ignorePrefix(prefix, errorCode)) { continue; } 1137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool samePrefix = prefix == nfdPrefix; 1138fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString str = stringIter.next(); 1140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(str.isBogus()) { break; } 1141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; } 1142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); 1143fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1144fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius stringIter.reset(); 1146fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1147fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1148fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32; 1149fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1150fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1151fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1152fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1153fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1154fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1155fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1156fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Look for the last starter in the NFD string. 1157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 lastStarter; 1158fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t indexAfterLastStarter = nfdString.length(); 1159fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1160fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(indexAfterLastStarter == 0) { return; } // no starter at all 1161fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius lastStarter = nfdString.char32At(indexAfterLastStarter - 1); 1162fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nfd.getCombiningClass(lastStarter) == 0) { break; } 1163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius indexAfterLastStarter -= U16_LENGTH(lastStarter); 1164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1165fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // No closure to Hangul syllables since we decompose them on the fly. 1166fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(Hangul::isJamoL(lastStarter)) { return; } 1167fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1168fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Are there any composites whose decomposition starts with the lastStarter? 1169fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. 1170fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We might find some more equivalent mappings here if it did. 1171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet composites; 1172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } 1173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString decomp; 1175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString newNFDString, newString; 1176fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t newCEs[Collation::MAX_EXPANSION_LENGTH]; 1177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSetIterator iter(composites); 1178fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(iter.next()) { 1179fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!iter.isString()); 1180fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 composite = iter.getCodepoint(); 1181fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfd.getDecomposition(composite, decomp); 1182fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp, 1183fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString, newString, errorCode)) { 1184fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius continue; 1185fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1186fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0); 1187fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) { 1188fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Ignore mappings that we cannot store. 1189fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius continue; 1190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1191fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Note: It is possible that the newCEs do not make use of the mapping 1192fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // for which we are adding the tail composites, in which case we might be adding 1193fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // unnecessary mappings. 1194fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // For example, when we add tail composites for ae^ (^=combining circumflex), 1195fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // UCA discontiguous-contraction matching does not find any matches 1196fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // for ae_^ (_=any combining diacritic below) *unless* there is also 1197fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // a contraction mapping for ae. 1198fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Thus, if there is no ae contraction, then the ae^ mapping is ignored 1199fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // while fetching the newCEs for ae_^. 1200fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // TODO: Try to detect this effectively. 1201fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // (Alternatively, print a warning when prefix contractions are missing.) 1202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1203fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We do not need an explicit mapping for the NFD strings. 1204fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // It is fine if the NFD input collates like this via a sequence of mappings. 1205fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // It also saves a little bit of space, and may reduce the set of characters with contractions. 1206fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t ce32 = addIfDifferent(nfdPrefix, newString, 1207fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode); 1208fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 != Collation::UNASSIGNED_CE32) { 1209fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // was different, was added 1210fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode); 1211fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1212fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1213fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1214fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1215fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1216fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString, 1217fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t indexAfterLastStarter, 1218fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 composite, const UnicodeString &decomp, 1219fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString &newNFDString, UnicodeString &newString, 1220fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) const { 1221fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return FALSE; } 1222fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0)); 1223fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t lastStarterLength = decomp.moveIndex32(0, 1); 1224fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(lastStarterLength == decomp.length()) { 1225fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Singleton decompositions should be found by addWithClosure() 1226fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and the CanonicalIterator, so we can ignore them here. 1227fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1228fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(nfdString.compare(indexAfterLastStarter, 0x7fffffff, 1230fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius decomp, lastStarterLength, 0x7fffffff) == 0) { 1231fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // same strings, nothing new to be found here 1232fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1233fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Make new FCD strings that combine a composite, or its decomposition, 1236fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // into the nfdString's last starter and the combining marks following it. 1237fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Make an NFD version, and a version with the composite. 1238fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString.setTo(nfdString, 0, indexAfterLastStarter); 1239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite); 1240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The following is related to discontiguous contraction matching, 1242fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // but builds only FCD strings (or else returns FALSE). 1243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t sourceIndex = indexAfterLastStarter; 1244fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t decompIndex = lastStarterLength; 1245fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Small optimization: We keep the source character across loop iterations 1246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // because we do not always consume it, 1247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // and then need not fetch it again nor look up its combining class again. 1248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 sourceChar = U_SENTINEL; 1249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The cc variables need to be declared before the loop so that at the end 1250fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // they are set to the last combining classes seen. 1251fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint8_t sourceCC = 0; 1252fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint8_t decompCC = 0; 1253fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1254fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(sourceChar < 0) { 1255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(sourceIndex >= nfdString.length()) { break; } 1256fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sourceChar = nfdString.char32At(sourceIndex); 1257fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sourceCC = nfd.getCombiningClass(sourceChar); 1258fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(sourceCC != 0); 1259fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1260fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We consume a decomposition character in each iteration. 1261fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(decompIndex >= decomp.length()) { break; } 1262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 decompChar = decomp.char32At(decompIndex); 1263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius decompCC = nfd.getCombiningClass(decompChar); 1264fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Compare the two characters and their combining classes. 1265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(decompCC == 0) { 1266fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Unable to merge because the source contains a non-zero combining mark 1267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // but the composite's decomposition contains another starter. 1268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // The strings would not be equivalent. 1269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(sourceCC < decompCC) { 1271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Composite + sourceChar would not be FCD. 1272fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(decompCC < sourceCC) { 1274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString.append(decompChar); 1275fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius decompIndex += U16_LENGTH(decompChar); 1276fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(decompChar != sourceChar) { 1277fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Blocked because same combining class. 1278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { // match: decompChar == sourceChar 1280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString.append(decompChar); 1281fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius decompIndex += U16_LENGTH(decompChar); 1282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sourceIndex += U16_LENGTH(decompChar); 1283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sourceChar = U_SENTINEL; 1284fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1285fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1286fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We are at the end of at least one of the two inputs. 1287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(sourceChar >= 0) { // more characters from nfdString but not from decomp 1288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(sourceCC < decompCC) { 1289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Appending the next source character to the composite would not be FCD. 1290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1291fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString.append(nfdString, sourceIndex, 0x7fffffff); 1293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newString.append(nfdString, sourceIndex, 0x7fffffff); 1294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString 1295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newNFDString.append(decomp, decompIndex, 0x7fffffff); 1296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(nfd.isNormalized(newNFDString, errorCode)); 1298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(fcd.isNormalized(newString, errorCode)); 1299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString); // canonically equivalent 1300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 1301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const { 1305fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not map non-FCD prefixes. 1306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return !isFCD(s, errorCode); 1307fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1310fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const { 1311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not map non-FCD strings. 1312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not map strings that start with Hangul syllables: We decompose those on the fly. 1313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0)); 1314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1316fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const { 1318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode); 1319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::closeOverComposites(UErrorCode &errorCode) { 1323fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode); // Java: static final 1324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1325fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Hangul is decomposed on the fly during collation. 1326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); 1327fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString prefix; // empty 1328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString nfdString; 1329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSetIterator iter(composites); 1330fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(iter.next()) { 1331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!iter.isString()); 1332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nfd.getDecomposition(iter.getCodepoint(), nfdString); 1333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius cesLength = dataBuilder->getCEs(nfdString, ces, 0); 1334fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 1335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Too many CEs from the decomposition (unusual), ignore this composite. 1336fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We could add a capacity parameter to getCEs() and reallocate if necessary. 1337fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // However, this can only really happen in contrived cases. 1338fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius continue; 1339fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UnicodeString &composite(iter.getString()); 1341fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode); 1342fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1343fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1344fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1345fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 1346fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str, 1347fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1348fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode &errorCode) { 1349fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return ce32; } 1350fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH]; 1351fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0); 1352fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { 1353fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ce32 == Collation::UNASSIGNED_CE32) { 1354fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode); 1355fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1356fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder->addCE32(prefix, str, ce32, errorCode); 1357fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1358fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32; 1359fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool 1362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length, 1363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t ces2[], int32_t ces2Length) { 1364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ces1Length != ces2Length) { 1365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return FALSE; 1366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH); 1368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t i = 0; i < ces1Length; ++i) { 1369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(ces1[i] != ces2[i]) { return FALSE; } 1370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return TRUE; 1372fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1373fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t 1377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusalignWeightRight(uint32_t w) { 1378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(w != 0) { 1379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while((w & 0xff) == 0) { w >>= 8; } 1380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return w; 1382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1384fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1387fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::makeTailoredCEs(UErrorCode &errorCode) { 1388fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1389fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1390fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CollationWeights primaries, secondaries, tertiaries; 1391fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t *nodesArray = nodes.getBuffer(); 13921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#ifdef DEBUG_COLLATION_BUILDER 13931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert puts("\nCollationBuilder::makeTailoredCEs()"); 13941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif 1395fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1396fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { 1397fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = rootPrimaryIndexes.elementAti(rpi); 1398fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodesArray[i]; 1399fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t p = weight32FromNode(node); 1400fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16; 1401fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t t = s; 1402fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t q = 0; 1403fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool pIsTailored = FALSE; 1404fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool sIsTailored = FALSE; 1405fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool tIsTailored = FALSE; 1406fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1407fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf("\nprimary %lx\n", (long)alignWeightRight(p)); 1408fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1409fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p); 1410fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t nextIndex = nextIndexFromNode(node); 1411fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(nextIndex != 0) { 1412fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius i = nextIndex; 1413fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius node = nodesArray[i]; 1414fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nextIndex = nextIndexFromNode(node); 1415fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t strength = strengthFromNode(node); 1416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_QUATERNARY) { 1417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isTailoredNode(node)); 1418fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1419fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf(" quat+ "); 1420fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(q == 3) { 1422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 1423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "quaternary tailoring gap too small"; 1424fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1425fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ++q; 1427fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_TERTIARY) { 1429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 1430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf(" ter+ "); 1432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!tIsTailored) { 1434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // First tailored tertiary node for [p, s]. 1435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t tCount = countTailoredNodes(nodesArray, nextIndex, 1436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCOL_TERTIARY) + 1; 1437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t tLimit; 1438fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(t == 0) { 1439fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Gap at the beginning of the tertiary CE range. 1440fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius t = rootElements.getTertiaryBoundary() - 0x100; 1441fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK; 1442fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(!pIsTailored && !sIsTailored) { 1443fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // p and s are root weights. 1444fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tLimit = rootElements.getTertiaryAfter(pIndex, s, t); 14451b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } else if(t == Collation::BEFORE_WEIGHT16) { 14461b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert tLimit = Collation::COMMON_WEIGHT16; 1447fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1448fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // [p, s] is tailored. 1449fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(t == Collation::COMMON_WEIGHT16); 1450fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tLimit = rootElements.getTertiaryBoundary(); 1451fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1452fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0); 1453fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tertiaries.initForTertiary(); 1454fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!tertiaries.allocWeights(t, tLimit, tCount)) { 1455fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 1456fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "tertiary tailoring gap too small"; 1457fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1458fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1459fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tIsTailored = TRUE; 1460fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1461fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius t = tertiaries.nextWeight(); 1462fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(t != 0xffffffff); 1463fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1464fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius t = weight16FromNode(node); 1465fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tIsTailored = FALSE; 1466fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1467fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf(" ter %lx\n", (long)alignWeightRight(t)); 1468fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1469fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1470fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1471fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strength == UCOL_SECONDARY) { 1472fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 1473fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1474fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf(" sec+ "); 1475fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1476fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!sIsTailored) { 1477fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // First tailored secondary node for p. 1478fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t sCount = countTailoredNodes(nodesArray, nextIndex, 1479fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCOL_SECONDARY) + 1; 1480fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t sLimit; 1481fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(s == 0) { 1482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Gap at the beginning of the secondary CE range. 1483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = rootElements.getSecondaryBoundary() - 0x100; 1484fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sLimit = rootElements.getFirstSecondaryCE() >> 16; 1485fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else if(!pIsTailored) { 1486fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // p is a root primary. 1487fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sLimit = rootElements.getSecondaryAfter(pIndex, s); 14881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert } else if(s == Collation::BEFORE_WEIGHT16) { 14891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert sLimit = Collation::COMMON_WEIGHT16; 1490fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1491fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // p is a tailored primary. 1492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(s == Collation::COMMON_WEIGHT16); 1493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sLimit = rootElements.getSecondaryBoundary(); 1494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(s == Collation::COMMON_WEIGHT16) { 1496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Do not tailor into the getSortKey() range of 1497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // compressed common secondaries. 1498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = rootElements.getLastCommonSecondary(); 1499fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1500fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius secondaries.initForSecondary(); 1501fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!secondaries.allocWeights(s, sLimit, sCount)) { 1502fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; 1503fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "secondary tailoring gap too small"; 15041b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#ifdef DEBUG_COLLATION_BUILDER 15051b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n", 15061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert (long)alignWeightRight(s), (long)alignWeightRight(sLimit), 15071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert (long)alignWeightRight(sCount)); 15081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif 1509fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1510fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1511fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sIsTailored = TRUE; 1512fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1513fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = secondaries.nextWeight(); 1514fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(s != 0xffffffff); 1515fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1516fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = weight16FromNode(node); 1517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sIsTailored = FALSE; 1518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1519fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf(" sec %lx\n", (long)alignWeightRight(s)); 1520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else /* UCOL_PRIMARY */ { 1523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(isTailoredNode(node)); 1524fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1525fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf("pri+ "); 1526fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1527fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!pIsTailored) { 1528fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // First tailored primary node in this list. 1529fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t pCount = countTailoredNodes(nodesArray, nextIndex, 1530fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCOL_PRIMARY) + 1; 1531fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool isCompressible = baseData->isCompressiblePrimary(p); 1532fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t pLimit = 1533fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius rootElements.getPrimaryAfter(p, pIndex, isCompressible); 1534fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius primaries.initForPrimary(isCompressible); 1535fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(!primaries.allocWeights(p, pLimit, pCount)) { 1536fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode = U_BUFFER_OVERFLOW_ERROR; // TODO: introduce a more specific UErrorCode? 1537fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorReason = "primary tailoring gap too small"; 1538fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1539fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1540fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius pIsTailored = TRUE; 1541fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1542fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius p = primaries.nextWeight(); 1543fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(p != 0xffffffff); 1544fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius s = Collation::COMMON_WEIGHT16; 1545fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sIsTailored = FALSE; 1546fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1547fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius t = s == 0 ? 0 : Collation::COMMON_WEIGHT16; 1548fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tIsTailored = FALSE; 1549fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1550fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius q = 0; 1551fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1552fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 1553fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius nodesArray[i] = Collation::makeCE(p, s, t, q); 1554fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER 1555fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius printf("%016llx\n", (long long)nodesArray[i]); 1556fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif 1557fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1558fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1559fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1560fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1561fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1562fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 1563fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) { 1564fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t count = 0; 1565fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(;;) { 1566fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(i == 0) { break; } 1567fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t node = nodesArray[i]; 1568fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) < strength) { break; } 1569fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(strengthFromNode(node) == strength) { 1570fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(isTailoredNode(node)) { 1571fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ++count; 1572fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1573fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius break; 1574fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1575fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1576fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius i = nextIndexFromNode(node); 1577fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1578fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return count; 1579fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1580fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1581fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass CEFinalizer : public CollationDataBuilder::CEModifier { 1582fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 1583fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CEFinalizer(const int64_t *ces) : finalCEs(ces) {} 1584fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual ~CEFinalizer(); 1585fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual int64_t modifyCE32(uint32_t ce32) const { 1586fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U_ASSERT(!Collation::isSpecialCE32(ce32)); 1587fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(CollationBuilder::isTempCE32(ce32)) { 1588fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // retain case bits 1589fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8); 1590fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1591fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::NO_CE; 1592fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1593fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1594fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual int64_t modifyCE(int64_t ce) const { 1595fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(CollationBuilder::isTempCE(ce)) { 1596fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // retain case bits 1597fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000); 1598fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 1599fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return Collation::NO_CE; 1600fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1601fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1602fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1603fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprivate: 1604fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const int64_t *finalCEs; 1605fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 1606fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1607fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCEFinalizer::~CEFinalizer() {} 1608fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1609fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid 1610fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::finalizeCEs(UErrorCode &errorCode) { 1611fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 16121b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode), errorCode); 16131b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert if(U_FAILURE(errorCode)) { 1614fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return; 1615fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1616fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newBuilder->initForTailoring(baseData, errorCode); 1617fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CEFinalizer finalizer(nodes.getBuffer()); 1618fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius newBuilder->copyFrom(*dataBuilder, finalizer, errorCode); 1619fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(errorCode)) { return; } 1620fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete dataBuilder; 1621fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius dataBuilder = newBuilder.orphan(); 1622fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1623fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1624fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t 1625fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::ceStrength(int64_t ce) { 1626fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return 1627fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius isTempCE(ce) ? strengthFromTempCE(ce) : 1628fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY : 1629fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY : 1630fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce != 0 ? UCOL_TERTIARY : 1631fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UCOL_IDENTICAL; 1632fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1633fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1634fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_END 1635fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1636fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_USE 1637fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1638fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CAPI UCollator * U_EXPORT2 1639fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusucol_openRules(const UChar *rules, int32_t rulesLength, 1640fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UColAttributeValue normalizationMode, UCollationStrength strength, 1641fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UParseError *parseError, UErrorCode *pErrorCode) { 1642fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(*pErrorCode)) { return NULL; } 1643fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(rules == NULL && rulesLength != 0) { 1644fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1645fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 1646fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1647fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius RuleBasedCollator *coll = new RuleBasedCollator(); 1648fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(coll == NULL) { 1649fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 1650fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 1651fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1652fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength); 1653fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode); 1654fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(U_FAILURE(*pErrorCode)) { 1655fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius delete coll; 1656fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return NULL; 1657fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1658fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return coll->toUCollator(); 1659fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1660fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1661fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic const int32_t internalBufferSize = 512; 1662fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1663fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp 1664fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// because it calls UnicodeSet "builder" code that depends on all Unicode properties, 1665fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// and the rest of the collation "runtime" code only depends on normalization. 1666fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// This function is not related to the collation builder, 1667fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// but it did not seem worth moving it into its own .cpp file, 1668fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods. 1669fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CAPI int32_t U_EXPORT2 1670fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusucol_getUnsafeSet( const UCollator *coll, 1671fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius USet *unsafe, 1672fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode *status) 1673fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius{ 1674fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar buffer[internalBufferSize]; 1675fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t len = 0; 1676fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1677fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uset_clear(unsafe); 1678fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1679fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant 1680fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 1681fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; 1682fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1683fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // add chars that fail the fcd check 1684fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); 1685fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1686fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // add lead/trail surrogates 1687fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*, 1688fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // not when testing code *points*) 1689fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uset_addRange(unsafe, 0xd800, 0xdfff); 1690fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1691fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius USet *contractions = uset_open(0,0); 1692fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1693fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t i = 0, j = 0; 1694fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); 1695fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t contsSize = uset_size(contractions); 1696fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UChar32 c = 0; 1697fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Contraction set consists only of strings 1698fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // to get unsafe code points, we need to 1699fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // break the strings apart and add them to the unsafe set 1700fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius for(i = 0; i < contsSize; i++) { 1701fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); 1702fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(len > 0) { 1703fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius j = 0; 1704fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius while(j < len) { 1705fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U16_NEXT(buffer, j, len, c); 1706fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(j < len) { 1707fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uset_add(unsafe, c); 1708fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1709fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1710fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1711fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 1712fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1713fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uset_close(contractions); 1714fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1715fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return uset_size(unsafe); 1716fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius} 1717fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 1718fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif // !UCONFIG_NO_COLLATION 1719